# Tweets Preprocessing

In [1]:
# Import relevant libraries
import pandas as pd
import os
import re
from tqdm import tqdm

import preprocessor
from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
#from ekphrasis.classes.spellcorrect import SpellCorrector

# The ekphrasis tokenizer is used because its pipeline lets us choose which
# token classes are recognised and in which order they are tried.
social_pipeline = [
    "TAG",
    "EMAIL",
    "USER",
    "HASHTAG",
    "CASHTAG",
    "PHONE",
    "PERCENT",
    "NUMBER",
    "WORD",
]
tokenizer = Tokenizer(lowercase=False, pipeline=social_pipeline).tokenize
detokenizer = TreebankWordDetokenizer()

# Spell correction was tried and dropped because it did not perform well:
#spell_cor = SpellCorrector(corpus="english")
# Word segmenter (English corpus statistics), used to split hashtags into words.
seg_eng = Segmenter(corpus="english")

# Configure tweet-preprocessor so that .clean() strips URLs and emojis only.
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.EMOJI)

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. Data loading
Loading the event dataframes

In [8]:
# Data root: four directory levels above the current working directory.
# Every folder path below keeps a trailing "/" so file names can be appended directly.
file_url = f"{os.getcwd()}/../../../../Dropbox (CBS)/Master thesis data/"
event_url = f"{file_url}Event Dataframes/"
event_url_raw = f"{event_url}Raw/"
event_url_clean = f"{event_url}Clean/"

In [11]:
# Raw event datasets (input of this notebook)
tigray_url_raw = f"{event_url_raw}df_tigray.csv"
greece_url_raw = f"{event_url_raw}df_greece.csv"
rohingya_url_raw = f"{event_url_raw}df_rohingya.csv"
channel_url_raw = f"{event_url_raw}df_channel.csv"
afghan_url_raw = f"{event_url_raw}df_afghanistan.csv"

# Cleaned event datasets (output of this notebook)
tigray_url_clean = f"{event_url_clean}df_tigray_clean.csv"
greece_url_clean = f"{event_url_clean}df_greece_clean.csv"
rohingya_url_clean = f"{event_url_clean}df_rohingya_clean.csv"
channel_url_clean = f"{event_url_clean}df_channel_clean.csv"
afghan_url_clean = f"{event_url_clean}df_afghanistan_clean.csv"


In [12]:
def read_event_df(data_url):
    """
    Load one event CSV and report how many tweets it contains.

    The first CSV column is read as the index and then discarded, so the
    returned frame always has a fresh 0..n-1 index.

    Input:
    - data_url: path (or file-like object) accepted by pd.read_csv.

    Output:
    - DataFrame with one row per tweet.
    """
    loaded = pd.read_csv(data_url, index_col=0).reset_index(drop=True)
    print(f'loaded {loaded.shape[0]} tweets!')
    return loaded

# Pick the event to preprocess by passing the matching *_url_raw path.
# This run processes the Afghanistan dataset; the result is written to
# afghan_url_clean at the end of the notebook, so change both together.
event_df = read_event_df(afghan_url_raw)

loaded 283643 tweets!


In [13]:
# Load the users table so that @mentions can be replaced with the users' names.

# file_url already ends with "/", so the file name is appended without a second slash.
users_url = file_url + "df_users.csv"

# Read the users csv
print("loading users dataframe...")
df_users = pd.read_csv(users_url)

# Drop the unnecessary index column (presumably a saved DataFrame index).
df_users = df_users.drop("Unnamed: 0", axis=1)

# Map "@username" -> name.
# Rows missing either field are skipped and names are cast to str, so every
# value in the mapping can safely be substituted into tweet text later on.
mapping = {
    f'@{username}': str(name)
    for username, name in df_users[["username", "name"]].dropna().itertuples(index=False, name=None)
}

loading users dataframe...


  interactivity=interactivity, compiler=compiler, result=result)


## 2. Parsing corpus: Removing non-syntactic information to obtain more coherent sentences
The process consists of:
1. cleaning the tweet
2. mapping the mentions to twitter names
3. removing duplicate rows based on the clean tweets

In [14]:
def clean_tweet(tweet):
    '''
    Turn a raw tweet into a more coherent sentence (without URLs, emojis and
    uninformative mentions/hashtags).

    Steps:
      1. `preprocessor.clean` strips whatever it was configured to remove.
      2. The ekphrasis social tokenizer splits the text into tokens.
      3. Heuristic tag removal: every mention at the start of the tweet is dropped,
         and a run of at least two mentions / at least two hashtags at the very end
         of the tweet is dropped as well.
      4. Remaining hashtags lose their '#' and are segmented into words.
      5. The tokens are detokenized back into one string.

    Input:
    - tweet (str): raw tweet text.

    Output:
    - str: the cleaned tweet, or 'no_tweet_text' when nothing is left (empty
      strings cause problems when batching; the placeholder can be filtered later).
    '''
    # remove emojis, links
    tokens = tokenizer(preprocessor.clean(tweet))

    # Mentions at the beginning of the tweet carry no valuable information.
    while tokens and tokens[0].startswith('@'):
        tokens.pop(0)

    # Same for >= 2 consecutive mentions, then >= 2 consecutive hashtags, at the end.
    # pop() removes the LAST token; list.remove(value) would delete the first equal
    # token instead, i.e. a repeated tag in the middle of the sentence.
    for marker in ('@', '#'):
        if len(tokens) >= 2 and tokens[-1].startswith(marker) and tokens[-2].startswith(marker):
            while tokens and tokens[-1].startswith(marker):
                tokens.pop()

    # Hashtags that are left may carry information: drop the '#' and split the
    # hashtag into separate words where applicable.
    for position, token in enumerate(tokens):
        if token.startswith('#'):
            tokens[position] = seg_eng.segment(token.replace('#', ''))

    # The detokenizer reconstructs the sentence better than a plain " ".join.
    cleaned = detokenizer.detokenize(tokens)

    # Replace tweets that ended up empty with a placeholder we can ignore later.
    if len(cleaned) == 0:
        return 'no_tweet_text'
    return cleaned

In [15]:
# Register tqdm with pandas: adds .progress_apply, i.e. .apply with a progress bar.
tqdm.pandas() # allowing progress bar on .apply method (== .progress_apply)
# Clean every raw tweet; the raw 'text' column is left untouched.
event_df['parsing_corpus'] = event_df['text'].progress_apply(clean_tweet)

100%|████████████████████████████████████████████████████████████████████████| 283643/283643 [01:44<00:00, 2723.12it/s]


### Replace @username with the user's display name

In [16]:
def resolve_username_to_name(text, name_map=None):
    """
    Replace every known @username in `text` with the user's name.

    Only complete handles are replaced: "@bob" does not touch "@bobby", and a
    handle followed by punctuation ("@bob,") is still resolved. An '@' preceded
    by a word character (e.g. inside an e-mail address) is ignored.

    Input:
    - text (str): text that may contain @mentions.
    - name_map (dict, optional): maps "@username" to a name. Defaults to the
      notebook-level `mapping` dictionary.

    Output:
    - str: the text with known mentions replaced; unknown mentions are kept.
    """
    if name_map is None:
        name_map = mapping

    def _lookup(match):
        handle = match.group(0)
        name = name_map.get(handle)
        # Keep the handle when it is unknown or the stored name is not text (e.g. NaN).
        return name if isinstance(name, str) else handle

    return re.sub(r"(?<!\w)@\w+", _lookup, text)

# tqdm.pandas() was already called in an earlier cell, so it is not repeated here.
#tqdm.pandas()
# NOTE: this overwrites 'parsing_corpus' with its own transformed values, so the
# cell is not idempotent - re-run the clean_tweet cell first if it has to be repeated.
event_df['parsing_corpus'] = event_df['parsing_corpus'].progress_apply(resolve_username_to_name)

100%|██████████████████████████████████████████████████████████████████████| 283643/283643 [00:02<00:00, 118478.30it/s]


### Remove Duplicate Tweets

In [17]:
# The five most frequent cleaned texts show that the dataset contains many duplicate tweets.
event_df["parsing_corpus"].value_counts().head(5)

Priti Patel: Call on the UK government to resettle 20,000 Afghan refugees - Sign the Petition! via Change.org UK                                                                                                                             2477
please expedite Special Immigrant Visas and evacuate applicants and their families . Translators, aid workers, and more, are suffering - we must act IMMEDIATELY.                                                                            1288
I' ve emailed my MP to let them know we need new safe routes for Afghan refugees . safe routes save lives                                                                                                                                     704
Britain promised a better life for the people of Afghanistan now it must help them to escape from the Taliban takeover . refugees welcome Sign the petition:                                                                                  416
As Kabul fell to the Taliban, th

In [18]:
## Group by cleaned tweet text: total retweets and number of copies per text
event_df_grouped = (
    event_df.groupby("parsing_corpus")["retweet_count"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "retweet_count_sum"})
    .reset_index()
)
# Every copy beyond the first one is itself treated as a retweet of the text,
# so (count - 1) is added on top of the summed retweet counts.
event_df_grouped["retweet_count_sum"] = (
    event_df_grouped["retweet_count_sum"] + event_df_grouped["count"] - 1
)
event_df_grouped.head()

Unnamed: 0,parsing_corpus,retweet_count_sum,count
0,"! Chartering direct evacuation flights, as the...",0,1
1,! Department of State President Biden The Whit...,4,1
2,"! My country is in chaos, thousand of innocent...",4,5
3,! Now you' ve inadvertantly put yourself in th...,0,1
4,!! Afghan Refugees (UNHRC) European Migrant Cr...,0,1


In [19]:
## Keep only the earliest tweet for every cleaned text
# The frame should already be in chronological order; sorting makes that explicit.
event_df_sorted = event_df.sort_values(by="created_at")
event_df_no_dups = event_df_sorted.drop_duplicates(subset="parsing_corpus", keep="first")

In [20]:
## Attach the per-text retweet sum and copy count to the de-duplicated tweets
event_df_no_dups1 = pd.merge(
    event_df_no_dups,
    event_df_grouped,
    on="parsing_corpus",
    how="inner",
)

In [21]:
# Sanity check: the copy counts must add up to the number of tweets before de-duplication
event_df_no_dups1["count"].sum() == len(event_df)

True

## 3. Frame identification corpus

In [22]:
# NOTE(review): these imports live mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.
import nltk
# Download the NLTK resources needed by the helpers defined in this cell
# (presumably punkt -> word_tokenize, stopwords -> stopwords.words,
# wordnet -> WordNetLemmatizer). Already-present packages are reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenization(df_col):
    """
    Tokenizes every text of a dataframe column with nltk's word_tokenize.

    Input:
    - df_col (Series): one string per row.

    Output:
    - Series with one list of tokens per row.
    """
    print("Tokenizing tweets...\n")

    def _to_tokens(text):
        return word_tokenize(text)

    return df_col.apply(_to_tokens)


def lowercase(df_col):
    """
    Lowercases every token of every row.

    Input:
    - df_col (Series): one list of tokens per row.

    Output:
    - Series with the same lists, all tokens in lowercase.
    """
    print("Making all words lowercase...\n")

    def _lower_all(tokens):
        return [tok.lower() for tok in tokens]

    return df_col.apply(_lower_all)


def only_alphabetic(df_col):
    """
    Keeps only tokens that consist entirely of ASCII letters and/or underscores.

    Tokens containing digits, punctuation or any other character are dropped,
    as are empty tokens.

    Input:
    - df_col (Series): one list of tokens per row.

    Output:
    - Series with the filtered token lists.
    """
    print("Removing all non-alphabetic words...\n")
    # fullmatch + "+" : the whole token must match and must not be empty.
    # (The previous pattern also allowed the digit 0, e.g. "000" or "c0vid".)
    alphabetic = re.compile(r"[a-zA-Z_]+")
    return df_col.apply(lambda tokens: [token for token in tokens if alphabetic.fullmatch(token)])


# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# The triple-quoted block below is a bare string expression, i.e. disabled code:
# the domain-specific stop words it lists are currently NOT added to stop_words.
"""stop_words.update(["refugee","refugees","migrant","migrants","immigrant","immigrants",
                   "like", "would","want","take","must","well","could","even","since",
                   "also","know"])"""

def stopword_removal(df_col):
    """
    Drops stopwords and short tokens from every row.

    A token is kept only if it is not in `stop_words` and is longer than
    three characters.

    Input:
    - df_col (Series): one list of tokens per row.

    Output:
    - Series with the filtered token lists.
    """
    print("Removing Stopwords...\n")

    def _informative(token):
        return token not in stop_words and len(token) > 3

    def _filter_row(tokens):
        return [tok for tok in tokens if _informative(tok)]

    return df_col.apply(_filter_row)


def lemmatization(df_col):
    """
    Lemmatizes every token of every row with nltk's WordNetLemmatizer.

    Input:
    - df_col (Series): one list of tokens per row.

    Output:
    - Series with the lemmatized token lists.
    """
    print("Lemmatizing words...\n")
    wordnet = WordNetLemmatizer()

    def _lemmatize_row(tokens):
        return [wordnet.lemmatize(tok) for tok in tokens]

    return df_col.apply(_lemmatize_row)

def preprocessing(df_col, *steps):
    """
    Runs a text column through a chain of preprocessing steps and joins the
    resulting tokens of every row back into one space-separated string.

    Input:
    - df_col (Series): the column to preprocess; it is copied, not modified.
    - steps (functions): applied in the given order; each one receives the
      output of the previous step and must return a Series.

    Output:
    - Series with one space-joined string per row.
    """
    # work on a copy so the caller's column stays untouched
    processed = df_col.copy()
    for step in steps:
        processed = step(processed)

    def _join_tokens(tokens):
        return " ".join(list(tokens))

    return processed.apply(_join_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
# Build the frame-identification corpus from the cleaned tweets. The steps run in
# the listed order: tokenize -> lowercase -> keep alphabetic tokens -> drop
# stopwords and short tokens -> lemmatize; the tokens are then joined into a string.
# NOTE(review): this runs on event_df (all tweets), not on the de-duplicated
# event_df_no_dups1 built above - confirm that this is intended.
event_df["frame_identification_corpus"] = preprocessing(event_df["parsing_corpus"],
                                            tokenization,
                                            lowercase,
                                            only_alphabetic,
                                            stopword_removal,
                                            lemmatization)

Tokenizing tweets...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [24]:
# Quick look at the first rows, including the two new corpus columns.
event_df.head()

Unnamed: 0,id,source,created_at,lang,author_id,text,retweet_count,reply_count,like_count,quote_count,...,year_month,year_calendar_week,date,refugee,migrant,immigrant,asylum_seeker,other,parsing_corpus,frame_identification_corpus
0,1419447696558108674,Twitter Web App,2021-07-26 00:01:17+00:00,en,50626909,Biden authorizes up to $100M for Afghan refuge...,1,0,0,0,...,2021_7,2021_30,2021-07-26,True,False,False,False,False,Biden authorizes up to $100M for Afghan refugees,biden authorizes afghan refugee
1,1419449166904713220,Twitter for Android,2021-07-26 00:07:08+00:00,en,1372157932943503363,@faheem2430 @USAmbKabul @StateDeputySpox gandu...,0,0,0,0,...,2021_7,2021_30,2021-07-26,True,False,False,False,False,gandu afghani . . . . . maderchood u in refuge...,gandu afghani maderchood refugee camp iran int...
2,1419449634179534850,Twitter for iPhone,2021-07-26 00:08:59+00:00,en,159060632,Hazara refugees urge Australian government to ...,1,0,0,0,...,2021_7,2021_30,2021-07-26,True,False,False,False,False,Hazara refugees urge Australian government to ...,hazara refugee urge australian government help...
3,1419450730063224833,Twitter for Android,2021-07-26 00:13:20+00:00,en,1144443713994678273,"""US has a moral obligation to Afghan allies..c...",0,0,1,0,...,2021_7,2021_30,2021-07-26,False,False,False,False,False,""" US has a moral obligation to Afghan allies ....",moral obligation afghan ally conflict future c...
4,1419453607284510721,TweetDeck,2021-07-26 00:24:46+00:00,en,162114001,Fleeing fighting and hoping to head toward som...,2,1,8,0,...,2021_7,2021_30,2021-07-26,True,False,False,False,False,Fleeing fighting and hoping to head toward som...,fleeing fighting hoping head toward something ...


In [25]:
# Spot check: print five random tweets in their raw, parsing and frame-identification form
for _, row in event_df[["text", "parsing_corpus", "frame_identification_corpus"]].sample(5).iterrows():
    print(
        row["text"],
        "---",
        row["parsing_corpus"],
        "---",
        row["frame_identification_corpus"],
        "-----------------------------------",
        sep="\n",
    )

@tedcruz Thanks to the Ttump administration, Miller et al, they royally screwed up the visa process after releasing 5000 Taliban fighters. 
That was their plan all along. Screw the Allies.
https://t.co/2MLjwDy76L
---
Thanks to the Ttump administration, Miller et al, they royally screwed up the visa process after releasing 5000 Taliban fighters . That was their plan all along . Screw the Allies.
---
thanks ttump administration miller royally screwed visa process releasing taliban fighter plan along screw ally
-----------------------------------
@MrsOhSehun already accommodating 3 million afghan refugees with meager resources, apart from that my country is equally as miserable as yours.
---
already accommodating 3 million afghan refugees with meager resources, apart from that my country is equally as miserable as yours.
---
already accommodating million afghan refugee meager resource apart country equally miserable
-----------------------------------
Let’s get some facts straight. 

Yes 

## Save the dataframe with clean text

In [26]:
# Write the full event_df (raw text plus both corpus columns) to the clean CSV.
# The target path must match the dataset chosen when event_df was loaded.
# NOTE(review): the de-duplicated frame event_df_no_dups1 (with retweet_count_sum
# and count) is not what gets saved here - confirm that duplicates should be kept.
event_df.to_csv(afghan_url_clean)