# Tweets Preprocessing

In [1]:
# Import relevant libraries
import pandas as pd
import os
import re
from tqdm import tqdm

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'textcat'])

import candidate_extraction as cand_ex

import preprocessor
from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
#from ekphrasis.classes.spellcorrect import SpellCorrector

#more advanced tokenizer gives freedom to adjust the way tokens are split
social_pipeline = ["TAG", "EMAIL", "USER", "HASHTAG", "CASHTAG", "PHONE", "PERCENT", "NUMBER","WORD"]
tokenizer = Tokenizer(pipeline = social_pipeline, lowercase=False).tokenize
detokenizer = TreebankWordDetokenizer()

#spell_cor = SpellCorrector(corpus="english") # spell correction not used 
seg_eng = Segmenter(corpus="english") 

# preprocessor should remove emojis and urls in the tweets
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.EMOJI)

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [2]:
file_url = os.getcwd() + "/../../../../" + r"/Dropbox (CBS)/Master thesis data/"
event_url = file_url + r"Event Dataframes/"
event_url_raw = event_url + r"Raw/"
event_url_clean = event_url + r"Clean/"

In [3]:
tigray_url_raw = event_url_raw + r"df_tigray.csv" # location of Tigray dataset
greece_url_raw = event_url_raw + r"df_greece.csv" # location of Greece dataset
rohingya_url_raw = event_url_raw + r"df_rohingya.csv" # location of Rohingya dataset
channel_url_raw = event_url_raw + r"df_channel.csv" # Location of channel dataset

tigray_url_clean = event_url_clean + r"df_tigray_clean.csv" # location of clean Tigray dataset
greece_url_clean = event_url_clean + r"df_greece_clean.csv" # location of clean Greece dataset
rohingya_url_clean = event_url_clean + r"df_rohingya_clean.csv" # location clean of Rohingya dataset
channel_url_clean = event_url_clean +r"df_channel_clean.csv" #Location of clean Channel dataset

In [4]:
users_url = file_url + "/df_users.csv"

# Read the users csv
print("loading users dataframe...")
df_users = pd.read_csv(users_url)

# Drop unnecessary index column
df_users.drop("Unnamed: 0", axis=1, inplace=True)
df_users.head()

# Create dict that maps usernames to actual names
mapping = dict(df_users[["username","name"]].values)
mapping = {f'@{key}': value for key, value in mapping.items()}

loading users dataframe...


  interactivity=interactivity, compiler=compiler, result=result)


In [49]:
def read_event_df(data_url):
    # easy dataframe load
    event_df = pd.read_csv(data_url, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(tigray_url_raw)

loaded 181715 tweets!


## 1. Coherent sentences: Removing non-syntactic information

In [50]:
def clean_tweet(tweet):
    '''
    The goal of the function is to yield coherent sentences from raw tweets (without hashtags, URLs, emojis) 
    '''

    #remove emojis, links 
    tweet = preprocessor.clean(tweet)
    
    
    # we are using social tokenizer from ekphrasis due to potentially improper text structure
    tweet = tokenizer(tweet)
    
    #removing the irrelevant hashtags and mention using the heuristic that mentions in the beginning of the tweet 
    # and at least 2 consecutive hashtags at the end of the tweet carry no valuable information
    try:
        while tweet[0].startswith('@'):
            tweet.remove(tweet[0])

        if tweet[-1].startswith('@') and tweet[-2].startswith('@'):
            while tweet[-1].startswith('@'):
                tweet.remove(tweet[-1])

        if tweet[-1].startswith('#') and tweet[-2].startswith('#'):
            while tweet[-1].startswith('#'):
                tweet.remove(tweet[-1])
                
    except IndexError:
        pass


    #for hashtags that may carry information, we remove the # and split the word into more if applicable
    for word in range(len(tweet)):
        if tweet[word].startswith('#'):
            tweet[word] = tweet[word].replace('#','')
            tweet[word] = seg_eng.segment(tweet[word])

        # potentially correct spelling - but it is not working very well - corrects numbers to weird words
        #tweet[word] = spell_cor.correct(tweet[word])

    # instead of .join we use detokenizer in order to reconstruct the cleaned sentence in a better way
    #sample_df[twt] =  " ".join(tweet) 
    tweet = detokenizer.detokenize(tweet)
    
    
    #  tweets that end up being empty after preprocessing will cause problems when batching, replace empty tweet with 'no_tweet_text' which we can ignore later
    tweet = 'no_tweet_text' if len(tweet)==0 else tweet
    return tweet

In [51]:
tqdm.pandas() # allowing progress bar on .apply method (== .progress_apply)
event_df['text_coherent'] = event_df['text'].progress_apply(clean_tweet)

100%|██████████| 181715/181715 [01:49<00:00, 1659.12it/s]


### Replace @username by screen name

In [52]:
def resolve_username_to_name(text):
    new_text = text
    for word in text.split(" "):
        if word in mapping:
            new_text = new_text.replace(word,mapping[word])
    return new_text

#tqdm.pandas()
event_df['text_coherent'] = event_df['text_coherent'].progress_apply(resolve_username_to_name)

100%|██████████| 181715/181715 [00:01<00:00, 95650.93it/s] 


In [53]:
# this code runs for around 14h per 100k tweets
# event_df['event_corefs_resolved'] = cand_ex.replace_corefs(event_df['text_coherent'])

### Remove Duplicate Tweets

#### Exact Duplicates

In [54]:
#event_df[event_df["text_coherent"] == "I just signed a @theactionnet petition: Urgently resettle child refugees from Greek islands . . Sign here :"].sort_values("retweet_count")

In [55]:
event_df["text_coherent"].value_counts()

Consistent reports of ethnic - targeted violence, killings, massive looting, rape, forceful returns of refugees (to Eritrea). Innocent refugees should not be reprimanded.                                                                                                                   2313
. @ethiotelecom has been weaponized against the people of Tigray . Unelected Abiy Ahmed Ali 🇪🇹 uses communication as a tool of war . Cease & desist all deals regarding infrastructure in Ethiopia until the tigray genocide investigations are complete . Vodacom                           1540
" Major Violations of International Law at Tigray Refugee Camps: U . N . " Over 90,000 Eritreans who found save - heavens in tigray are the target of isaias.                                                                                                                                1469
" More than 500,000 tigraya | ns have lost their homes . Almost 60,000 have sought refugee status in neighboring Sudan . "        

In [56]:
## Group by tweet text
event_df_grouped = event_df[["text_coherent","retweet_count"]].groupby("text_coherent").agg({"retweet_count":["sum","count"]}).reset_index()
event_df_grouped.columns = list(map(''.join, event_df_grouped.columns.values))
event_df_grouped = event_df_grouped.rename(columns={"retweet_countsum":"retweet_count_sum","retweet_countcount":"count"})
event_df_grouped["retweet_count_sum"] = event_df_grouped["retweet_count_sum"] + event_df_grouped["count"] - 1 #take into account that only the retweets of a similar tweet but also the tweet iself is supposed to be treated as a retweet
event_df_grouped.head()

Unnamed: 0,text_coherent,retweet_count_sum,count
0,! 1M + are displaced & become aid dependent in...,7,1
1,! Hasn' t America heard that for the last half...,0,1
2,! unityfor ethiopia will not happen with a blo...,1,1
3,!! Alert mekele city tigray is now under Eritr...,0,1
4,!! As a well - respected peace - making nation...,0,1


In [57]:
## Remove duplicate tweets
event_df_sorted = event_df.sort_values("created_at") #df should be sorted by default but this step ensures that sorting is there
event_df_no_dups = event_df_sorted.drop_duplicates("text_coherent", keep="first")

In [58]:
## Merge grouped data together
event_df_no_dups1 = pd.merge(left = event_df_no_dups,
                             right = event_df_grouped,
                             left_on = "text_coherent",
                             right_on = "text_coherent",
                             how = "inner")

In [59]:
# Check if numbers add up
event_df_no_dups1["count"].sum() == event_df.shape[0]

True

#### Fuzzy Duplicates

In [60]:
event_df = event_df_no_dups1.copy()

## 2. Alphanumeric text: Remove remaining special characters (apart from punctuation) and lowercase the text

In [61]:
tqdm.pandas()
event_df['text_alphanum'] = event_df['text_coherent'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9.!? ]+', '', tweet.lower()))

100%|██████████| 42853/42853 [00:00<00:00, 89994.08it/s]


## 3. STM

### Remove named entities

In [62]:
def remove_named_entities(tweet):
    
    doc = nlp(tweet)
    
    text_no_namedentities = []
    ents = [e.text for e in doc.ents]
    
    for item in doc:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    
    return " ".join(text_no_namedentities)

In [63]:
#event_df["text_no_ne"] = event_df["text_coherent"].progress_apply(lambda x: remove_named_entities(x))

In [64]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenization(df_col):
    """
    Takes a list with strings and returns a list with tokens
    """
    print("Tokenizing tweets...\n")
    return df_col.apply(lambda x: word_tokenize(x))


def lowercase(df_col):
    """
    Returns all tokens in lowercase.
    """
    print("Making all words lowercase...\n")
    return df_col.apply(lambda x: [token.lower() for token in x])


def only_alphabetic(df_col):
    """
    Keeps only tokens which are alphabetic or an underscore and returns them.
    """
    print("Removing all non-alphabetic words...\n")
    return df_col.apply(lambda x: [token for token in x if re.match("^[a-zA-Z0_]*$", token)])


stop_words = set(stopwords.words('english'))
stop_words.update(["refugee","refugees","migrant","migrants","immigrant","immigrants",
                   "like", "would","want","take","must","well","could","even","since",
                   "also","know"])

def stopword_removal(df_col):
    """
    Removes all words considered as stopwords and all words that have a length of three or less.
    """
    
    print("Removing Stopwords...\n")

    return df_col.apply(lambda x: [token for token in x if token not in stop_words and len(token) > 3])


def lemmatization(df_col):
    """
    Applies lemmatization to all tokens and returns them afterwards.
    """
    
    print("Lemmatizing words...\n")
    lemmatizer = WordNetLemmatizer()
    return df_col.apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

def preprocessing(df_col, *steps):
    """
    Takes in a dataframe column with text and applies preprocessing steps given 
    in and returns a string.
    
    Input:
    - df (dataframe): The dataframe containing the text column.
    - steps (functions): Multiple functions for preprocessing can be given in.
    
    Output:
    - List with strings.
    """
    # copying over the column for preprocessing
    temp = df_col.copy()
    for func in steps:
        temp = func(temp)
    return temp.apply(lambda x: " ".join([token for token in x]))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [65]:
event_df["text_stm"] = preprocessing(event_df["text_coherent"],
                                            tokenization,
                                            lowercase,
                                            only_alphabetic,
                                            stopword_removal,
                                            lemmatization)

Tokenizing tweets...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [66]:
#for raw, stm in zip(event_df["text"], event_df["text_stm"]):
#    print(raw)
#    print(stm)
#    print("---------")

In [67]:
event_df.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,refugee,migrant,immigrant,asylum_seeker,other,text_coherent,retweet_count_sum,count,text_alphanum,text_stm
0,Twitter Web App,@FilippoGrandi Thank you for your statement......,en,1349868943800897537,2021-01-15 00:00:10+00:00,1325094884680765445,1,0,1,0,...,True,False,False,False,False,Thank you for your statement . . . Now what AC...,1,1,thank you for your statement . . . now what ac...,thank statement action taken civilian tigray u...
1,Twitter for iPhone,Action Now\n#HumanitarianCorridor Now\n#HumanR...,en,1349868980102696963,2021-01-15 00:00:18+00:00,1296598454,1,0,1,1,...,True,False,False,False,False,Action Now humanitarian corridor Now,1,1,action now humanitarian corridor now,action humanitarian corridor
2,Twitter Web App,"""@AUC_MoussaFaki has disregarded and violated ...",en,1349869527773933569,2021-01-15 00:02:29+00:00,1331304034230292482,0,0,0,0,...,True,False,False,False,False,""" Moussa Faki Mahamat has disregarded and viol...",73,55,moussa faki mahamat has disregarded and viola...,moussa faki mahamat disregarded violated inter...
3,Twitter for iPhone,@Refugees @zeaxumawit @FilippoGrandi But @anto...,en,1349870752070311936,2021-01-15 00:07:21+00:00,1325975568882479104,0,0,0,0,...,True,False,False,False,False,But António Guterres said there are no Eritrea...,0,1,but antnio guterres said there are no eritrean...,guterres said eritrean troop ethiopia need ret...
4,Twitter Web App,We call for global solidarity to STOP killings...,en,1349871059835744256,2021-01-15 00:08:34+00:00,1327215334097608704,4,0,8,1,...,True,False,False,False,False,We call for global solidarity to STOP killings...,4,1,we call for global solidarity to stop killings...,call global solidarity stop killing abduction ...


In [68]:
event_df.shape

(42853, 30)

In [69]:
for i,line in event_df[["text","text_coherent","text_stm"]].sample(25).iterrows():
    print(line["text"])
    print("---")
    print(line["text_coherent"])
    print("---")
    print(line["text_stm"])
    print("-----------------------------------")

-10 Billion FCFA disbursed to boost CAMWATER productivity
-'Mali authorities use tear gas' at banned anti-France protest
-Tigray crisis: 'Thousands of Eritrean refugees need water'
-France rules out apology for Algeria colonial abuses
https://t.co/289RgBRamO
---
- 10 Billion FCFA disbursed to boost CAMWATER productivity -' Mali authorities use tear gas' at banned anti - France protest - Tigray crisis :' Thousands of Eritrean refugees need water' - France rules out apology for Algeria colonial abuses
---
billion fcfa disbursed boost camwater productivity mali authority tear banned anti france protest tigray crisis thousand eritrean need water france rule apology algeria colonial abuse
-----------------------------------
Eritrean refugees forcibly returned to Eritrea by Eritrea troops #EritreanTroopsOutOfTigray https://t.co/RgQJEqW6cs
---
Eritrean refugees forcibly returned to Eritrea by Eritrea troops eritrean troops out of tigray
---
eritrean forcibly returned eritrea eritrea troop eri

## Save the dataframe with clean text

In [70]:
#event_df.to_csv(tigray_url_clean)