# Tweets Preprocessing

In [1]:
# Import relevant libraries
import pandas as pd
import os
import re
from tqdm import tqdm

import preprocessor
from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
#from ekphrasis.classes.spellcorrect import SpellCorrector

# The ekphrasis tokenizer lets us choose which token types are recognised (and in which order)
# through the pipeline list; lowercase=False keeps the original casing of the tweet text.
social_pipeline = ["TAG", "EMAIL", "USER", "HASHTAG", "CASHTAG", "PHONE", "PERCENT", "NUMBER","WORD"]
tokenizer = Tokenizer(pipeline = social_pipeline, lowercase=False).tokenize
# Used to rebuild a sentence string from the token list after cleaning
detokenizer = TreebankWordDetokenizer()

# Disabled experiment: spell correction did not perform well, so it is left commented out
#spell_cor = SpellCorrector(corpus="english")
# Word segmenter used to split hashtags into separate words
seg_eng = Segmenter(corpus="english") 

# Configure tweet-preprocessor so that preprocessor.clean() handles URLs and emojis in the tweets
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.EMOJI)

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


## 1. Data loading
Loading the event dataframes

In [2]:
# Root of the shared thesis data folder, resolved relative to the current working directory
file_url = f"{os.getcwd()}/../../../..//Dropbox (CBS)/Master thesis data/"
# Folder holding the per-event dataframes, split into raw and cleaned versions
event_url = f"{file_url}Event Dataframes/"
event_url_raw = f"{event_url}Raw/"
event_url_clean = f"{event_url}Clean/"

In [3]:
# Raw (not yet preprocessed) event datasets
tigray_url_raw = f"{event_url_raw}df_tigray.csv"      # Tigray
greece_url_raw = f"{event_url_raw}df_greece.csv"      # Greece
rohingya_url_raw = f"{event_url_raw}df_rohingya.csv"  # Rohingya
channel_url_raw = f"{event_url_raw}df_channel.csv"    # Channel

# Output locations for the cleaned event datasets
tigray_url_clean = f"{event_url_clean}df_tigray_clean.csv"      # Tigray
greece_url_clean = f"{event_url_clean}df_greece_clean.csv"      # Greece
rohingya_url_clean = f"{event_url_clean}df_rohingya_clean.csv"  # Rohingya
channel_url_clean = f"{event_url_clean}df_channel_clean.csv"    # Channel

In [4]:
def read_event_df(data_url):
    """
    Load an event dataframe from a CSV file.

    The first CSV column is read as the index and then discarded in favour of a
    fresh 0..n-1 index. The number of loaded rows is printed.

    Input:
    - data_url (str): location of the CSV file.

    Output:
    - Dataframe with one row per tweet.
    """
    loaded = pd.read_csv(data_url, index_col=0).reset_index(drop=True)
    n_tweets = loaded.shape[0]
    print(f'loaded {n_tweets} tweets!')
    return loaded

# Select the event to preprocess by passing its raw CSV path (here: the Greece dataset).
# The last cell of the notebook writes to greece_url_clean, so switch both together
# when processing a different event.
event_df = read_event_df(greece_url_raw)

loaded 175900 tweets!


In [5]:
# Load the users table; it is needed to replace @mentions with the users' Twitter names
users_url = f"{file_url}/df_users.csv"

print("loading users dataframe...")
# The CSV carries a leftover index column ("Unnamed: 0") that is not needed
df_users = pd.read_csv(users_url).drop(columns="Unnamed: 0")

# Lookup table "@username" -> name (for a repeated username the last row wins)
mapping = {
    f'@{username}': name
    for username, name in df_users[["username", "name"]].values
}

loading users dataframe...


  interactivity=interactivity, compiler=compiler, result=result)


## 2. Parsing corpus: Removing non-syntactic information to obtain more coherent sentences
The process consists of:
1. cleaning the tweet
2. mapping the mentions to twitter names
3. removing duplicate rows based on the clean tweets

In [6]:
def clean_tweet(tweet):
    '''
    Turn a raw tweet into a coherent sentence (without URLs, emojis and non-syntactic
    mentions/hashtags).

    Steps:
    1. remove URLs and emojis with preprocessor.clean
    2. tokenize with the ekphrasis social tokenizer (tweets often have improper text structure)
    3. drop every mention at the start of the tweet, and any trailing run of at least two
       mentions or at least two hashtags (heuristic: these carry no valuable information)
    4. for the hashtags that remain, strip the '#' and segment the tag into words
    5. detokenize the tokens back into one string

    Input:
    - tweet (str): raw tweet text.

    Output:
    - The cleaned tweet, or the placeholder 'no_tweet_text' when nothing is left
      (empty tweets cause problems when batching; the placeholder can be ignored later).
    '''
    tokens = tokenizer(preprocessor.clean(tweet))

    # mentions at the beginning of the tweet
    while tokens and tokens[0].startswith('@'):
        del tokens[0]

    # Trailing runs of mentions, then of hashtags. Tokens are removed by position with pop():
    # list.remove(value) deletes the FIRST equal token, which would delete an earlier,
    # informative occurrence of a repeated mention/hashtag instead of the trailing one.
    for marker in ('@', '#'):
        if len(tokens) >= 2 and tokens[-1].startswith(marker) and tokens[-2].startswith(marker):
            while tokens and tokens[-1].startswith(marker):
                tokens.pop()

    # hashtags that may carry information: remove the # and split the tag into words if applicable
    # (spell correction was tried at this point but corrected numbers to weird words)
    for position, token in enumerate(tokens):
        if token.startswith('#'):
            tokens[position] = seg_eng.segment(token.replace('#', ''))

    # the detokenizer reconstructs the sentence in a better way than " ".join would
    cleaned = detokenizer.detokenize(tokens)

    return 'no_tweet_text' if len(cleaned) == 0 else cleaned

In [7]:
tqdm.pandas() # registers .progress_apply on pandas objects (an .apply that shows a progress bar)
# Clean every raw tweet; the result is stored next to the original text in a new column
event_df['parsing_corpus'] = event_df['text'].progress_apply(clean_tweet)

100%|████████████████████████████████████████████████████████████████████████| 175900/175900 [01:18<00:00, 2237.35it/s]


### Replace @username mentions with the user's Twitter name

In [8]:
def resolve_username_to_name(text, name_map=None):
    """
    Replace every @username mention in the text by the corresponding Twitter name.

    Each mention is looked up as a whole, so a short handle never rewrites part of a
    longer one ('@bob' does not touch '@bobby'), and mentions directly followed by
    punctuation ('@bob,') are resolved as well. An '@' preceded by a word character
    (e.g. inside an e-mail address) is not treated as a mention. Mentions without a
    usable (string) name in the lookup table are left unchanged.

    Input:
    - text (str): the tweet text.
    - name_map (dict, optional): lookup '@username' -> name. Defaults to the
      notebook-level `mapping` dictionary.

    Output:
    - The text with all known mentions replaced.
    """
    if name_map is None:
        name_map = mapping

    def _swap(match):
        mention = match.group(0)
        name = name_map.get(mention)
        # a missing name (e.g. NaN from the users CSV) must not be substituted
        return name if isinstance(name, str) else mention

    return re.sub(r'(?<!\w)@\w+', _swap, text)

# tqdm.pandas() was already called in the cleaning cell above, so .progress_apply is available here
# Overwrites the cleaned text in place: mentions that survived cleaning are replaced by names
event_df['parsing_corpus'] = event_df['parsing_corpus'].progress_apply(resolve_username_to_name)

100%|██████████████████████████████████████████████████████████████████████| 175900/175900 [00:01<00:00, 114609.69it/s]


### Remove Duplicate Tweets

In [10]:
# Show the five most frequent cleaned texts: the dataset contains a lot of duplicate tweets
event_df["parsing_corpus"].value_counts()[:5]

I just signed a @theactionnet petition: Urgently resettle child refugees from Greek islands . . Sign here:                                                                                                                                                                                    879
1 - Turkey has started a rightful operation against Syrian Regime to protect its borders after the vicious attacks by Syrian Regime against civilians in Turkey controlled zones . 2 - Turkey has opened its doors to Europe for 72 hours . All immigrants are free to pass!                  721
Hi EUHomeAffairs @Place_Beauvau Bundesministerium des Innern, für Bau und Heimat @foreignoffice @Justitiedep @ministerieJenV thousands of refugees are at risk of covid 19 on Greek islands due to crowded unsanitary conditions . Will you act now to leave no one behind and save lives?    479
Please . . We are Iraqi refugees in Turkey from 2014 to 2015 and so far we have not got a homeland . . Put yourselves in our place

In [11]:
## Group by cleaned tweet text: total retweets and number of copies per distinct text
retweet_stats = event_df.groupby("parsing_corpus")["retweet_count"].agg(["sum", "count"])
event_df_grouped = retweet_stats.rename(columns={"sum": "retweet_count_sum"}).reset_index()
# Besides the summed retweets of all copies, every copy after the first one is itself
# treated as a retweet of the tweet, hence "+ count - 1"
event_df_grouped["retweet_count_sum"] += event_df_grouped["count"] - 1
event_df_grouped.head()

Unnamed: 0,parsing_corpus,retweet_count_sum,count
0,"! "" Trump plans to use COVID - 19 pandemic as ...",2,1
1,! Attention! To All Syrian Refugees: If you op...,0,1
2,! The public & the rest of the world isnt supp...,0,1
3,! To the attention of the Assad regime support...,1,1
4,!! Turkey claims one migrant was killed by Gre...,59,1


In [12]:
## Remove duplicate tweets: keep one row per distinct cleaned text
# The frame should already be ordered by creation time; sorting explicitly guarantees that
# keep="first" retains the earliest tweet of every group of duplicates
event_df_sorted = event_df.sort_values("created_at")
event_df_no_dups = event_df_sorted.drop_duplicates("parsing_corpus", keep="first")

In [13]:
## Merge grouped data together
event_df_no_dups1 = pd.merge(left = event_df_no_dups,
                             right = event_df_grouped,
                             left_on = "parsing_corpus",
                             right_on = "parsing_corpus",
                             how = "inner")

In [14]:
# Sanity check: the duplicate counts of the remaining rows must add up to the original number of tweets
event_df_no_dups1["count"].sum() == event_df.shape[0]

True

## 3. Frame identification corpus

In [15]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenization(df_col):
    """
    Splits every text of the column into tokens with NLTK's word_tokenize.

    Input:
    - df_col: dataframe column (Series) with one string per row.

    Output:
    - Column with one list of tokens per row.
    """
    print("Tokenizing tweets...\n")
    tokenized = df_col.apply(word_tokenize)
    return tokenized


def lowercase(df_col):
    """
    Converts all tokens to lowercase.

    Input:
    - df_col: dataframe column (Series) with one list of tokens per row.

    Output:
    - Column with the lowercased token lists.
    """
    print("Making all words lowercase...\n")

    def _lower_all(tokens):
        return [word.lower() for word in tokens]

    return df_col.apply(_lower_all)


def only_alphabetic(df_col):
    """
    Keeps only the tokens that consist entirely of ASCII letters and underscores.

    Tokens containing digits, punctuation or any other character are dropped, as are
    empty tokens. The underscore is allowed so that the placeholder 'no_tweet_text'
    survives this step.

    Input:
    - df_col: dataframe column (Series) with one list of tokens per row.

    Output:
    - Column with the filtered token lists.
    """
    print("Removing all non-alphabetic words...\n")
    # fullmatch + "+" instead of "^...*$": no empty tokens and no match before a trailing
    # newline; the character class no longer contains the stray "0" that let tokens such
    # as "w0rld" or "0" through
    alphabetic = re.compile(r"[a-zA-Z_]+")
    return df_col.apply(lambda tokens: [token for token in tokens if alphabetic.fullmatch(token)])


# English stopword list from NLTK, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))
# Optional domain-specific additions (currently disabled):
# stop_words.update(["refugee","refugees","migrant","migrants","immigrant","immigrants",
#                    "like", "would","want","take","must","well","could","even","since",
#                    "also","know"])

def stopword_removal(df_col):
    """
    Drops stopwords and short tokens.

    A token is kept only if it is not contained in `stop_words` and is longer than
    three characters.

    Input:
    - df_col: dataframe column (Series) with one list of tokens per row.

    Output:
    - Column with the filtered token lists.
    """
    print("Removing Stopwords...\n")

    def _informative(tokens):
        kept = []
        for word in tokens:
            if word not in stop_words and len(word) > 3:
                kept.append(word)
        return kept

    return df_col.apply(_informative)


def lemmatization(df_col):
    """
    Lemmatizes every token with NLTK's WordNetLemmatizer.

    Input:
    - df_col: dataframe column (Series) with one list of tokens per row.

    Output:
    - Column with the lemmatized token lists.
    """
    print("Lemmatizing words...\n")
    wordnet = WordNetLemmatizer()

    def _lemmatize_all(tokens):
        return [wordnet.lemmatize(word) for word in tokens]

    return df_col.apply(_lemmatize_all)

def preprocessing(df_col, *steps):
    """
    Runs a text column through the given preprocessing steps, in order, and joins the
    resulting tokens of every row back into one space-separated string.

    Input:
    - df_col: dataframe column (Series) containing the text.
    - steps (functions): any number of functions; each takes the column and returns
      the transformed column.

    Output:
    - Column with one string per row.
    """
    # work on a copy so that the caller's column is left untouched
    processed = df_col.copy()
    for step in steps:
        processed = step(processed)
    return processed.apply(" ".join)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# Build the frame identification corpus from the cleaned tweets. The steps run in the order
# listed: tokenize -> lowercase -> keep alphabetic tokens -> drop stopwords/short tokens -> lemmatize
event_df["frame_identification_corpus"] = preprocessing(event_df["parsing_corpus"],
                                            tokenization,
                                            lowercase,
                                            only_alphabetic,
                                            stopword_removal,
                                            lemmatization)

Tokenizing tweets...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [17]:
# Inspect the first rows, including the two new corpus columns
event_df.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,year_month,year_calendar_week,refugee,migrant,immigrant,asylum_seeker,other,date,parsing_corpus,frame_identification_corpus
0,u.fooo.ooo,[🔴 NEWS] Greece plans floating sea border wall...,en,1227019556167864321,2020-02-11 00:00:33+00:00,1052191553802854407,0,0,0,0,...,2020_2,2020_06,True,False,False,False,False,2020-02-11,[NEWS] Greece plans floating sea border wall t...,news greece plan floating border wall keep ref...
1,Twitter for iPhone,"It is not your own feet sinking in the mud, bo...",en,1227022233484308481,2020-02-11 00:11:12+00:00,2729959018,9,1,29,1,...,2020_2,2020_06,False,False,False,False,False,2020-02-11,"It is not your own feet sinking in the mud, boy.",foot sinking
2,IFTTT,⚠️ ALL Meteoalarm Severe #Weather Warnings for...,en,1227027068132745217,2020-02-11 00:30:24+00:00,342772619,0,0,0,0,...,2020_2,2020_06,True,False,False,False,False,2020-02-11,ALL Meteoalarm Severe weather Warnings for eur...,meteoalarm severe weather warning europe
3,Kurdish News,Snowstorm kills 13 migrants crossing from #Ira...,en,1227027865771876354,2020-02-11 00:33:34+00:00,248248441,0,0,0,0,...,2020_2,2020_06,False,True,False,False,False,2020-02-11,Snowstorm kills 13 migrants crossing from iran...,snowstorm kill migrant crossing iran turkey in...
4,Twitter for iPhone,.#Erdogan “will need to choose 1 of 2 solution...,en,1227027924311855105,2020-02-11 00:33:48+00:00,254373781,6,0,6,0,...,2020_2,2020_06,False,False,False,False,False,2020-02-11,. erdogan will need to choose 1 of 2 solutions...,erdogan need choose solution syrian cross turk...


In [18]:
# Print five randomly sampled tweets in all three stages (raw text, parsing corpus,
# frame identification corpus) for a visual check; no seed is set, so the sample
# changes on every run
for i,line in event_df[["text","parsing_corpus","frame_identification_corpus"]].sample(5).iterrows():
    print(line["text"])
    print("---")
    print(line["parsing_corpus"])
    print("---")
    print(line["frame_identification_corpus"])
    print("-----------------------------------")

@guardiannews Turkey has opened its doors to Europe for 72 hours. All immigrants are free to pass!
---
Turkey has opened its doors to Europe for 72 hours . All immigrants are free to pass!
---
turkey opened door europe hour immigrant free pas
-----------------------------------
Europeans unite.... Greek border  under attack for weeks... thank you to all countries helping but we need more help!!! Not peaceful refugees but Islamists! https://t.co/LkavoQLFTr
---
Europeans unite . . . . Greek border under attack for weeks . . . thank you to all countries helping but we need more help!!! Not peaceful refugees but Islamists!
---
european unite greek border attack week thank country helping need help peaceful refugee islamist
-----------------------------------
@SteliosPetsas You just have to be a human, once Greece refugees of WWII got warm welcomed in Syria and today Greece welcome Syrian refugees with tear gass and bullets. Oh humanity
---
You just have to be a human, once Greece refugees 

## Save the dataframe with clean text

In [70]:
# Write the dataframe with both corpus columns to the clean folder of the Greece event.
# The target must match the event loaded at the top of the notebook.
# NOTE(review): this saves event_df, which in the cells visible here is never replaced by
# the deduplicated frame - confirm that keeping the duplicate tweets in the output is intended.
event_df.to_csv(greece_url_clean)