# Tweets Preprocessing

In [1]:
# Import relevant libraries
import pandas as pd
import os
import re
from tqdm import tqdm

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'textcat'])

import candidate_extraction as cand_ex

import preprocessor
from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
#from ekphrasis.classes.spellcorrect import SpellCorrector

#more advanced tokenizer gives freedom to adjust the way tokens are split
social_pipeline = ["TAG", "EMAIL", "USER", "HASHTAG", "CASHTAG", "PHONE", "PERCENT", "NUMBER","WORD"]
tokenizer = Tokenizer(pipeline = social_pipeline, lowercase=False).tokenize
detokenizer = TreebankWordDetokenizer()

#spell_cor = SpellCorrector(corpus="english") # spell correction not used 
seg_eng = Segmenter(corpus="english") 

# preprocessor should remove emojis and urls in the tweets
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.EMOJI)

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [7]:
file_url = os.getcwd() + "/../../../../" + r"/Dropbox (CBS)/Master thesis data/"
event_url = file_url + r"Event Dataframes/"
event_url_raw = event_url + r"Raw/"
event_url_clean = event_url + r"Clean/"

In [30]:
tigray_url_raw = event_url_raw + r"df_tigray.csv" # location of Tigray dataset
greece_url_raw = event_url_raw + r"df_greece.csv" # location of Greece dataset
rohingya_url_raw = event_url_raw + r"df_rohingya.csv" # location of Rohingya dataset
channel_url_raw = event_url_raw + r"df_channel.csv" # Location of channel dataset
moria_url_raw = event_url_raw + r"df_moria.csv" # location of Moria dataset (for testing)

tigray_url_clean = event_url_clean + r"df_tigray_clean.csv" # location of clean Tigray dataset
greece_url_clean = event_url_clean + r"df_greece_clean.csv" # location of clean Greece dataset
rohingya_url_clean = event_url_clean + r"df_rohingya_clean.csv" # location clean of Rohingya dataset
channel_url_clean = event_url_clean +r"df_channel_clean.csv" #Location of clean Channel dataset
moria_url_clean = event_url_clean + r"df_moria_clean.csv" # location of clean Moria dataset (for testing)

In [None]:
users_url = file_url + "/df_users.csv"

# Read the users csv
print("loading users dataframe...")
df_users = pd.read_csv(users_url)

# Drop unnecessary index column
df_users.drop("Unnamed: 0", axis=1, inplace=True)
df_users.head()

# Create dict that maps usernames to actual names
mapping = dict(df_users[["username","name"]].values)
mapping = {f'@{key}': value for key, value in mapping.items()}

In [77]:
def read_event_df(data_url):
    # easy dataframe load
    event_df = pd.read_csv(data_url, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(rohingya_url_raw)

loaded 125791 tweets!


## 1. Coherent sentences: Removing non-syntactic information

In [78]:
def clean_tweet(tweet):
    '''
    The goal of the function is to yield coherent sentences from raw tweets (without hashtags, URLs, emojis) 
    '''

    #remove emojis, links 
    tweet = preprocessor.clean(tweet)
    
    
    # we are using social tokenizer from ekphrasis due to potentially improper text structure
    tweet = tokenizer(tweet)
    
    #removing the irrelevant hashtags and mention using the heuristic that mentions in the beginning of the tweet 
    # and at least 2 consecutive hashtags at the end of the tweet carry no valuable information
    try:
        while tweet[0].startswith('@'):
            tweet.remove(tweet[0])

        if tweet[-1].startswith('@') and tweet[-2].startswith('@'):
            while tweet[-1].startswith('@'):
                tweet.remove(tweet[-1])

        if tweet[-1].startswith('#') and tweet[-2].startswith('#'):
            while tweet[-1].startswith('#'):
                tweet.remove(tweet[-1])
                
    except IndexError:
        pass


    #for hashtags that may carry information, we remove the # and split the word into more if applicable
    for word in range(len(tweet)):
        if tweet[word].startswith('#'):
            tweet[word] = tweet[word].replace('#','')
            tweet[word] = seg_eng.segment(tweet[word])

        # potentially correct spelling - but it is not working very well - corrects numbers to weird words
        #tweet[word] = spell_cor.correct(tweet[word])

    # instead of .join we use detokenizer in order to reconstruct the cleaned sentence in a better way
    #sample_df[twt] =  " ".join(tweet) 
    tweet = detokenizer.detokenize(tweet)
    
    
    #  tweets that end up being empty after preprocessing will cause problems when batching, replace empty tweet with 'no_tweet_text' which we can ignore later
    tweet = 'no_tweet_text' if len(tweet)==0 else tweet
    return tweet

In [79]:
tqdm.pandas() # allowing progress bar on .apply method (== .progress_apply)
event_df['text_coherent'] = event_df['text'].progress_apply(clean_tweet)

100%|██████████| 125791/125791 [01:11<00:00, 1763.53it/s]


### Replace @username by screen name

In [80]:
def resolve_username_to_name(text):
    new_text = text
    for word in text.split(" "):
        if word in mapping:
            new_text = new_text.replace(word,mapping[word])
    return new_text

#tqdm.pandas()
event_df['text_coherent'] = event_df['text_coherent'].progress_apply(resolve_username_to_name)

100%|██████████| 125791/125791 [00:00<00:00, 128664.14it/s]


In [81]:
# this code runs for around 14h per 100k tweets
# event_df['event_corefs_resolved'] = cand_ex.replace_corefs(event_df['text_coherent'])

### Remove Duplicate Tweets

#### Exact Duplicates

In [82]:
#event_df[event_df["text_coherent"] == "I just signed a @theactionnet petition: Urgently resettle child refugees from Greek islands . . Sign here :"].sort_values("retweet_count")

In [83]:
event_df["text_coherent"].value_counts()

Myanmar citizens in America sets up a day - long market for war refugees and CDM from myanmar, in Fort Worth, Texas.                                                                                                                                                                               3841
In Mutraw District, karen State, myanmar military airstrikes are in progress now . Pray for the safety of the thousands of displaced children and families in the area . whats happening in myanmar                                                                                                3512
Dear friends @MilkTeaTH_MTAT, please save our Karen Refugees . We are really appreciate for that you all support Myanmar People . GOD bless each you abundantly.                                                                                                                                   3452
Hopeless Rohingya students from refugee camp in Bangladesh have held a sticker campaign today to show rohingya s

In [84]:
## Group by tweet text
event_df_grouped = event_df[["text_coherent","retweet_count"]].groupby("text_coherent").agg({"retweet_count":["sum","count"]}).reset_index()
event_df_grouped.columns = list(map(''.join, event_df_grouped.columns.values))
event_df_grouped = event_df_grouped.rename(columns={"retweet_countsum":"retweet_count_sum","retweet_countcount":"count"})
event_df_grouped["retweet_count_sum"] = event_df_grouped["retweet_count_sum"] + event_df_grouped["count"] - 1 #take into account that only the retweets of a similar tweet but also the tweet iself is supposed to be treated as a retweet
event_df_grouped.head()

Unnamed: 0,text_coherent,retweet_count_sum,count
0,! Is Thailand Army in cahoot with myanmar mili...,0,1
1,! This WWD article offers a great overview of ...,1,1
2,!!!! This is alarming situation! whats happeni...,0,1
3,""" . . . another devastating blow to the rohing...",1,1
4,""" . . . requesting for her repatriation to Ban...",0,1


In [85]:
## Remove duplicate tweets
event_df_sorted = event_df.sort_values("created_at") #df should be sorted by default but this step ensures that sorting is there
event_df_no_dups = event_df_sorted.drop_duplicates("text_coherent", keep="first")

In [86]:
## Merge grouped data together
event_df_no_dups1 = pd.merge(left = event_df_no_dups,
                             right = event_df_grouped,
                             left_on = "text_coherent",
                             right_on = "text_coherent",
                             how = "inner")

In [87]:
# Check if numbers add up
event_df_no_dups1["count"].sum() == event_df.shape[0]

True

#### Fuzzy Duplicates

In [88]:
event_df = event_df_no_dups1.copy()

## 2. Alphanumeric text: Remove remaining special characters (apart from punctuation) and lowercase the text

In [89]:
tqdm.pandas()
event_df['text_alphanum'] = event_df['text_coherent'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9.!? ]+', '', tweet.lower()))

100%|██████████| 22966/22966 [00:00<00:00, 108233.72it/s]


## 3. STM

### Remove named entities

In [90]:
def remove_named_entities(tweet):
    
    doc = nlp(tweet)
    
    text_no_namedentities = []
    ents = [e.text for e in doc.ents]
    
    for item in doc:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    
    return " ".join(text_no_namedentities)

In [91]:
#event_df["text_no_ne"] = event_df["text_coherent"].progress_apply(lambda x: remove_named_entities(x))

In [92]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenization(df_col):
    """
    Takes a list with strings and returns a list with tokens
    """
    print("Tokenizing tweets...\n")
    return df_col.apply(lambda x: word_tokenize(x))


def lowercase(df_col):
    """
    Returns all tokens in lowercase.
    """
    print("Making all words lowercase...\n")
    return df_col.apply(lambda x: [token.lower() for token in x])


def only_alphabetic(df_col):
    """
    Keeps only tokens which are alphabetic or an underscore and returns them.
    """
    print("Removing all non-alphabetic words...\n")
    return df_col.apply(lambda x: [token for token in x if re.match("^[a-zA-Z0_]*$", token)])


stop_words = set(stopwords.words('english'))
stop_words.update(["refugee","refugees","migrant","migrants","immigrant","immigrants",
                   "like", "would","want","take","must","well","could","even","since",
                   "also","know"])

def stopword_removal(df_col):
    """
    Removes all words considered as stopwords and all words that have a length of three or less.
    """
    
    print("Removing Stopwords...\n")

    return df_col.apply(lambda x: [token for token in x if token not in stop_words and len(token) > 3])


def lemmatization(df_col):
    """
    Applies lemmatization to all tokens and returns them afterwards.
    """
    
    print("Lemmatizing words...\n")
    lemmatizer = WordNetLemmatizer()
    return df_col.apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

def preprocessing(df_col, *steps):
    """
    Takes in a dataframe column with text and applies preprocessing steps given 
    in and returns a string.
    
    Input:
    - df (dataframe): The dataframe containing the text column.
    - steps (functions): Multiple functions for preprocessing can be given in.
    
    Output:
    - List with strings.
    """
    # copying over the column for preprocessing
    temp = df_col.copy()
    for func in steps:
        temp = func(temp)
    return temp.apply(lambda x: " ".join([token for token in x]))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [93]:
event_df["text_stm"] = preprocessing(event_df["text_coherent"],
                                            tokenization,
                                            lowercase,
                                            only_alphabetic,
                                            stopword_removal,
                                            lemmatization)

Tokenizing tweets...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [94]:
#for raw, stm in zip(event_df["text"], event_df["text_stm"]):
#    print(raw)
#    print(stm)
#    print("---------")

In [95]:
event_df.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,refugee,migrant,immigrant,asylum_seeker,other,text_coherent,retweet_count_sum,count,text_alphanum,text_stm
0,Twitter Web App,"For #Rohingya Survivors in Bangladesh, Artwork...",en,1373792416126402560,2021-03-22 00:23:30+00:00,77844813,2,1,2,0,...,True,False,False,False,False,"For rohingya Survivors in Bangladesh, Artwork ...",2,1,for rohingya survivors in bangladesh artwork b...,rohingya survivor bangladesh artwork bear witn...
1,dlvr.it,AstraZeneca dispels Indonesian Muslim concerns...,en,1373800977778700288,2021-03-22 00:57:31+00:00,1898083759,1,0,0,0,...,True,False,False,False,False,AstraZeneca dispels Indonesian Muslim concerns...,1,1,astrazeneca dispels indonesian muslim concerns...,astrazeneca dispels indonesian muslim concern ...
2,Twitter for Android,@prabha_j @MehHarshil @derekobrienmp I think u...,en,1373802051524730880,2021-03-22 01:01:47+00:00,1209116380257112064,0,0,1,0,...,False,True,False,False,False,I think u are one of the illegally migrant Roh...,0,1,i think u are one of the illegally migrant roh...,think illegally rohingya bangladesh better kee...
3,Twitter for Android,India seals Myanmar border amid strains over r...,en,1373802536579174401,2021-03-22 01:03:43+00:00,1032998054297780224,0,0,0,0,...,True,False,False,False,False,India seals Myanmar border amid strains over r...,0,1,india seals myanmar border amid strains over r...,india seal myanmar border amid strain crisis
4,Twitter for iPhone,"Fleeing coup, Myanmar police refugees in India...",en,1373804367757807619,2021-03-22 01:10:59+00:00,15552861,1,0,1,0,...,True,False,False,False,False,"Fleeing coup, Myanmar police refugees in India...",1,1,fleeing coup myanmar police refugees in india ...,fleeing coup myanmar police india seek asylum ...


In [96]:
event_df.shape

(22966, 30)

In [97]:
for i,line in event_df[["text","text_coherent","text_stm"]].sample(25).iterrows():
    print(line["text"])
    print("---")
    print(line["text_coherent"])
    print("---")
    print(line["text_stm"])
    print("-----------------------------------")

Today, responding to the impact of the devastating fires in Cox’s Bazar refugee camps in Bangladesh, we provided € 500,000 in emergency funding. Our commitment to alleviate the suffering faced by the almost million Rohingya refugees living there is reaffirmed. https://t.co/9mE9Z55tLK
---
Today, responding to the impact of the devastating fires in Coxs Bazar refugee camps in Bangladesh, we provided 500,000 in emergency funding . Our commitment to alleviate the suffering faced by the almost million Rohingya refugees living there is reaffirmed.
---
today responding impact devastating fire cox bazar camp bangladesh provided emergency funding commitment alleviate suffering faced almost million rohingya living reaffirmed
-----------------------------------
Heartbreaking Photo: A Karen  kid  refugee could not eat peacefully and look up at the sky as he feared the JUNTA’s troops would attack with airstrikes.

#WhatsHappeningInMyanmar
#Apr4Coup https://t.co/842WyXtxiR
---
Heartbreaking Photo: A

## Save the dataframe with clean text

In [98]:
#event_df.to_csv(rohingya_url_clean)