# Tweets Preprocessing

In [1]:
# Import relevant libraries
import pandas as pd
import os
import re
from tqdm import tqdm

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'textcat'])

import candidate_extraction as cand_ex

import preprocessor
from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
#from ekphrasis.classes.spellcorrect import SpellCorrector

#more advanced tokenizer gives freedom to adjust the way tokens are split
social_pipeline = ["TAG", "EMAIL", "USER", "HASHTAG", "CASHTAG", "PHONE", "PERCENT", "NUMBER","WORD"]
tokenizer = Tokenizer(pipeline = social_pipeline, lowercase=False).tokenize
detokenizer = TreebankWordDetokenizer()

#spell_cor = SpellCorrector(corpus="english") # spell correction not used 
seg_eng = Segmenter(corpus="english") 

# preprocessor should remove emojis and urls in the tweets
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.EMOJI)

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [2]:
file_url = os.getcwd() + "/../../../../" + r"/Dropbox (CBS)/Master thesis data/"
event_url = file_url + r"Event Dataframes/"
event_url_raw = event_url + r"Raw/"
event_url_clean = event_url + r"Clean/"

In [3]:
tigray_url_raw = event_url_raw + r"df_tigray.csv" # location of Tigray dataset
greece_url_raw = event_url_raw + r"df_greece.csv" # location of Greece dataset
rohingya_url_raw = event_url_raw + r"df_rohingya.csv" # location of Rohingya dataset
moria_url_raw = event_url_raw + r"df_moria.csv" # location of Moria dataset (for testing)

tigray_url_clean = event_url_clean + r"df_tigray_clean.csv" # location of clean Tigray dataset
greece_url_clean = event_url_clean + r"df_greece_clean.csv" # location of clean Greece dataset
rohingya_url_clean = event_url_clean + r"df_rohingya_clean.csv" # location clean of Rohingya dataset
moria_url_clean = event_url_clean + r"df_moria_clean.csv" # location of clean Moria dataset (for testing)

In [29]:
def read_event_df(data_url):
    # easy dataframe load
    event_df = pd.read_csv(data_url, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(greece_url_clean)

loaded 137462 tweets!


## 1. Coherent sentences: Removing non-syntactic information

In [9]:
def clean_tweet(tweet):
    '''
    The goal of the function is to yield coherent sentences from raw tweets (without hashtags, URLs, emojis) 
    '''

    #remove emojis, links 
    tweet = preprocessor.clean(tweet)
    
    
    # we are using social tokenizer from ekphrasis due to potentially improper text structure
    tweet = tokenizer(tweet)
    
    #removing the irrelevant hashtags and mention using the heuristic that mentions in the beginning of the tweet 
    # and at least 2 consecutive hashtags at the end of the tweet carry no valuable information
    try:
        while tweet[0].startswith('@'):
            tweet.remove(tweet[0])

        if tweet[-1].startswith('@') and tweet[-2].startswith('@'):
            while tweet[-1].startswith('@'):
                tweet.remove(tweet[-1])

        if tweet[-1].startswith('#') and tweet[-2].startswith('#'):
            while tweet[-1].startswith('#'):
                tweet.remove(tweet[-1])
                
    except IndexError:
        pass


    #for hashtags that may carry information, we remove the # and split the word into more if applicable
    for word in range(len(tweet)):
        if tweet[word].startswith('#'):
            tweet[word] = tweet[word].replace('#','')
            tweet[word] = seg_eng.segment(tweet[word])

        # potentially correct spelling - but it is not working very well - corrects numbers to weird words
        #tweet[word] = spell_cor.correct(tweet[word])

    # instead of .join we use detokenizer in order to reconstruct the cleaned sentence in a better way
    #sample_df[twt] =  " ".join(tweet) 
    tweet = detokenizer.detokenize(tweet)
    
    
    #  tweets that end up being empty after preprocessing will cause problems when batching, replace empty tweet with 'no_tweet_text' which we can ignore later
    tweet = 'no_tweet_text' if len(tweet)==0 else tweet
    return tweet

In [10]:
tqdm.pandas() # allowing progress bar on .apply method (== .progress_apply)
event_df['text_coherent'] = event_df['text'].progress_apply(clean_tweet)

  from pandas import Panel
100%|██████████| 175900/175900 [01:36<00:00, 1831.84it/s]


### Replace @username by screen name

In [11]:
users_url = file_url + "/df_users.csv"

# Read the users csv
print("loading users dataframe...")
df_users = pd.read_csv(users_url)

# Drop unnecessary index column
df_users.drop("Unnamed: 0", axis=1, inplace=True)
df_users.head()

# Create dict that maps usernames to actual names
mapping = dict(df_users[["username","name"]].values)
mapping = {f'@{key}': value for key, value in mapping.items()}


def resolve_username_to_name(text):
    new_text = text
    for word in text.split(" "):
        if word in mapping:
            new_text = new_text.replace(word,mapping[word])
    return new_text

#tqdm.pandas()
event_df['text_coherent'] = event_df['text_coherent'].progress_apply(resolve_username_to_name)

loading users dataframe...


  interactivity=interactivity, compiler=compiler, result=result)
100%|██████████| 175900/175900 [00:01<00:00, 102149.27it/s]


In [None]:
# this code runs for around 14h per 100k tweets
# event_df['event_corefs_resolved'] = cand_ex.replace_corefs(event_df['text_coherent'])

### Remove Duplicate Tweets

#### Exact Duplicates

In [None]:
#event_df[event_df["text_coherent"] == "I just signed a @theactionnet petition: Urgently resettle child refugees from Greek islands . . Sign here :"].sort_values("retweet_count")

In [12]:
event_df["text_coherent"].value_counts()

I just signed a @theactionnet petition: Urgently resettle child refugees from Greek islands . . Sign here :                                                                                                                                                                                   879
1 - Turkey has started a rightful operation against Syrian Regime to protect its borders after the vicious attacks by Syrian Regime against civilians in Turkey controlled zones . 2 - Turkey has opened its doors to Europe for 72 hours . All immigrants are free to pass!                  721
Hi EUHomeAffairs @Place_Beauvau Bundesministerium des Innern, für Bau und Heimat @foreignoffice @Justitiedep @ministerieJenV thousands of refugees are at risk of covid 19 on Greek islands due to crowded unsanitary conditions . Will you act now to leave no one behind and save lives?    479
Please . . We are Iraqi refugees in Turkey from 2014 to 2015 and so far we have not got a homeland . . Put yourselves in our place

In [13]:
## Group by tweet text
event_df_grouped = event_df[["text_coherent","retweet_count"]].groupby("text_coherent").agg({"retweet_count":["sum","count"]}).reset_index()
event_df_grouped.columns = list(map(''.join, event_df_grouped.columns.values))
event_df_grouped = event_df_grouped.rename(columns={"retweet_countsum":"retweet_count_sum","retweet_countcount":"count"})
event_df_grouped["retweet_count_sum"] = event_df_grouped["retweet_count_sum"] + event_df_grouped["count"] - 1 #take into account that only the retweets of a similar tweet but also the tweet iself is supposed to be treated as a retweet
event_df_grouped.head()

Unnamed: 0,text_coherent,retweet_count_sum,count
0,"! "" Trump plans to use COVID - 19 pandemic as ...",2,1
1,! Attention! To All Syrian Refugees: If you op...,0,1
2,! The public & the rest of the world isnt supp...,0,1
3,! To the attention of the Assad regime support...,1,1
4,!! Turkey claims one migrant was killed by Gre...,59,1


In [14]:
## Remove duplicate tweets
event_df_sorted = event_df.sort_values("created_at") #df should be sortedby default but this step ensures that sorting is there
event_df_no_dups = event_df_sorted.drop_duplicates("text_coherent", keep="first")

In [15]:
## Merge grouped data together
event_df_no_dups1 = pd.merge(left = event_df_no_dups,
                             right = event_df_grouped,
                             left_on = "text_coherent",
                             right_on = "text_coherent",
                             how = "inner")

In [16]:
# Check if numbers add up
event_df_no_dups1["count"].sum() == event_df.shape[0]

True

#### Fuzzy Duplicates

In [17]:
event_df = event_df_no_dups1.copy()

## 2. Alphanumeric text: Remove remaining special characters (apart from punctuation) and lowercase the text

In [31]:
tqdm.pandas()
event_df['text_alphanum'] = event_df['text_coherent'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9.!? ]+', '', tweet.lower()))

100%|██████████████████████████████████████████████████████████████████████| 137462/137462 [00:01<00:00, 125079.82it/s]


## 3. STM

### Remove named entities

In [25]:
def remove_named_entities(tweet):
    
    doc = nlp(tweet)
    
    text_no_namedentities = []
    ents = [e.text for e in doc.ents]
    
    for item in doc:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    
    return " ".join(text_no_namedentities)

In [26]:
event_df["text_no_ne"] = event_df["text_coherent"].progress_apply(lambda x: remove_named_entities(x))

100%|██████████| 137462/137462 [30:48<00:00, 74.37it/s]


In [34]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenization(df_col):
    """
    Takes a list with strings and returns a list with tokens
    """
    print("Tokenizing tweets...\n")
    return df_col.apply(lambda x: word_tokenize(x))


def lowercase(df_col):
    """
    Returns all tokens in lowercase.
    """
    print("Making all words lowercase...\n")
    return df_col.apply(lambda x: [token.lower() for token in x])


def only_alphabetic(df_col):
    """
    Keeps only tokens which are alphabetic or an underscore and returns them.
    """
    print("Removing all non-alphabetic words...\n")
    return df_col.apply(lambda x: [token for token in x if re.match("^[a-zA-Z0_]*$", token)])


stop_words = set(stopwords.words('english'))
stop_words.update(["refugee","refugees","migrant","migrants","immigrant","immigrants",
                   "like", "would","want","take","must","well","could","even","since",
                   "also","know"])

def stopword_removal(df_col):
    """
    Removes all words considered as stopwords and all words that have a length of three or less.
    """
    
    print("Removing Stopwords...\n")

    return df_col.apply(lambda x: [token for token in x if token not in stop_words and len(token) > 3])


def lemmatization(df_col):
    """
    Applies lemmatization to all tokens and returns them afterwards.
    """
    
    print("Lemmatizing words...\n")
    lemmatizer = WordNetLemmatizer()
    return df_col.apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

def preprocessing(df_col, *steps):
    """
    Takes in a dataframe column with text and applies preprocessing steps given 
    in and returns a string.
    
    Input:
    - df (dataframe): The dataframe containing the text column.
    - steps (functions): Multiple functions for preprocessing can be given in.
    
    Output:
    - List with strings.
    """
    # copying over the column for preprocessing
    temp = df_col.copy()
    for func in steps:
        temp = func(temp)
    return temp.apply(lambda x: " ".join([token for token in x]))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jawo19ad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
event_df["text_stm"] = preprocessing(event_df["text_no_ne"],
                                            tokenization,
                                            lowercase,
                                            only_alphabetic,
                                            stopword_removal,
                                            lemmatization)

Tokenizing tweets...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [36]:
#for raw, stm in zip(event_df["text"], event_df["text_stm"]):
#    print(raw)
#    print(stm)
#    print("---------")

In [37]:
event_df.head()

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,immigrant,asylum_seeker,other,date,text_coherent,retweet_count_sum,count,text_no_ne,text_alphanum,text_stm
0,Hootsuite Inc.,PA Ambassador in Bosnia &amp; Herzegovina says...,en,1227019550912372737,2020-02-11 00:00:32+00:00,81136269,1,0,1,0,...,False,False,False,2020-02-11,PA Ambassador in Bosnia & Herzegovina says lif...,1,1,PA Ambassador in Bosnia & Herzegovina says lif...,pa ambassador in bosnia herzegovina says life...,ambassador bosnia herzegovina say life fulfil ...
1,u.fooo.ooo,[🔴 NEWS] Greece plans floating sea border wall...,en,1227019556167864321,2020-02-11 00:00:33+00:00,1052191553802854407,0,0,0,0,...,False,False,False,2020-02-11,[ NEWS] Greece plans floating sea border wall ...,0,1,[ NEWS ] plans floating sea border wall to kee...,news greece plans floating sea border wall to...,news plan floating border wall keep
2,Twitter Web Client,Latest Battle for Idlib Could Send Another Wav...,en,1227021374780313601,2020-02-11 00:07:47+00:00,18570470,0,0,1,1,...,False,False,False,2020-02-11,Latest Battle for Idlib Could Send Another Wav...,8,5,Latest Battle for Idlib Could Send Another Wav...,latest battle for idlib could send another wav...,latest battle idlib send another wave europe w...
3,Tweepsmap,UNHCR calls for decisive action to end alarmin...,en,1227021789525614594,2020-02-11 00:09:26+00:00,62632306,0,0,0,0,...,False,False,False,2020-02-11,UNHCR calls for decisive action to end alarmin...,9,3,calls for decisive action to end alarming cond...,unhcr calls for decisive action to end alarmin...,call decisive action alarming condition island...
4,Twitter for iPhone,"It is not your own feet sinking in the mud, bo...",en,1227022233484308481,2020-02-11 00:11:12+00:00,2729959018,9,1,29,1,...,False,False,False,2020-02-11,"It is not your own feet sinking in the mud, boy.",9,1,"It is not your own feet sinking in the mud , b...",it is not your own feet sinking in the mud boy.,foot sinking


In [None]:
#greece_test = event_df[["id","text","text_stm","date","retweet_count"]]

In [None]:
#greece_test.to_csv('C:/Users/jawo19ad/Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_greece_clean_stm.csv')

## Save the dataframe with clean text

In [None]:
event_df.shape

In [38]:
event_df.to_csv(greece_url_clean)

In [None]:
for i,line in event_df[["text","text_coherent","text_no_ne","text_stm"]].iterrows():
    print(line["text"])
    print("---")
    print(line["text_coherent"])
    print("---")
    print(line["text_no_ne"])
    print("---")
    print(line["text_stm"])
    print("-----------------------------------")

# Test for removing NEs

In [5]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [6]:
text_data = 'This is a text document that speaks about entities like Sweden and Nokia'

document = nlp(text_data)

text_no_namedentities = []

ents = [e.text for e in document.ents]
for item in document:
    if item.text in ents:
        pass
    else:
        text_no_namedentities.append(item.text)
print(" ".join(text_no_namedentities))

This is a text document that speaks about entities like and


In [7]:
document.ents

(Sweden, Nokia)

In [8]:
def remove_named_entities(tweet):
    
    doc = nlp(tweet)
    
    text_no_namedentities = []
    ents = [e.text for e in doc.ents]
    
    print(tweet)
    print(ents)
    
    for item in doc:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    
    return " ".join(text_no_namedentities)

In [9]:
_ = event_df.copy().head(100)

In [10]:
_["test"] = _["text_coherent"].apply(lambda x: remove_named_entities(x))

For rohingya Survivors in Bangladesh, Artwork Bears Witness Thanks to an initiative called " artolution " Muslim refugees from Myanmar are creating impressive murals that reflect their cultural traditions, their needs, their hopes, and their traumas.
['Survivors', 'Bangladesh', 'Muslim', 'Myanmar']
AstraZeneca dispels Indonesian Muslim concerns over COVID - 19 vaccine: . . . built by Bangladesh, where it has relocated nearly 14,000 Rohingya Muslim refugees since December last year despite criticism from rights groups.
['AstraZeneca', 'Indonesian Muslim', 'COVID', 'Bangladesh', 'nearly 14,000', 'Muslim', 'December last year']
I think u are one of the illegally migrant Rohingya from Bangladesh so better keep your mouth shut
['Bangladesh']
India seals Myanmar border amid strains over refugee crisis :
['India', 'Myanmar']
Fleeing coup, Myanmar police refugees in India seek asylum (from The Associated Press )
['Myanmar', 'India', 'The Associated Press']
The Hindu Explains | Why do the Centr

The way we are treating refugees from Myanmar - Rohingya, Chin or any other group - is truly shameful . If you didnt already know, at least now you should know that CAA wasnt about saving human lives; it was about Hindu chauvinism.
['Myanmar - Rohingya', 'Chin', 'Hindu']
The Rice Bags are fighting tooth & nail to get their rice bag brethren from Myanmar into India, but protested when the persecuted Hindus were given asylum in India (CAA). Mizoram isn' t a sovereign state, President' s rule must be imposed upon Mizoram
['Myanmar', 'India', 'Hindus', 'India', "Mizoram isn'", 'Mizoram']
Rohingya displaced people' s boarding the boats between December and April, when the seas are calm, to get to Malaysia, Mostly found themselves stranded at sea, when the ship crews who had promised to take them to Malaysia abandoned them at Bay of Bengal and the Andaman Sea.
['December', 'April', 'Malaysia', 'Malaysia', 'Bay of Bengal', 'the Andaman Sea']
Why do the Centre and the Mizoram government differ

Get to know how Myanmar refugee crisis is impacting the north - eastern states . What is 1951 UN convention on refugees?
['Myanmar', '1951', 'UN']
As Indian government maintains studious silence and deploys security forces on Mizoram border to check " illegal influx " from Myanmar and wants refugees already here deported, Mizoram CM Zoramthanga recognizes CRPH appointed Zin Mar Aung as Myanmar Foreign Minister . myanmarcoup
['Indian', 'Mizoram', 'Myanmar', 'Mizoram CM Zoramthanga', 'Zin Mar Aung', 'Myanmar', 'myanmarcoup']
No doubt . Great Contribution of Bihari for Bangladesh Liberation: Newspaper clips LNS was Hero and had contributed a lot for human rights when Red Cross Society & UNO had not helped Bangladeshi refugees on boarder area . Fist President Nazarul Kazi Ishalam letter Ambassador Letter
['LNS', 'Hero', 'Red Cross Society & UNO', 'Bangladeshi', 'Fist', 'Nazarul Kazi', 'Letter']
Myanmar refugees in India seek asylum
['Myanmar', 'India']
As myanmars Turmoil Gets Worse, Exter

In [11]:
_[["text","test"]]

Unnamed: 0,text,test
0,"For #Rohingya Survivors in Bangladesh, Artwork...","For rohingya in , Artwork Bears Witness Thanks..."
1,AstraZeneca dispels Indonesian Muslim concerns...,dispels Indonesian concerns over - 19 vaccine ...
2,@prabha_j @MehHarshil @derekobrienmp I think u...,I think u are one of the illegally migrant Roh...
3,India seals Myanmar border amid strains over r...,seals border amid strains over refugee crisis :
4,"Fleeing coup, Myanmar police refugees in India...","Fleeing coup , police refugees in seek asylum ..."
...,...,...
95,@3mpereira @Rohingya_ISCG @UNHCR_BGD @IOMBangl...,"He hired them and contracted with . Then , sus..."
96,Delhi is becoming crime capital due to these ...,is becoming crime capital due to these goons &...
97,Among the respondents who sent earnings back h...,Among the respondents who sent earnings back h...
98,Unveiling The “Un-heard”: Remembering Brutal B...,Unveiling The - heard : Remembering Brutal Ban...
