In [1]:
import pandas as pd
import os
import re
from tqdm import tqdm

import candidate_extraction as cand_ex

import preprocessor
from ekphrasis.classes.tokenizer import Tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer 
from ekphrasis.classes.segmenter import Segmenter
#from ekphrasis.classes.spellcorrect import SpellCorrector

# Shared text-processing objects used by clean_tweet() further down.
# The ekphrasis Tokenizer is used instead of a plain split because it lets us choose
# which token types are recognised (listed in social_pipeline); lowercase=False keeps
# the original casing of the tweet.
social_pipeline = ["TAG", "EMAIL", "USER", "HASHTAG", "CASHTAG", "PHONE", "PERCENT", "NUMBER","WORD"]
tokenizer = Tokenizer(pipeline = social_pipeline, lowercase=False).tokenize
# Rebuilds a sentence string from the token list once the tokens have been cleaned
detokenizer = TreebankWordDetokenizer()

#spell_cor = SpellCorrector(corpus="english") # spell correction not used 
# Word segmenter applied to hashtag tokens after their '#' has been stripped
seg_eng = Segmenter(corpus="english") 

# preprocessor.clean() should remove emojis and urls in the tweets (only these two options are enabled)
preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.EMOJI)

Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [2]:
# Locations of the event datasets. read_event_df() resolves each of these relative to the
# directory three levels above the current working directory.
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_tigray.csv" # location of Tigray dataset
greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_greece.csv" # location of Greece dataset
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_rohingya.csv" # location of Rohingya dataset
moria_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_moria.csv" # location of Moria dataset (for testing)

def read_event_df(data_url):
    """Load one event dataset and return it with a fresh 0..n-1 index.

    data_url is appended to "<cwd>/../../../", so it must be given relative to the
    directory three levels above the current working directory. The first CSV
    column is read as the index and then discarded. The number of loaded rows is
    printed as a quick sanity check.
    """
    base_dir = os.getcwd() + "/../../../"
    csv_path = base_dir + data_url

    # first column is a stored index: read it as such, then throw it away
    tweets = pd.read_csv(csv_path, index_col=0).reset_index(drop=True)

    print(f'loaded {len(tweets)} tweets!')
    return tweets

# pick the df: pass tigray_url, greece_url, rohingya_url or moria_url to choose the event.
# NOTE: the output file name in the final save cell is hard-coded to 'greece_df_clean.csv',
# so it has to be changed by hand when another dataset is selected here.
event_df = read_event_df(greece_url)

loaded 175900 tweets!


## 1. Coherent sentences: Removing non-syntactic information

In [3]:
def _strip_boundary_tags(tokens):
    '''
    Return a copy of `tokens` without the uninformative tags at its edges:
    every leading @mention, and a trailing run of @mentions or #hashtags
    when that run is at least two tokens long.
    '''
    tokens = list(tokens)

    # mentions at the very beginning of the tweet are dropped one by one
    while tokens and tokens[0].startswith('@'):
        del tokens[0]

    # a trailing run is only dropped when it consists of >= 2 consecutive tags;
    # pop() removes the LAST element, whereas list.remove(value) would delete the
    # first equal token and could eat an informative tag in the middle of the tweet
    for marker in ('@', '#'):
        if len(tokens) >= 2 and tokens[-1].startswith(marker) and tokens[-2].startswith(marker):
            while tokens and tokens[-1].startswith(marker):
                tokens.pop()

    return tokens


def clean_tweet(tweet):
    '''
    The goal of the function is to yield coherent sentences from raw tweets (without hashtags, URLs, emojis)

    Steps:
      1. preprocessor.clean() strips what was enabled through preprocessor.set_options.
      2. The text is split with the ekphrasis social tokenizer.
      3. Leading mentions and trailing runs (>= 2) of mentions / hashtags are removed.
      4. Remaining hashtags lose their '#' and are passed through the word segmenter.
      5. The tokens are detokenized back into one string.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing text) is
        treated like an empty tweet.

    Returns
    -------
    str
        The cleaned sentence, or 'no_tweet_text' when nothing is left.
    '''
    # missing texts arrive as NaN/None from pandas; treat them like empty tweets
    if not isinstance(tweet, str):
        return 'no_tweet_text'

    #remove emojis, links
    tweet = preprocessor.clean(tweet)

    # we are using social tokenizer from ekphrasis due to potentially improper text structure
    tokens = tokenizer(tweet)

    # heuristic: mentions in the beginning of the tweet and at least 2 consecutive
    # mentions/hashtags at the end of the tweet carry no valuable information
    tokens = _strip_boundary_tags(tokens)

    #for hashtags that may carry information, we remove the # and split the word into more if applicable
    for position, token in enumerate(tokens):
        if token.startswith('#'):
            tokens[position] = seg_eng.segment(token.replace('#', ''))
        # (spell correction was tried here but is disabled: it turned numbers into odd words)

    # instead of " ".join we use detokenizer in order to reconstruct the cleaned sentence in a better way
    cleaned = detokenizer.detokenize(tokens)

    # tweets that end up being empty after preprocessing will cause problems when batching,
    # replace empty tweet with 'no_tweet_text' which we can ignore later
    return cleaned if len(cleaned) > 0 else 'no_tweet_text'


tqdm.pandas() # registers .progress_apply on pandas objects (== .apply with a progress bar)
# Run clean_tweet on every raw tweet and store the result in a new 'text_coherent' column
event_df['text_coherent'] = event_df['text'].progress_apply(clean_tweet)

100%|████████████████████████████████████████████████████████████████████████| 175900/175900 [02:03<00:00, 1425.88it/s]


In [4]:
# Where the users table lives, relative to the current working directory
FILE_PATH = f"{os.getcwd()}/../../../Dropbox (CBS)/Master thesis data"
USERS_PATH = f"{FILE_PATH}/df_users.csv"

print("loading users dataframe...")

# Read the users csv and discard the unnecessary index column in one step
df_users = pd.read_csv(USERS_PATH).drop(columns="Unnamed: 0")

# (shows nothing: only the last expression of a cell is displayed)
df_users.head()

# Lookup table from '@username' to the user's actual name; for a username that
# occurs several times the last row wins
mapping = {
    f'@{username}': name
    for username, name in df_users[["username", "name"]].values
}


def resolve_username_to_name(text, name_map=None):
    '''
    Replace every known @handle in `text` with the name it maps to.

    Only whole handles are replaced: a handle is '@' followed by word characters
    and must not be preceded by a word character, so '@bob' never rewrites part of
    '@bobby' or of an e-mail address, and a handle directly followed by
    punctuation ('@bob,') is still resolved.

    Parameters
    ----------
    text : str
        Text that may contain @handles.
    name_map : dict, optional
        Maps '@handle' to a name. Defaults to the module-level `mapping`.

    Returns
    -------
    str
        `text` with known handles replaced; unknown handles, and handles whose
        mapped value is not a string (e.g. NaN), are left unchanged.
    '''
    if name_map is None:
        name_map = mapping  # module-level '@username' -> name dict

    def _lookup(match):
        handle = match.group(0)
        name = name_map.get(handle)
        # a non-string value cannot be inserted into the text; keep the handle
        return name if isinstance(name, str) else handle

    return re.sub(r'(?<!\w)@\w+', _lookup, text)

#tqdm.pandas()  (not needed again: already registered in the cell that creates 'text_coherent')
# Overwrites 'text_coherent' with the version in which known @usernames are replaced by names
event_df['text_coherent'] = event_df['text_coherent'].progress_apply(resolve_username_to_name)

loading users dataframe...


100%|███████████████████████████████████████████████████████████████████████| 175900/175900 [00:02<00:00, 87252.14it/s]


In [None]:
# this code runs for around 14h per 100k tweets
# NOTE(review): the progress bar saved with this cell estimated ~173h for the 175900 tweets
# at 3.55s/it, so the figure above may be optimistic — confirm before a full run.
# NOTE(review): 'event_corefs_resolved' is not among the columns written by the final save
# cell; presumably it should be saved or cached — verify.
event_df['event_corefs_resolved'] = cand_ex.replace_corefs(event_df['text_coherent'])

2021-05-26 21:17:08 INFO: Writing properties to tmp file: corenlp_server-c3904f7ba6a1489a.props
2021-05-26 21:17:08 INFO: Starting server with command: java -Xmx8G -cp C:\Users\nikodemicek\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 600000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-c3904f7ba6a1489a.props -annotators tokenize,ssplit,pos,parse,coref -preload -outputFormat serialized
  0%|                                                                             | 42/175900 [00:00<07:02, 415.86it/s]

preparing coreference chains...


100%|████████████████████████████████████████████████████████████████████████| 175900/175900 [02:06<00:00, 1394.58it/s]
  0%|                                                                                       | 0/175900 [00:00<?, ?it/s]

resolving coreference chains...


  0%|▏                                                                        | 398/175900 [08:10<173:13:07,  3.55s/it]

## 2. Alphanumeric text: Remove remaining special characters (apart from punctuation) and lowercase the text

In [None]:
#tqdm.pandas()
# Lowercase each tweet, then delete every character that is not a letter, a digit,
# '.', '!', '?' or a space.
# NOTE(review): no visible cell of this notebook creates a 'text_clean' column (the cells
# above produce 'text_coherent' and 'event_corefs_resolved'). Unless the loaded CSV already
# contains 'text_clean', this line raises KeyError — confirm which column is intended.
event_df['text_alphanum'] = event_df['text_clean'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9.!? ]+', '', tweet.lower()))


## 3. Keywords text (TODO: not yet defined)

## Save the dataframe with clean text

In [None]:
# Write the id, date, raw text and both cleaned text variants to a CSV in the working directory.
# The file name is hard-coded for the Greece dataset; change it when another event is loaded.
# The DataFrame index is written as the first column (to_csv default).
# NOTE(review): 'event_corefs_resolved' is not included here — confirm that is intended.
event_df[['id','date','text','text_coherent','text_alphanum']].to_csv('greece_df_clean.csv')