# Candidate merging and related preprocessing


Import relevant packages for the following parts

In [3]:
#python libraries
import numpy as np
import pandas as pd
import os
import re
import gensim

import time
from tqdm import tqdm

from collections import Counter, defaultdict


# self written modules
import preprocessing


## 1. Import the data

In [4]:
tigray_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_tigray.csv" # location of Tigray dataset
greece_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_greece.csv" # location of Greece dataset
rohingya_url = r"Dropbox (CBS)/Master thesis data/Event Dataframes/df_rohingya.csv" # location of Rohingya dataset
all_url = r"Dropbox (CBS)/Master thesis data/df_tweets.csv" # for all tweets

def read_event_df(data_url):
    directory_path = os.getcwd() + "/../../../" + data_url 
    event_df = pd.read_csv(directory_path, index_col=0)
    event_df.reset_index(drop=True, inplace=True)
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

# pick the df 
event_df = read_event_df(tigray_url)
#channel_df = read_event_df(channel_url)
tqdm.pandas()
event_df['text_clean'] = event_df['text'].progress_apply(preprocessing.preprocess_tweets)

  0%|▎                                                                           | 281/78450 [00:00<00:55, 1417.20it/s]

loaded 78450 tweets!


100%|██████████████████████████████████████████████████████████████████████████| 78450/78450 [00:51<00:00, 1532.01it/s]


In [5]:
FILE_PATH = os.getcwd() + "/../../Dropbox (CBS)/Master thesis data"
USERS_PATH = FILE_PATH + "/df_users.csv"
# Read the users csv

print("loading users dataframe...")
df_users = pd.read_csv(USERS_PATH)

# Drop unnecessary index column
df_users.drop("Unnamed: 0", axis=1, inplace=True)

df_users.head()

# Create dict that maps usernames to actual names
mapping = dict(df_users[["username","name"]].values)
mapping = {f'@{key}': value for key, value in mapping.items()}


def resolve_username_to_name(text):
    new_text = text
    for word in text.split(" "):
        if word in mapping:
            new_text = new_text.replace(word,mapping[word])
    return new_text

#tqdm.pandas()
event_df['text_clean'] = event_df['text_clean'].progress_apply(resolve_username_to_name)

loading users dataframe...


100%|█████████████████████████████████████████████████████████████████████████| 78450/78450 [00:01<00:00, 78449.53it/s]


## Train our own event-specific Word2Vec model

In [4]:
#fuzzy duplicate removal (removes 100% duplicates before expensive operations) 
#done on dataframe level, we want to keep the ID column to match later on

# the comparison will be done on lowercased texts consisting of only letters, digits and spaces
event_df['text_clean'] = event_df['text_clean'].progress_apply(lambda tweet:re.sub(r'[^A-Za-z0-9 ]+', '', tweet.lower()))
unique_tweets_df = preprocessing.fuzzy_duplicate_removal(event_df)
unique_tweets_df

100%|███████████████████████████████████████████████████████████████████████| 118696/118696 [00:01<00:00, 62818.61it/s]


Tweets at the start: 118696
Tweets after 100% duplicates removed: 91787
calculating similarities across documents...


3636it [00:00, 36018.51it/s]

Similarity calculation completed in 724.0583999156952 seconds
removing fuzzy duplicates...


167311it [00:04, 36867.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupl_removed['is_dup'][i] = True


81327 tweets left after 70.0% similar tweets (by cosine similarity) removed


Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,year,calendar_week,year_month,year_calendar_week,refugee,migrant,immigrant,asylum_seeker,other,is_dup
0,saperduper_robots,Migrants clash with Greek police as they exit ...,en,1233904804134555649,2020-03-01 00:00:04+00:00,108944513,3,0,1,1,...,2020,8,2020_3,2020_08,False,True,False,False,False,False
1,Loomly,"NATO urges Syria, Russia to halt airstrikes as...",en,1233904855774683136,2020-03-01 00:00:17+00:00,4717892303,12,5,14,3,...,2020,8,2020_3,2020_08,False,True,False,False,False,False
2,Twitter Web App,Good on Greece!!....Close all OUR Borders!!......,en,1233904867078393856,2020-03-01 00:00:19+00:00,1067006507189886976,2,1,4,0,...,2020,8,2020_3,2020_08,False,False,False,False,False,False
3,Hootsuite Inc.,Increased risk of #MaternalDeath in immigrants...,en,1233904894425419776,2020-03-01 00:00:26+00:00,3397107171,1,0,2,0,...,2020,8,2020_3,2020_08,False,False,True,False,False,False
4,Twitter for iPhone,@OhGodPlsNOO @ilknurdarendeli @AFP Europe has ...,en,1233904905108283393,2020-03-01 00:00:28+00:00,1188172981530435584,0,0,0,1,...,2020,8,2020_3,2020_08,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118688,Twitter for Android,@AngeloDDuca @GiuseppeConteIT @IslamSeries @Mi...,en,1241511809389137920,2020-03-21 23:47:36+00:00,738486583725305857,2,0,0,0,...,2020,11,2020_3,2020_11,True,False,False,False,False,False
118689,dlvr.it,Coronavirus risks taking heavy toll on migrant...,en,1241512682513002497,2020-03-21 23:51:04+00:00,2985565615,0,0,0,0,...,2020,11,2020_3,2020_11,False,True,False,False,False,False
118691,Twitter Web App,@ConanOBrien @TheEllenShow @IngrahamAngle @Nob...,en,1241513000147664896,2020-03-21 23:52:20+00:00,309530329,0,2,1,0,...,2020,11,2020_3,2020_11,False,False,False,False,False,False
118694,Tweepsmap,RT @WRRoute\n\nNatasha Dailiani calls upon med...,en,1241514169532321793,2020-03-21 23:56:58+00:00,62632306,3,0,1,0,...,2020,11,2020_3,2020_11,False,False,False,False,False,False


In [5]:
from gensim.models.phrases import Phrases,ENGLISH_CONNECTOR_WORDS

tweet_corpus_tokens = [tweet.split() for tweet in unique_tweets_df['text_clean']]
#tweet_corpus_tokens
bigram = Phrases(tweet_corpus_tokens, min_count=25, threshold=10,connector_words=ENGLISH_CONNECTOR_WORDS) # higher threshold fewer phrases.
trigram = Phrases(bigram[tweet_corpus_tokens],min_count=25, threshold=10,connector_words=ENGLISH_CONNECTOR_WORDS) 


trigram.vocab

{'migrants_clash': 95,
 'greek_police': 1575,
 'migrants_clash_with_greek_police': 74,
 'as': 11266,
 'greek_police_as': 33,
 'they': 19847,
 'as_they': 467,
 'exit': 78,
 'they_exit': 2,
 'turkey': 34257,
 'exit_turkey': 6,
 'bbc_news': 372,
 'turkey_bbc_news': 6,
 'nato': 1526,
 'urges': 159,
 'nato_urges': 6,
 'syria': 10283,
 'urges_syria': 6,
 'russia': 1989,
 'syria_russia': 72,
 'halt': 102,
 'russia_to_halt': 5,
 'airstrikes': 38,
 'halt_airstrikes': 5,
 'airstrikes_as': 5,
 'migrants': 20502,
 'as_migrants': 398,
 'move': 706,
 'migrants_move': 12,
 'westward': 7,
 'move_westward': 6,
 'good': 1292,
 'greece': 29042,
 'good_on_greece': 5,
 'close': 696,
 'greece_close': 4,
 'all': 8366,
 'close_all': 17,
 'our': 4225,
 'all_our': 39,
 'borders': 7033,
 'our_borders': 403,
 'let': 1653,
 'borders_let': 7,
 'let_turkey': 44,
 'pm': 285,
 'turkey_and_pm': 1,
 'erdogen': 14,
 'pm_erdogen': 3,
 'take': 3336,
 'erdogen_take': 3,
 'take_all': 113,
 'these': 4820,
 'all_these': 371,
 

In [None]:
# OPTIONAL: use this only if training of ngram models is complete, it only serves purpose of saving memory
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [6]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords

model_phrases = [trigram[tweet] for tweet in tweet_corpus_tokens]
print(model_phrases[:10])

[['migrants_clash', 'with', 'greek_police', 'as', 'they', 'exit', 'turkey', 'bbc_news'], ['nato', 'urges', 'syria', 'russia', 'to', 'halt', 'airstrikes', 'as', 'migrants', 'move', 'westward'], ['good', 'on', 'greece', 'close', 'all', 'our', 'borders', 'let', 'turkey', 'and', 'pm', 'erdogen', 'take', 'all', 'these', 'islamic', 'refugees', 'there', 'are', 'a', 'lot', 'of', 'nice', 'safe', 'wealthy', 'islamic', 'places', 'they', 'can', 'go', 'to', 'make', 'them', 'take', 'them', 'not', 'greece', 'or', 'any', 'of', 'our', 'countries', 'please', 'retweet', 'this'], ['increased', 'risk', 'of', 'maternal', 'death', 'in', 'immigrants', 'in', 'europe', 'the', 'usa', 'risk', 'depends', 'on', 'maternal', 'birthplace', 'region', 'where', 'prenatal', 'checkups', 'delivery', 'took', 'place', 'other', 'maternal', 'characteristics', 'such_as', 'age'], ['europe', 'has', 'three', 'choices', 'first', 'take', 'refugees', 'in', 'second', 'help', 'turkey', 'to', 'create', 'a', 'buffer_zone', 'in', 'syria', 

In [None]:
# FINDING BEST PARAMETERS FOR WORD2VEC MODEL

#negatives = [5,10,20]
sizes = [100,200,300]
sgs=[0,1]
windows =[3,5] 
#cbow_means = [0,1]
#iters=[10]

for size in sizes:
        for window in windows:
            #print(f'\nfor params size={size},negative={neg},sg={sg},hs={hs},window={window},cbow_mean={cbow},iter={it}')
            print(f'\nfor params size={size},window={window}')
            model = Word2Vec(model_phrases,vector_size=size,window=window)
            print(model.wv.most_similar('refugees'))


In [17]:
import gensim.downloader as api
from gensim.models import Word2Vec

model = Word2Vec(model_phrases,vector_size=300,window=3,sg=0)
print('first model done')
#model2 = api.load("glove-twitter-200")
#print('second model done')
#model3 = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\nikodemicek\Dropbox (CBS)\Master thesis data\GoogleNews-vectors-negative300.bin.gz', binary=True)
#print('third model done')



first model done


In [None]:
model.wv.most_similar('migrant')

In [7]:
from gensim.models import FastText


model2 = FastText(vector_size=100, window=4, min_count=5, sentences=model_phrases, epochs=10)

In [9]:
model2.wv.most_similar('refugee')

[('refugess', 0.8553880453109741),
 ('refugee_boy', 0.8527049422264099),
 ('refuge', 0.8440694808959961),
 ('refugeesgr', 0.8257689476013184),
 ('refugee_flows', 0.8112236857414246),
 ('womenrefugeeroute', 0.8065913915634155),
 ('refugee_flow', 0.8042846918106079),
 ('largest_refugee', 0.8042380809783936),
 ('refugeesand', 0.8039513230323792),
 ('refugee_issue', 0.7974311113357544)]

In [11]:
model2.wv.similarity('refugee','migrant')

0.6360404

In [14]:
pickle_file('moria_fasttext_model',model2)

## 3. We instantiate stanza english language module

In [4]:
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ needed when running first time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

#stanza.download("en")

#stanza.install_corenlp()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# loading the pipeline
en_nlp = stanza.Pipeline("en", tokenize_pretokenized=True, ner_batch_size=4096)

2021-05-17 14:52:56 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-05-17 14:52:56 INFO: Use device: cpu
2021-05-17 14:52:56 INFO: Loading: tokenize
2021-05-17 14:52:56 INFO: Loading: pos
2021-05-17 14:52:57 INFO: Loading: lemma
2021-05-17 14:52:57 INFO: Loading: depparse
2021-05-17 14:52:58 INFO: Loading: sentiment
2021-05-17 14:52:59 INFO: Loading: ner
2021-05-17 14:53:01 INFO: Done loading processors!


In [24]:
import pickle

def pickle_file(file_name, file_to_dump):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path +  fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, 'wb') as fp:
        pickle.dump(file_to_dump, fp)

def load_pickle(file_name):
    directory_path = os.getcwd() + "/../../../"
    folder_name = file_name.split('_')[0]
    file_path = directory_path + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(file_path, "rb") as input_file:
        return pickle.load(input_file)

In [21]:
def load_event_data(event_name):
    assert event_name in ['moria','tigray','channel','all','beirut'], f"Oh no! We do not analyze {event_name} event"
    
    print(f'Loading {event_name} data...')
    try:
        #sample = 2000
        event_np_list = load_pickle(event_name + '_np_list')#[1000:sample]
        event_crf_list = load_pickle(event_name + '_crf_list')#[1000:sample]
        event_tagged_tweets = load_pickle(event_name + '_tagged_tweets')#[1000:sample]
        
        return event_np_list,event_crf_list,event_tagged_tweets
    except:
        print(f'The {event_name} files not found! Run candidate_extraction.py file on the {eventname}_df')
        return None


In [17]:
def pipeline2(event_name,np_list):
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 1. LOAD THE DATA ~~~~~~~~~~~~~~~~~~~~~
    event_np_list,event_crf_list,event_tagged_tweets = load_event_data(event_name)
    event_np_list = np_list
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 2. GET POS AND NER TAGS ~~~~~~~~~~~~~~~~~~~~~
    # get easily accessible list of tuples (POS-tags of each word, NER-tags of each named entity) 
    tweet_tags = cand_prep.get_tweet_tags(event_tagged_tweets) 
    
    
    ####  ~~~~~~~~~~~~~~~~~~~~~ 3. PREPROCESS CANDIDATES ~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ processing of noun phrases ~~~~~~~~~~~~~~~~~~~~~
    print(f'Processing {event_name} noun phrase candidates...')
    
    tqdm.pandas()
    # remove NP candidates longer than threshold and remove all child NPs of parent NPs
    event_np_list = cand_prep.remove_long_nps(event_np_list)
    event_np_list = cand_prep.remove_child_nps(event_np_list) 
    #event_np_list = remove_weird_chars(event_np_list)
    event_np_list = cand_prep.remove_char(event_np_list,'@')

    event_np_list = [['no_candidate'] if len(noun_ps)==0 or noun_ps ==' ' else noun_ps for noun_ps in event_np_list ]
    
    #print(event_np_list)
    print(f'Tagging {event_name} noun phrase candidates...')
    #tag all tweets and save them in a list    

    #tagged_np_cands = batched_np_list.progress_apply(en_nlp)
    tagged_np_cands = [en_nlp('\n\n'.join(tweet_batch)) for tweet_batch in tqdm(event_np_list)]
    #tagged_np_cands = [tagged_cand for tagged_cand in tqdm(batch(batched_np_list, en_nlp, batch_size=6000))]

    np_cand_heads = [cand_prep.get_cand_heads(tweet_cands) for tweet_cands in tagged_np_cands]
    #print(np_cand_heads)
    
    np_and_cand_list = cand_prep.get_cand_type(event_np_list,np_cand_heads, tweet_tags)
    #print(event_np_list)
          
          
    # ~~~~~~~~~~~~~~~~~~~~ combining candidate lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #concatenate corefs and noun phrase lists
    nps_cands = [cand for cands in np_and_cand_list for cand in cands]
    #candidate_list = coref_and_cand_list + np_and_cand_list

    #unpack list of lists into one list
    candidate_list = nps_cands
          
    nps_tagged = [sent for tagged_cand in tagged_np_cands for sent in tagged_cand.sentences ]

    all_cands_tagged = nps_tagged

        
    #print(len(candidate_list),'vs', len(all_cands_tagged))
    cand_df = pd.DataFrame(
        {'candidates': candidate_list,
         'cand_tags': all_cands_tagged
        })

    cand_df['cand_text'] = cand_df.candidates.apply(lambda x: x[0])
    cand_df['cand_len'] = cand_df.cand_text.apply(lambda x: len(x.split()))


    count_cands = Counter(cand_df['cand_text'])
    cand_df['cand_freq'] = cand_df["cand_text"].map(count_cands)
    
    #count_cands[cand_df['cand_text']]
    #count_sorted = sorted(count_cands.items(),key=lambda x: x[1],reverse=True)
    cand_df.columns = cand_df.columns.str.strip()
    
          
    # we sort the candidates by their length
    cand_df.sort_values('cand_freq', ascending=False,inplace=True)

    #cand_df = cand_df[cand_df.cand_text not in  ['no_candidate', 'candidate_to_be_removed']]

    cand_df.reset_index(drop=True, inplace = True)
    #remove dummy candidates that were used to avoid errors

    
    cand_df = cand_df[cand_df.cand_text != 'candidate_to_be_removed']
    cand_df = cand_df[cand_df.cand_text != 'no_candidate']
    print(len(cand_df))    
    cand_df.reset_index(drop=True,inplace=True)
          
    return cand_df
          
#import random

#random.seed(42)
#np_list_sample = random.sample(np_list,10000)
event_cands = pipeline2('moria',np_list)

#pickle_file('moria_cands_df', moria_cands)

Loading moria data...


100%|███████████████████████████████████████████████████████████████████████████| 92806/92806 [08:40<00:00, 178.39it/s]


Processing moria noun phrase candidates...
removing long candidates...
Removed 0 candidates longer than 9 words!
removing child NP candidates...
Removed 0 child NP candidates!


  0%|                                                                                        | 0/92806 [00:00<?, ?it/s]

Tagging moria noun phrase candidates...


100%|█████████████████████████████████████████████████████████████████████████| 92806/92806 [11:17:03<00:00,  2.28it/s]
100%|████████████████████████████████████████████████████████████████████████████| 92806/92806 [16:39<00:00, 92.86it/s]


751019


Candidates as identified by stanza library still have a lot of noise that we want to remove so candidates merge better and we can throw away candidates that do not carry any information valuable for our analysis.

In [None]:
#Finally the candidates are cleaned before storing in a file prior to merging

from nltk.corpus import stopwords

def clean_cands(event_cands):
    """
    Applying cleaning steps on candidates and engineering some features:
     1. creating a column with length of the tweet (in chars)
     2. lowercase the candidate information in the tuple with cand, candidate representative head and set of phrases heads
     3. extract candidate text and keep only alphanumeric chars
     4. remove candidates that are stopwords
     5. remove candidates that are only numeric
     6. remove candidates that are only 1 char long
     """
    def clean_cand(cand):
        cand = list(cand)
        cand[0] = re.sub(r'[^A-Za-z0-9 ]+', '', cand[0].lower())
        cand[1] = re.sub(r'[^A-Za-z0-9 ]+', '', cand[1].lower())
        cand[2] = set([re.sub(r'[^A-Za-z0-9 ]+', '', phrase_word.lower()) for phrase_word in cand[2]])

        return tuple(cand)

    #stopwords
    tqdm.pandas()
    event_cands_clean = event_cands.copy()
    
    
    event_cands_clean['candidates'] = event_cands_clean['candidates'].progress_apply(clean_cand)
    
    event_cands_clean['cand_text'] = event_cands_clean['cand_text'].progress_apply(lambda x:re.sub(r'[^A-Za-z0-9 ]+', '', x.lower()).strip())
    event_cands_clean = event_cands_clean[~event_cands_clean['cand_text'].isin(stopwords.words('english'))]
    event_cands_clean['pure_chars'] = event_cands_clean['cand_text'].progress_apply(lambda x: x.replace(' ', ''))
    event_cands_clean = event_cands_clean[~event_cands_clean['pure_chars'].str.isnumeric()]
    event_cands_clean.drop('pure_chars',axis=1,inplace=True)
    
    event_cands_clean['string_len'] = event_cands_clean['cand_text'].progress_apply(len)
    event_cands_clean = event_cands_clean[event_cands_clean['string_len']>1]
    event_cands_clean = event_cands_clean.drop_duplicates(subset = ["cand_text"])
    event_cands_clean.reset_index(drop=True, inplace=True)
    print(f'The event has  {len(event_cands_clean)} unique candidates after cleaning')
    return event_cands_clean

event_cands_clean = clean_cands(event_cands)

In [18]:
pickle_file('moria_short_cands', event_cands_clean)

In [22]:
event_cands = load_pickle('moria_short_cands')

## 4. We apply stanza module on the tweets to get NER and POS tags. We do it in batches to speed things up.

## 5. As initial WCL candidates, we extract noun phrases (NPs) and coreference chains.

## We do so using CoreNLPClient wrapper

### SOME PREPROCESSING NEEDED
* remove links - check
* remove # from hashtags? - check
* remove/merge mentions? - check


* remove recurring texts (signatures of news media) - any new spotted should be added in preprocessing file's '__remove_tweet_signatures__' function
* remove posts of some accounts (refugee_list)
* exclude NERs that tag numbers - should we mark phrase as NE if the head is not NE? - check
* play around with candidate types
* optimize code and make it neater



## 6. We keep only NPs shorter than 20 words and remove children of parent NPs 

## 7. We get the heads of noun phrases (in batches)

## 8. We define candidate types 

## 9. We assign candidate types to noun phrase candidates

## 10. We get coreference chains candidates from the tweet corpus

## 11. We determine candidate's type for representative mentions of coref candidates (in batches)

## 12. We combine the candidate lists for candidate merging

We organize candidates in a list sorted by their number of phrases

In [47]:
event_cands_merged = load_pickle("moria_short_cands_merged")
merged_dict = load_pickle("moria_short_whatmerged2")

## Frame identification

In [48]:
# from paper Shifting the refugee narratives? by Greussing & Boomgaarden (2015)
frame_properties = {'settlement':['settlement','accomodation','permanent','temporary','barracks','accommodated','shelter'],
                   'reception':['quota', 'distribution', 'limit', 'selection','reception','together','asylum','receive'],
                    'security':['security', 'border','crossing','fence','control','flow'],
                    'criminality':['officer','terror','suspicion','crime','offense','police','trafficking','suspect'],
                    'economisation':['euro','economic','million','thousand','cost','money'],
                    'humanitarian':['humane','voluntary','help','support','aid','care','solidarity'],
                    'victimization':['fight','victim','war','dead','rescued','state'],
                    'integration': ['labour','employed','unemployed','integration','positive'],
                    
                    #from hamborg
                    'affection':['affection','attachment', 'devotion', 'fondness','love','passion'],
                    'refusal': ['refusal','declination','denial','disallowance','nay','no'],
                    'trustworthiness':['trustworthiness','integrity','accuracy','credibility','authenticity','fairness'],
                    'no trustworthiness':['falsehood','dishonesty','unfairness','deceit','corruption'],
                    'reason': ['reason','logic','sense','rationale','argument','justification'],
                    'irrationality': ['unreason','irrationality','fallaciousness','unsoundness'],
                    'easiness': ['easiness','simplicity','obviousness','ease','comfort'],
                    'difficulty': ['difficulty','adversity','hardship','crisis','obstacle','trouble' ],
                    'honor': ['honor', 'dignity','esteem','reputation','praise'],
                    'dishonor': ['disgrace','dishonor','reproach','opprobrium']
                   
                   }



In [49]:
#import conceptnet_lite as cn
import gensim
import gensim.downloader as api

model = gensim.models.KeyedVectors.load_word2vec_format(r"C:/Users/nikodemicek/Dropbox (CBS)/Master thesis data/GoogleNews-vectors-negative300.bin.gz", binary=True)

manual_cands = ['refugee','migrant','greece','turkey','syria','beirut','immigrant','aoun']


# to run on the server we should use larger model according to the paper - "conceptnet-numberbatch-17-06-300"
#model = api.load("glove-twitter-200")


In [45]:
from sklearn.cluster import AffinityPropagation

model = load_pickle('moria_w2v_model')
tqdm.pandas()
event_cands_merged['avg_vec'] = event_cands_merged['candidates'].progress_apply(lambda x: phrase_heads_avg_vector(x[2]))
event_cands_merged

100%|███████████████████████████████████████████████████████████████████████████| 5886/5886 [00:00<00:00, 39270.69it/s]


Unnamed: 0,candidates,cand_tags,cand_text,cand_len,cand_freq,string_len,avg_vec
0,"(Greece, Greece, {Greece}, misc)","[\n {\n ""id"": 1,\n ""text"": ""Greece"",\n ...",Greece,1,2066,6,"[0.29665634, -0.7863764, 1.0028423, -0.2250602..."
1,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,1156,8,"[-0.92243963, 0.7983021, 0.7725275, -0.3225837..."
2,"(fire, fire, {fire}, misc)","[\n {\n ""id"": 1,\n ""text"": ""fire"",\n ...",fire,1,778,4,"[1.0113711, 0.5400115, -2.0947576, 0.044214696..."
3,"(Turkey, Turkey, {Turkey}, loc-ne)","[\n {\n ""id"": 1,\n ""text"": ""Turkey"",\n ...",Turkey,1,545,6,"[-1.0134584, -0.9566454, 1.0516194, -0.4690687..."
4,"(migrants, migrants, {migrants}, misc)","[\n {\n ""id"": 1,\n ""text"": ""migrants"",\...",migrants,1,488,8,"[-0.3621457, 0.042272426, 0.37909022, -0.82739..."
...,...,...,...,...,...,...,...
5881,"(newsupdate, newsupdate, {newsupdate}, misc)","[\n {\n ""id"": 1,\n ""text"": ""newsupdate""...",newsupdate,1,1,10,
5882,"(western detention facilties, facilties, {faci...","[\n {\n ""id"": 1,\n ""text"": ""western"",\n...",western detention facilties,3,1,27,
5883,"(The child separations?, separations?, {separa...","[\n {\n ""id"": 1,\n ""text"": ""The"",\n ...",The child separations,3,1,22,
5884,"(Australias, Australias, {Australias}, misc)","[\n {\n ""id"": 1,\n ""text"": ""Australias""...",Australias,1,1,10,"[0.008103137, 0.036747757, 0.01575262, 0.00544..."


In [50]:
ecm = event_cands_merged.dropna()
ecm

Unnamed: 0,candidates,cand_tags,cand_text,cand_len,cand_freq,string_len,avg_vec
0,"(Greece, Greece, {Greece}, misc)","[\n {\n ""id"": 1,\n ""text"": ""Greece"",\n ...",Greece,1,2066,6,"[0.29665634, -0.7863764, 1.0028423, -0.2250602..."
1,"(refugees, refugees, {refugees}, misc)","[\n {\n ""id"": 1,\n ""text"": ""refugees"",\...",refugees,1,1156,8,"[-0.92243963, 0.7983021, 0.7725275, -0.3225837..."
2,"(fire, fire, {fire}, misc)","[\n {\n ""id"": 1,\n ""text"": ""fire"",\n ...",fire,1,778,4,"[1.0113711, 0.5400115, -2.0947576, 0.044214696..."
3,"(Turkey, Turkey, {Turkey}, loc-ne)","[\n {\n ""id"": 1,\n ""text"": ""Turkey"",\n ...",Turkey,1,545,6,"[-1.0134584, -0.9566454, 1.0516194, -0.4690687..."
4,"(migrants, migrants, {migrants}, misc)","[\n {\n ""id"": 1,\n ""text"": ""migrants"",\...",migrants,1,488,8,"[-0.3621457, 0.042272426, 0.37909022, -0.82739..."
...,...,...,...,...,...,...,...
5874,"(France' s, France', {France', s}, misc)","[\n {\n ""id"": 1,\n ""text"": ""France'"",\n...",France s,2,1,9,"[0.44062924, 0.55070215, -0.049154773, 0.35485..."
5875,"(battering ram (again), battering, {battering,...","[\n {\n ""id"": 1,\n ""text"": ""battering"",...",battering ram again,3,1,21,"[-0.008449046, 0.04736015, 0.028447097, 0.0139..."
5879,"(Yall, Yall, {Yall}, misc)","[\n {\n ""id"": 1,\n ""text"": ""Yall"",\n ...",Yall,1,1,4,"[-0.107103385, 0.2241038, 0.0412581, 0.0283824..."
5880,"(this statement insinuate, insinuate, {insinua...","[\n {\n ""id"": 1,\n ""text"": ""this"",\n ...",this statement insinuate,3,1,24,"[-0.016040474, 0.567505, 0.2532831, 0.31219295..."


In [61]:
from sklearn.cluster import AffinityPropagation
import numpy as np
X = np.array(list(ecm['avg_vec']))
t0 = time.time()
clustering = AffinityPropagation(damping = 0.95, max_iter=1000, convergence_iter=10,verbose=True, random_state=42).fit(X)
print(f'it took {time.time()-t0} seconds')

ecm['label'] = clustering.labels_

Converged after 328 iterations.
it took 165.0170841217041 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [85]:
ecm[ecm['label']==201]

Unnamed: 0,candidates,cand_tags,cand_text,cand_len,cand_freq,string_len,avg_vec,label
2742,"(safe shelter Have, Have, {shelter, Have}, misc)","[\n {\n ""id"": 1,\n ""text"": ""safe"",\n ...",safe shelter Have,3,1,17,"[-0.04584418, -0.42060813, -0.8032175, 0.38723...",201


In [229]:
event_df= load_pickle('moria_df_with_clusters')
event_df

Unnamed: 0,source,text,lang,id,created_at,author_id,retweet_count,reply_count,like_count,quote_count,...,year_month,year_calendar_week,refugee,migrant,immigrant,asylum_seeker,other,is_dup,text_clean_right,label
0,Twitter Web App,"Canada's immigrant population is 20%, USA is 1...",en,1267244723103690753,2020-06-01 00:01:00+00:00,442949745,1,1,9,1,...,2020_6,2020_22,False,False,True,False,False,False,canada s immigrant population is 20 usa is 13 ...,0.0
1,Twitter Web App,Hi @EUHomeAffairs @Place_Beauvau @BMI_Bund @uk...,en,1267247183725621248,2020-06-01 00:10:47+00:00,211570886,0,0,0,0,...,2020_6,2020_22,True,False,False,False,False,False,hi euhomeaffairs placebeauvau bundesministeriu...,0.0
2,Twitter Web App,"#Greece Dozens of Asylum seekers, who face the...",en,1267251185838407681,2020-06-01 00:26:41+00:00,119888012,12,2,11,0,...,2020_6,2020_22,False,False,False,False,False,False,greece dozens of asylum seekers who face the r...,5.0
3,Twitter Web App,"Hmmm? Maybe not, Spain is the COVID-19 hot spo...",en,1267260557213806599,2020-06-01 01:03:56+00:00,4839872717,0,0,0,0,...,2020_6,2020_22,False,False,True,False,False,False,hmmm maybe not spain is the covid 19 hot spot...,0.0
4,Twitter Web App,"Greece to evict over 10,000 refugees from shel...",en,1267264681108025346,2020-06-01 01:20:19+00:00,1171990967001526272,0,0,0,0,...,2020_6,2020_22,True,False,False,False,False,False,greece to evict over 10000 refugees from shelters,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92801,SocialFlow,UPDATE: Officials say 19 people have been disp...,en,1366162926491598851,2021-02-28 23:06:38+00:00,15357193,1,0,1,0,...,2021_2,2021_08,False,False,False,False,True,False,update officials say 19 people have been displ...,0.0
92802,dlvr.it,19 people displaced from home following apartm...,en,1366163153772404739,2021-02-28 23:07:32+00:00,133496245,0,0,0,0,...,2021_2,2021_08,False,False,False,False,True,False,19 people displaced from home following apartm...,0.0
92803,TweetDeck,we all hope a better year that we can touch ea...,en,1366166599800139780,2021-02-28 23:21:14+00:00,75055396,0,0,0,0,...,2021_2,2021_08,True,True,False,False,False,False,we all hope a better year that we can touch ea...,3.0
92804,Nicholas Franklin,HEADLINE: Greece migrants: Afghan father charg...,en,1366169485162336257,2021-02-28 23:32:42+00:00,205329051,0,0,0,0,...,2021_2,2021_08,False,False,False,False,False,True,,
