## In this notebook, we assign frame properties to candidates detected in the previous step.

**Necessary files:**
 - event_df (event specific dataframe with cleaned text)
 - frame_properties (dictionary with frames and their related keywords)
 - event_tagged_tweets (annotated tweets using stanza library)
 - model (word2vec model trained on event specific data (or the entire set))
 - cands (dataframe with tagged candidates, their heads, sets of phrases heads and cand_types) 
 - merged_dict (dictionary with merged candidates)
 

In [None]:
from nltk.corpus import stopwords
from collections import defaultdict
import nltk

nltk.download('stopwords')

stop_words = list(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

In [115]:
# run this if  [event]_word_properties does not exist


print('preprocessing tweets...')
tqdm.pandas()
tweets_corpus = list(event_df['text_clean_left'])#.progress_apply(preprocessing.preprocess_tweets))


print('assigning frame properties to words from tweets...')
word_properties = defaultdict(dict)
for i in tqdm(tweets_corpus):
    tweet_words = [word.lower() for word in i.split() if word not in stop_words and len(word)>1]
    for word in tweet_words:
        #print(word)
        word = lemma.lemmatize(word)
        property_list = []
        #print(list(frame_properties.keys()))
        for prop in list(frame_properties.keys()):
            #print(frame_properties[prop])
            
            try:
                #print(f'sim of {word}, {prop} is {model.similarity(word, prop)}')
                weights = [model.similarity(word, seed) for seed in frame_properties[prop]]
                #print(weights)
                if max(weights)>0.4:
                    word_properties[word][prop] = max(weights)
            except KeyError:
                pass
            



print(word_properties)
        

            

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikodemicek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from pandas import Panel
  0%|                                                                                        | 0/92806 [00:00<?, ?it/s]

preprocessing tweets...
assigning frame properties to words from tweets...


100%|████████████████████████████████████████████████████████████████████████████| 92806/92806 [51:55<00:00, 29.79it/s]

defaultdict(<class 'dict'>, {'racism': {'no trustworthiness': 0.45143962}, 'hate': {'affection': 0.6003957}, 'thousand': {'economisation': 1.0}, 'refugee': {'reception': 0.60989624}, 'dozen': {'economisation': 0.47637963}, 'asylum': {'reception': 1.0}, 'homeless': {'settlement': 0.57510066}, 'accommodation': {'settlement': 0.8532543}, 'protest': {'humanitarian': 0.40294492}, 'camp': {'settlement': 1.0}, 'want': {'affection': 0.43143085}, 'shelter': {'settlement': 1.0}, 'dead': {'victimization': 1.0}, 'why': {'reason': 0.61324227}, 'help': {'humanitarian': 1.0}, 'chief': {'criminality': 0.622512}, 'support': {'humanitarian': 1.0}, 'illegal': {'criminality': 0.45195127}, 'arrest': {'criminality': 0.5394254}, 'displaced': {'settlement': 0.47044814}, 'country': {'victimization': 0.43598586}, 'man': {'criminality': 0.5247485, 'victimization': 0.52303016}, 'racist': {'dishonor': 0.4069227}, 'excuse': {'reason': 0.60485476}, 'police': {'criminality': 1.0}, 'unaccounted': {'victimization': 0.4




In [117]:
pickle_file('moria_word_properties',word_properties)

In [50]:
tagged_tweets = load_pickle('moria_tagged_tweets')
word_properties = load_pickle('moria_word_properties')
#coref_chains = load_pickle('moria_crf_list')

#coref_chains[2]

In [51]:
def assign_frame_properties(event_df, tagged_tweets,cands):
    # import these modules 
    from collections import defaultdict
    from nltk.stem import WordNetLemmatizer 
    from nltk.corpus.reader.wordnet import NOUN
    import numpy as np    
    
    def find_biased_words(cand):
        rep_head = lemma.lemmatize(cand[1].lower(),pos=NOUN)
        #add bias by word choice
        add_dataframe_entry(cand,rep_head)
        if rep_head in tagged_tweets[tweet_id].text.lower() and len(rep_head)>1:
            #find all dependencies of the phrase head
            for related in range(len(cand_words)):
                cand_word_lemma = lemma.lemmatize(cand_words[related][1].lower())
                #print(f'Yes it is, related = {lemma.lemmatize(cand_heads[related][1].lower(),pos=NOUN)}')
                if rep_head == cand_word_lemma:
                    related_word = lemma.lemmatize(cand_words[cand_words[related][2]-1][1].lower())
                    #add bias by labeling
                    add_dataframe_entry(cand,related_word) 
    
    def add_dataframe_entry(cand,biased_word):
        cand_frames['word'].append(cand[0])
        cand_frames['date'].append(event_df['date'][tweet_id])
        #cand_frames['word'].append(phrase_head)
        for frame_property in list(frame_properties.keys()):
            #print(frame_property)
            try:
                #print(word_properties[phrase_head][frame_property])
                cand_frames[frame_property].append(word_properties[biased_word][frame_property])
            except KeyError:
                #cand_frames[frame_property].append(word_properties['tent'][frame_property])
                cand_frames[frame_property].append(np.NaN)
                
 
                        
    lemma = WordNetLemmatizer() 
    cand_frames = defaultdict(list)

    framed_words = pd.DataFrame(columns=['word','date',list(frame_properties.keys())])
    event_df[['date','time']] = event_df['created_at'].str.split(' ',expand=True)

    for tweet_id in tqdm(event_df.index):
        
        cand_words = [[word.id, word.text,word.head] for sent in tagged_tweets[tweet_id].sentences for word in sent.words]
        #print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in tagged_tweets[tweet_id].sentences for word in sent.words], sep='\n')
        #print(len(cand_df['candidates']))
        for cand in cands['candidates']:
            """ THIS IS AN IMPORTANT PART, IT HANDLES ASSIGNING BIAS FOR MERGED CANDIDATES
            logic:  first check if the entire candidate is in the tweet. If yes, add bias of its head and all labels.
             Then do the same for every candidate that was merged into the main candidate.
             If the main candidate is not in the tweet, check each candidate that was merged into it.
             If neither the main, nor its merged candidates are in the tweet, use the head of the main candidate only"""
            if cand[0].lower() in tagged_tweets[tweet_id].text.lower() and len(cand[0])>1:
                find_biased_words(cand)
                try:
                    for merged_cand in merged_dict[cand[0].lower()]:
                        if merged_cand[0].lower() in tagged_tweets[tweet_id].text.lower() and len(merged_cand[0])>1:
                            find_biased_words(merged_cand)
                except KeyError:
                    pass

            if cand[0].lower() not in tagged_tweets[tweet_id].text.lower():
                try:
                    for merged_cand in merged_dict[cand[0].lower()]:
                        if merged_cand[0].lower() in tagged_tweets[tweet_id].text.lower() and len(merged_cand[0])>1:
                            find_biased_words(merged_cand)
                except KeyError:
                    pass
            else:
                find_biased_words(cand)
    
    return cand_frames


cand_frames = assign_frame_properties(event_df,tagged_tweets,event_cands_merged)
#cand_frames0 = assign_frame_properties(event_df[event_df['label']==0],tagged_tweets,event_cands_merged4[:100])
#cand_frames1 = assign_frame_properties(event_df[event_df['label']==1],tagged_tweets,event_cands_merged4[:100])
#cand_frames2 = assign_frame_properties(event_df[event_df['label']==2],tagged_tweets,event_cands_merged4[:100])
#cand_frames3 = assign_frame_properties(event_df[event_df['label']==3],tagged_tweets,event_cands_merged4[:100])
#cand_frames4 = assign_frame_properties(event_df[event_df['label']==4],tagged_tweets,event_cands_merged4[:100])
#cand_frames5 = assign_frame_properties(event_df[event_df['label']==5],tagged_tweets,event_cands_merged4[:100])
#cand_frames6 = assign_frame_properties(event_df[event_df['label']==6],tagged_tweets,event_cands_merged4[:100])

    
                
        

100%|██████████████████████████████████████████████████████████████████████████| 92806/92806 [1:22:16<00:00, 18.80it/s]


In [19]:
#print(cand_frames.keys())
framed_words = pd.DataFrame.from_dict(cand_frames)

#framed_words0 = pd.DataFrame.from_dict(cand_frames0)
#framed_words1 = pd.DataFrame.from_dict(cand_frames1)
#framed_words2 = pd.DataFrame.from_dict(cand_frames2)
#framed_words3 = pd.DataFrame.from_dict(cand_frames3)
#framed_words4 = pd.DataFrame.from_dict(cand_frames4)
#framed_words5 = pd.DataFrame.from_dict(cand_frames5)
#framed_words6 = pd.DataFrame.from_dict(cand_frames6)

#framed_words[framed_words['word']=='migrants'].tail(50)

#framed_words = framed_words.dropna(subset=['settlement', 'reception', 'security', 'criminality', 'economisation', 'humanitarian', 'victimization', 'integration', 'affection', 'refusal', 'trustworthiness', 'no trustworthiness', 'reason', 'unreason/irrationality', 'easiness', 'difficulty', 'honor', 'dishonor'],how='all')

framed_words

Unnamed: 0,word,date,settlement,reception,security,criminality,economisation,humanitarian,victimization,integration,affection,refusal,trustworthiness,no trustworthiness,reason,irrationality,easiness,difficulty,honor,dishonor
0,fire,2020-06-01,,,,,,,,,,,,,,,,,,
1,fire,2020-06-01,,,,,,,,,,,,,,,,,,
2,fire,2020-06-01,,,,,,,,,,,,,,,,,,
3,fire,2020-06-01,,,,,,,,,,,,,,,,,,
4,fire,2020-06-01,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6070703,CA,2021-02-28,,,,,,,,,,,,,,,,,,
6070704,NC,2021-02-28,,,,,,,,,,,,,,,,,,
6070705,NC,2021-02-28,,,,,,,,,,,,,,,,,,
6070706,NY,2021-02-28,,,,,,,,,,,,,,,,,,


In [21]:
framed_words['word'].value_counts().nlargest(100)

And        205456
GR         147800
Greece     146876
He         144550
fire       139706
            ...  
Germany     13086
the EU      13038
MPs         12964
need        12818
war         12764
Name: word, Length: 100, dtype: int64

In [348]:
the_word = 'refugee'

def aggregate_frames(framed_words):
    merged_frames = framed_words.copy()

    #framed_words[framed_words['date']=='2020-09-04']

    merged_frames['no trustworthiness'] = - merged_frames['no trustworthiness']
    merged_frames['refusal'] = - merged_frames['refusal']
    merged_frames['irrationality'] = -merged_frames['irrationality']
    merged_frames['difficulty'] = -merged_frames['difficulty'] 
    merged_frames['dishonor'] = -merged_frames['dishonor']


    trust = ['trustworthiness', 'no trustworthiness']
    honor = ['honor', 'dishonor']
    affection = ['affection','refusal']
    reason = ['reason','irrationality']
    easiness = ['easiness','difficulty']

    merged_frames = pd.lreshape(merged_frames,
                            {'reason':reason, 'honor':honor,'affection':affection,'trust':trust,'easiness':easiness},
                           dropna=False)


    merged_frames = merged_frames[merged_frames['word'] == the_word]
    
    merged_frames['week'] = pd.to_datetime(merged_frames['date'], format='%Y-%m-%d').dt.strftime('%W')
    #aggr_frames = merged_frames.groupby(['word','week'],as_index=False).mean()
    aggr_frames = merged_frames.groupby(['word'],as_index=False).mean()
    frame_size = merged_frames.groupby(['word','week'],as_index=False).size()

    return aggr_frames,frame_size

aggr_frames,_ = aggregate_frames(framed_words)
aggr_frames0,size0 = aggregate_frames(framed_words0)
aggr_frames1,size1 = aggregate_frames(framed_words1)
aggr_frames2,size2 = aggregate_frames(framed_words2)
aggr_frames3,size3 = aggregate_frames(framed_words3)
aggr_frames4,size4 = aggregate_frames(framed_words4)
aggr_frames5,size5 = aggregate_frames(framed_words5)
#aggr_frames6 = aggregate_frames(framed_words6)
aggr_frames

af = pd.concat([aggr_frames0,aggr_frames1,aggr_frames2,aggr_frames3,aggr_frames4,aggr_frames5])
af

Unnamed: 0,word,criminality,economisation,humanitarian,integration,reception,security,settlement,victimization,reason,honor,affection,trust,easiness
0,refugee,0.538906,0.868006,0.714931,,0.609365,0.684697,0.970225,0.552637,0.489588,0.200376,-0.148064,-0.2224,-0.73309
0,refugee,0.473278,0.846614,0.697133,,0.609725,0.540466,0.974121,0.543286,0.505104,0.167807,-0.151933,-0.237938,-0.596747
0,refugee,0.587951,0.862315,0.706727,,0.609358,0.623774,0.977574,0.538553,0.555089,0.087064,-0.137056,-0.290872,-0.469395
0,refugee,0.542529,0.931133,0.719745,,0.608951,0.722851,0.980443,0.566759,0.497186,-0.010324,-0.084682,-0.247231,-0.607755
0,refugee,0.512998,0.740236,0.694875,,0.609102,0.654367,0.935843,0.551111,0.475116,-0.012453,-0.070148,-0.184379,-0.590338
0,refugee,0.602713,0.908098,0.685298,,0.608965,1.0,0.987103,0.57802,0.516141,0.005731,-0.212431,-0.467955,-0.650921


In [347]:
size0

NameError: name 'size0' is not defined

In [338]:
aggr_frames.columns[2:]

Index(['criminality', 'economisation', 'humanitarian', 'integration',
       'reception', 'security', 'settlement', 'victimization', 'reason',
       'honor', 'affection', 'trust', 'easiness'],
      dtype='object')

In [None]:
fig = px.bar(data_canada, x='year', y='pop')
fig.show()

In [307]:
import plotly.express as px
from plotly.subplots import make_subplots

fig = make_subplots(specs=[[{"secondary_y": True}]])

frame = 'easiness'

ax1 = px.line(aggr_frames0, x="week", y=[frame],render_mode='webgl')
ax2 = px.line(aggr_frames1,x="week", y=[frame],render_mode='webgl')
ax3 = px.line(aggr_frames2,x="week", y=[frame],render_mode='webgl')
ax4 = px.line(aggr_frames3,x="week", y=[frame],render_mode='webgl')
ax5 = px.line(aggr_frames4,x="week", y=[frame],render_mode='webgl')
ax6 = px.line(aggr_frames5,x="week", y=[frame],render_mode='webgl')

#ax2 = px.line(frame_size, x="date", y=['size'],render_mode='webgl')

#ax2.update_traces(yaxis='y2')
fig.add_traces(ax1.data + ax2.data + ax3.data + ax3.data + ax5.data + ax6.data)

fig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

fig.show()

KeyError: 'variable'

In [339]:
import plotly.express as px
for frame in aggr_frames.columns[2:]:
    print(frame)
    try:
        fig = px.line(aggr_frames, x="week", y=[frame], title=f'Frame bias towards {the_word}')
        fig.show()
    except ValueError:
        pass

criminality


economisation


humanitarian


integration


reception


security


settlement


victimization


reason


honor


affection


trust


easiness


# TESTING:

In [None]:
# batching the tweets speeds the model considerably and is enabled by splitting sentences using '\n\n' 
from stanza_batch import batch
from nltk.tokenize import sent_tokenize

# the sampled_df series should be converted to list and sentences separated with "\n\n"
all_tweets_list = list(tweets_corpus)[:50] 
for tweet in range(len(all_tweets_list)):
    tweet_sentokenized = sent_tokenize(all_tweets_list[tweet])
    if tweet_sentokenized == []:
        tweet_sentokenized.append('empty_tweet')
        print(f'empty tweet at index {tweet}')
    all_tweets_list[tweet] = "\n\n".join(tweet_sentokenized)


#tag all tweets and save them in a list    
tagged_tweets = [] 
for tweet in tqdm(batch(all_tweets_list, en_nlp, batch_size=1000)): # Default batch size is 32
        tagged_tweets.append(tweet)

# the tweet text can now be accessed using .text method        
tagged_tweets[0].text

In [None]:


for tweet in tqdm(range(len(tweets_corpus))):
    print(tweets_corpus[tweet])
    np_heads = [[word.id, word.text,word.head,word.deprel] for sent in tagged_tweets[tweet].sentences for word in sent.words]
    print(*[f'id: {word.id}\tword: {word.text:<15}head id: {word.head:<5}head: {sent.words[word.head-1].text if word.head > 0 else "root":<10}deprel: {word.deprel}' for sent in tagged_tweets[tweet].sentences for word in sent.words], sep='\n')
    #print(np_heads)
    ph_ids = set([np_heads[i][2] for i in range(len(np_heads))])
    ph_words = [np_heads[i-1][1] for i in ph_ids]

    word_pairs = [(np_heads[word][1], np_heads[np_heads[word][2]-1][1]) for word in range(len(np_heads)) if np_heads[word][2] != 0]
    #print(word_pairs)
    
    compounds = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if 'compound' in np_heads[i][3]]
    print(compounds)
    
    advmods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='advmod']
    print(advmods)
    
    amods = [[np_heads[i][1]+'_'+np_heads[np_heads[i][2]-1][1]] for i in range(len(np_heads)) if np_heads[i][3]=='amod']
    print(amods)
    for pair in word_pairs:
        phrase = pair[0]+'_'+pair[1]

    #print(model.most_similar('illegal_immigrant'))

    
    """#print(len(cand_df['candidates']))
    candidate_list = cand_df['candidates']
    for cand in cand_df['candidates']:
        #print(cand[2])
        #print(get_head(str(cand)))
        for phrase_head in cand[2]:
            #print(phrase_head)
            #if str(cand[1]) in str(tweet):
            if str(phrase_head) in str(tweet) and len(phrase_head)>2:
                #print(phrase_head)
                ph_words = [np_heads[i-1][1] for i in phrase_heads]
                #print(ph_words)
                for related in range(len(np_heads)):
                    if phrase_head == np_heads[related][1]:
                        pass
                        #print(f'checking {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                  for frame_property in list(frame_properties.keys()):
                        for seed_word in frame_properties[frame_property]:
                        try:
                            for related in range(len(np_heads)):
                                #print(np_heads[related])
                                #if cand[1] == np_heads[related][1]:
                                if phrase_head == np_heads[related][1]:
                                    print(f'checking {seed_word} and {phrase_head}_{np_heads[np_heads[related][2]-1][1]}')
                                    #cand_frames[seed_word][phrase_head].append(model.similarity(seed_word,np_heads[np_heads[related][2]-1][1]))
                                #[cand_frames[seed_word][cand].append(model.similarity(seed_word,np_heads[np_heads[related][2]+1][1])) if cand == np_heads[related][1] else print('') for related in range(len(np_heads))]
                                except KeyError:
                                pass"""