# Frame identification

Two approaches: (1) factor analysis and (2) BERT embeddings combined with clustering (affinity propagation).
https://www.datacamp.com/community/tutorials/introduction-factor-analysis

In [2]:
import sentence_transformers

In [3]:
print(sentence_transformers.__version__)

2.0.0


In [100]:
import os

import pandas as pd

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import sent_tokenize

from time import time
from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt

In [217]:
# pickle_file and load_pickle are small helpers that store and load pickled files in the per-event folders on the drive
def pickle_file(file_name, file_to_dump):
    """
    Pickle an object into the candidate data folder of its event.

    The event folder is the part of file_name before the first underscore
    (e.g. 'greece_frame_embeddings' is written to the 'greece' folder). The
    target path is built relative to the current working directory.

    Input:
    - file_name (str): Name of the file to write, prefixed with the event name.
    - file_to_dump (object): The object that is pickled.
    """
    folder_name, _, _ = file_name.partition('_')
    base_path = os.getcwd() + "/../../../../"
    relative_path = fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}"
    with open(base_path + relative_path, 'wb') as target:
        pickle.dump(file_to_dump, target)

def load_pickle(file_name):
    """
    Load a pickled object from the candidate data folder of its event.

    The event folder is the part of file_name before the first underscore.
    The source path is built relative to the current working directory.

    Input:
    - file_name (str): Name of the pickled file, prefixed with the event name.

    Output:
    - The unpickled object.
    """
    folder_name, _, _ = file_name.partition('_')
    source_path = (os.getcwd() + "/../../../../"
                   + fr"Dropbox (CBS)/Master thesis data/Candidate Data/{folder_name}/{file_name}")
    # NOTE: unpickling can execute arbitrary code - only load files you created yourself
    with open(source_path, "rb") as stored:
        loaded = pickle.load(stored)
    return loaded

In [101]:
# Root of the thesis data folder, built relative to the current working directory.
# NOTE(review): the two joined pieces produce a double slash ("..//Dropbox (CBS)");
# the same form appears in the commented-out csv path further down, so it is
# presumably harmless - confirm on the target platform.
file_url = os.getcwd() + "/../../../../" + r"/Dropbox (CBS)/Master thesis data/"
# folder with the event dataframes and its sub-folder with the cleaned versions
event_url = file_url + r"Event Dataframes/"
event_url_clean = event_url + r"Clean/"

# folder with the pickled candidate (entity) data per event
candidate_url = file_url + r"Candidate Data/"

In [102]:
tigray_url_clean = event_url_clean + r"df_tigray_clean.csv" # location of clean Tigray dataset
greece_url_clean = event_url_clean + r"df_greece_clean.csv" # location of clean Greece dataset
rohingya_url_clean = event_url_clean + r"df_rohingya_clean.csv" # location of clean Rohingya dataset
channel_url_clean = event_url_clean +r"df_channel_clean.csv" # location of clean Channel dataset

In [123]:
tigray_url_fi = event_url_clean + r"df_tigray_fi.csv" # location of Tigray dataset for frame identification
greece_url_fi = event_url_clean + r"df_greece_fi.csv" # location of Greece dataset for frame identification
rohingya_url_fi = event_url_clean + r"df_rohingya_fi.csv" # location of Rohingya dataset for frame identification
channel_url_fi = event_url_clean +r"df_channel_fi.csv" #Location of Channel dataset for frame identification

In [103]:
tigray_candidate_url = candidate_url + r"tigray/tigray_ents"
greece_candidate_url = candidate_url + r"greece/greece_ents"
rohingya_candidate_url = candidate_url + r"rohingya/rohingya_ents"
channel_candidate_url = candidate_url + r"channel/channel_ents"

In [104]:
with open(greece_candidate_url,"rb") as input_file:
    ents = pickle.load(input_file)

In [105]:
def get_entity_list(url, min_freq=15):
    """
    Load a pickled entity table and return its frequent entities.

    Input:
    - url (str): Path to a pickled table (presumably a pandas dataframe) with
      the columns "entity" and "freq".
    - min_freq (int): Exclusive lower bound; only entities with
      freq > min_freq are kept. Defaults to 15, the previously hard-coded value.

    Output:
    - List with the entities whose frequency is above min_freq.
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files
    with open(url, "rb") as input_file:
        ents = pickle.load(input_file)

    # the file is only needed for loading, so filter after it has been closed
    frequent_ents = ents[ents["freq"] > min_freq]
    return list(frequent_ents["entity"])

In [106]:
tigray_ents = get_entity_list(tigray_candidate_url)
greece_ents = get_entity_list(greece_candidate_url)
rohingya_ents = get_entity_list(rohingya_candidate_url)
channel_ents = get_entity_list(channel_candidate_url)

In [107]:
# union of the frequent entities of all four events; remove_named_entities and
# remove_words test tokens against this set
ne_list = set(tigray_ents + greece_ents + rohingya_ents + channel_ents)

In [108]:
def read_event_df(data_url):
    """
    Load an event dataframe from csv and drop rows without a 'text_stm' value.

    The first csv column is read as index and then replaced by a fresh
    0..n-1 index. Rows are filtered afterwards, so the returned index can
    contain gaps.

    Input:
    - data_url (str): Location of the csv file.

    Output:
    - Dataframe with all rows whose 'text_stm' value is not missing.
    """
    loaded = pd.read_csv(data_url, index_col=0).reset_index(drop=True)
    has_text = loaded['text_stm'].notna()
    event_df = loaded[has_text]
    print(f'loaded {event_df.shape[0]} tweets!')
    return event_df

In [109]:
df_tigray = read_event_df(tigray_url_clean)
df_greece = read_event_df(greece_url_clean)
df_rohingya = read_event_df(rohingya_url_clean)
df_channel = read_event_df(channel_url_clean)

loaded 42843 tweets!
loaded 137418 tweets!
loaded 29423 tweets!
loaded 173615 tweets!


In [204]:
from collections import Counter
from nltk.tokenize import word_tokenize

# collect every token of the Channel tweets in one flat list
all_words = list()

for words in df_channel["text_stm"]:
    # split the tweet text into word tokens
    words_tok = word_tokenize(words)
    for word in words_tok:
        all_words.append(word)

In [205]:
from collections import Counter

# count how often each token occurs
counter = Counter(all_words)

# keep the most frequent 2.5% of the distinct tokens (the former bare
# counter.most_common() call sorted the whole vocabulary and discarded the result)
most_frequent_words = [word for word, _ in counter.most_common(int(len(counter)*0.025))]

In [206]:
len(most_frequent_words)

1245

In [190]:
def get_most_frequent_words(df_col, min_words=0.025):
    """
    Return the most frequent share of the distinct tokens of a tokenized column.

    Input:
    - df_col (iterable): Iterable of token lists, e.g. a dataframe column
      holding one list of tokens per tweet.
    - min_words (float): Share of the distinct tokens to return, e.g. 0.025
      returns the most frequent 2.5%. Defaults to 0.025 so that callers that
      only pass df_col keep working.

    Output:
    - List with the int(number of distinct tokens * min_words) most frequent
      tokens, ordered from most to least frequent.
    """
    counter = Counter()
    for words in df_col:
        # update() counts the tokens directly, no flat list of all tokens is needed
        counter.update(words)

    n_words = int(len(counter) * min_words)
    return [word for word, _ in counter.most_common(n_words)]

In [114]:
def tokenization(df_col):
    """
    Splits every text of a dataframe column into word tokens.

    Input:
    - df_col (dataframe column): Column with one string per row.

    Output:
    - Column with one list of tokens per row.
    """
    print("Tokenizing tweets...\n")
    tokenized = df_col.apply(word_tokenize)
    return tokenized

def remove_unfrequent_words(df_col, min_words=0.025):
    """
    Keeps only the tokens that belong to the most frequent words of the column.

    Input:
    - df_col (dataframe column): Column with one list of tokens per row.
    - min_words (float): Share of the distinct words that counts as frequent.
      Defaults to 0.025 so the function can still be handed to preprocessing,
      which calls every step with the column only.

    Output:
    - Column with one filtered list of tokens per row.
    """
    print("Removing unfrequent words...\n")
    # pass the share explicitly - get_most_frequent_words expects it as second argument
    most_frequent_words = get_most_frequent_words(df_col, min_words)
    print(f"(Removing words that are not among {len(most_frequent_words)} most frequent ones.)\n")
    # a set gives constant-time lookups; the list was scanned once per token
    words_to_keep = set(most_frequent_words)
    return df_col.apply(lambda x: [token for token in x if token in words_to_keep])

def remove_named_entities(df_col):
    """
    Drops every token that is contained in the global ne_list.

    Input:
    - df_col (dataframe column): Column with one list of tokens per row.

    Output:
    - Column with one filtered list of tokens per row.
    """
    print("Removing named entities...\n")

    def without_entities(tokens):
        return [token for token in tokens if token not in ne_list]

    return df_col.apply(without_entities)

def preprocessing(df_col, *steps):
    """
    Runs a text column through the given preprocessing steps, one after the
    other, and joins the resulting tokens of every row with single spaces.

    Input:
    - df_col (dataframe column): The column containing the text.
    - steps (functions): Any number of functions; each one receives the
      column returned by the previous step and returns a new column.

    Output:
    - Column with one space-joined string per row.
    """
    # work on a copy so the steps never touch the column that was passed in
    processed = df_col.copy()
    for step in steps:
        processed = step(processed)
    return processed.apply(" ".join)

In [193]:
def remove_words(df_col, min_words):
    """
    Reduces every row to its frequent, non-entity tokens and joins them.

    Input:
    - df_col (dataframe column): Column with one list of tokens per row.
    - min_words (float): Share of the distinct words that counts as frequent
      (handed on to get_most_frequent_words).

    Output:
    - Column with one space-joined string per row that only contains tokens
      which are among the most frequent words and not in the global ne_list.
    """
    most_frequent_words = get_most_frequent_words(df_col, min_words)
    print(f"(Removing words that are not among {len(most_frequent_words)} most frequent ones.)\n")

    # a set gives constant-time lookups; scanning a list for every token made
    # large vocabularies (e.g. min_words=0.1) impractically slow
    words_to_keep = {word for word in most_frequent_words if word not in ne_list}

    # filter and join in a single pass over the column
    return df_col.apply(lambda x: " ".join(token for token in x if token in words_to_keep))

In [188]:
df_greece["tok"] = tokenization(df_greece["text_stm"])

Tokenizing tweets...



In [196]:
df_greece["text_frame_identification_001"] = remove_words(df_greece["tok"],0.001)
df_greece["text_frame_identification_0025"] = remove_words(df_greece["tok"],0.025)
df_greece["text_frame_identification_005"] = remove_words(df_greece["tok"],0.05)
df_greece["text_frame_identification_01"] = remove_words(df_greece["tok"],0.1)

(Removing words that are not among 50 most frequent ones.)

(Removing words that are not among 1251 most frequent ones.)

(Removing words that are not among 2503 most frequent ones.)

(Removing words that are not among 5006 most frequent ones.)

(Removing words that are not among 25030 most frequent ones.)



KeyboardInterrupt: 

In [115]:
df_greece["text_frame_identification"] = preprocessing(df_greece["text_stm"],
                                                  tokenization,
                                                  remove_unfrequent_words,
                                                  remove_named_entities)

# get_most_frequent_words expects the share of distinct words as second argument;
# 0.025 yields the 1251 words reported for Greece
most_frequent_words = get_most_frequent_words(df_greece["text_stm"].apply(word_tokenize), 0.025)
words_to_cluster_greece = [word for word in most_frequent_words if word not in ne_list]

Tokenizing tweets...

Removing unfrequent words...

(Removing words that are not among 1251 most frequent ones.)

Removing named entities...



In [116]:
df_tigray["text_frame_identification"] = preprocessing(df_tigray["text_stm"],
                                                  tokenization,
                                                  remove_unfrequent_words,
                                                  remove_named_entities)

# get_most_frequent_words expects the share of distinct words as second argument;
# 0.025 (2.5%) is the share used elsewhere in this notebook
most_frequent_words = get_most_frequent_words(df_tigray["text_stm"].apply(word_tokenize), 0.025)
words_to_cluster_tigray = [word for word in most_frequent_words if word not in ne_list]

Tokenizing tweets...

Removing unfrequent words...

(Removing words that are not among 649 most frequent ones.)

Removing named entities...



In [117]:
df_channel["text_frame_identification"] = preprocessing(df_channel["text_stm"],
                                                  tokenization,
                                                  remove_unfrequent_words,
                                                  remove_named_entities)

# get_most_frequent_words expects the share of distinct words as second argument;
# 0.025 yields the 1245 words reported for Channel
most_frequent_words = get_most_frequent_words(df_channel["text_stm"].apply(word_tokenize), 0.025)
words_to_cluster_channel = [word for word in most_frequent_words if word not in ne_list]

Tokenizing tweets...

Removing unfrequent words...

(Removing words that are not among 1245 most frequent ones.)

Removing named entities...



In [118]:
df_rohingya["text_frame_identification"] = preprocessing(df_rohingya["text_stm"],
                                                  tokenization,
                                                  remove_unfrequent_words,
                                                  remove_named_entities)

# get_most_frequent_words expects the share of distinct words as second argument;
# 0.025 (2.5%) is the share used elsewhere in this notebook
most_frequent_words = get_most_frequent_words(df_rohingya["text_stm"].apply(word_tokenize), 0.025)
words_to_cluster_rohingya = [word for word in most_frequent_words if word not in ne_list]

Tokenizing tweets...

Removing unfrequent words...

(Removing words that are not among 486 most frequent ones.)

Removing named entities...



In [197]:
df_greece.columns

Index(['source', 'text', 'lang', 'id', 'created_at', 'author_id',
       'retweet_count', 'reply_count', 'like_count', 'quote_count',
       'withheld.scope', 'hashtags', 'mentions', 'annotations', 'text_clean',
       'year', 'calendar_week', 'year_month', 'year_calendar_week', 'refugee',
       'migrant', 'immigrant', 'asylum_seeker', 'other', 'date',
       'text_coherent', 'retweet_count_sum', 'count', 'text_alphanum',
       'text_stm', 'text_frame_identification', 'tok',
       'text_frame_identification_0025', 'text_frame_identification_001',
       'text_frame_identification_005', 'text_frame_identification_01'],
      dtype='object')

In [198]:
df_greece_test = df_greece[["text","text_coherent","text_frame_identification","date",'text_frame_identification_0025', 'text_frame_identification_001',
       'text_frame_identification_005', 'text_frame_identification_01']]

In [200]:
#df_greece_test.to_csv('C:\\Users\\jawo19ad\\Documents\\GitHub\\refugee_project\\Code/../../../..//Dropbox (CBS)/Master thesis data/Event Dataframes/Clean/df_greece_fi_test.csv')

In [121]:
# keep only the columns that are needed for frame identification
df_greece_frame, df_tigray_frame, df_rohingya_frame, df_channel_frame = (
    event_df[["text","text_coherent","text_frame_identification","date"]]
    for event_df in (df_greece, df_tigray, df_rohingya, df_channel)
)

# store one frame identification dataset per event
df_greece_frame.to_csv(greece_url_fi)
df_tigray_frame.to_csv(tigray_url_fi)
df_rohingya_frame.to_csv(rohingya_url_fi)
df_channel_frame.to_csv(channel_url_fi)

## Create Embeddings

In [None]:
vectorizer = TfidfVectorizer()

In [None]:
# the preprocessed text lives in "text_frame_identification"; df_greece has no
# column called "frame_identification"
embeddings = vectorizer.fit_transform(df_greece["text_frame_identification"])

In [None]:
embeddings_dense = embeddings.toarray()

In [150]:
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

In [None]:
# Bartlett’s test of sphericity (want to have p-value of 0)
chi_square_value, p_value=calculate_bartlett_sphericity(embeddings_dense)
chi_square_value, p_value

In [None]:
kmo_all,kmo_model=calculate_kmo(embeddings_dense)

In [None]:
kmo_model

In [None]:
sum(kmo_all > 0.5)

In [None]:
?FactorAnalyzer

In [None]:
fa = FactorAnalyzer(rotation="varimax")
fa.fit(embeddings_dense)
print("Fit finished")
# Check Eigenvalues
ev, v = fa.get_eigenvalues()
ev

In [None]:
# Create scree plot using matplotlib
plt.scatter(range(1,embeddings_dense.shape[1]+1),ev)
plt.plot(range(1,embeddings_dense.shape[1]+1),ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

In [None]:
?fa.fit

In [None]:
fa = FactorAnalyzer(9, rotation="varimax")
fa.fit(embeddings_dense)

In [None]:
fa.loadings_

In [None]:
# fa was fitted on the dense TF-IDF matrix, so every row of fa.loadings_ belongs
# to one vocabulary term in the vectorizer's column order. Label the rows with
# the vectorizer's own feature names; words_to_cluster is ordered by frequency
# and would attach the loadings to the wrong words.
factor_dict = dict(zip(vectorizer.get_feature_names(), fa.loadings_))

factor_df = pd.DataFrame.from_dict(factor_dict)
factor_df

In [None]:
factor_df["criminal"]

In [None]:
factor_df_transposed[0].sum()

In [None]:
factor_df_transposed = factor_df.T 
factor_df_transposed[factor_df_transposed[0]>0.4]

In [None]:
factor_df_transposed[factor_df_transposed[8]>0.3]

In [None]:
vector_dict =dict()
for word,vector in zip(words_to_cluster,embeddings):
    vector_dict[word] = vector
    
vector_df = pd.DataFrame.from_dict(vector_dict)

### Train Bert

In [50]:
def create_bert_embedding(df, words_to_cluster):
    """
    Encodes all tweet sentences of an event plus the words to cluster with a
    sentence-BERT model.

    Input:
    - df (dataframe): Event dataframe with the tweet text in 'text_alphanum'.
    - words_to_cluster (list): Words that are appended after the sentences.

    Output:
    - The embeddings returned by the model for the sentences followed by the
      words (the notebook later slices the word embeddings off from position
      len(sentences) onwards).
    """
    # split every tweet into sentences and collect them in one flat list
    tweet_sentences = []
    for tweet in df['text_alphanum']:
        tweet_sentences.extend(sent_tokenize(tweet))

    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    bert_corpus = tweet_sentences + words_to_cluster
    print(len(bert_corpus))

    started = time()
    document_embeddings = encoder.encode(bert_corpus)
    print(f'Training embeddings took {time()-started} seconds')

    return document_embeddings

In [52]:
greece_bert = create_bert_embedding(df_greece, words_to_cluster_greece)

357611
Training embeddings took 17201.172611951828 seconds


In [53]:
tigray_bert = create_bert_embedding(df_tigray, words_to_cluster_tigray)

108610
Training embeddings took 6313.1803567409515 seconds


In [54]:
rohingya_bert = create_bert_embedding(df_rohingya, words_to_cluster_rohingya)

68993
Training embeddings took 3697.816387653351 seconds


In [55]:
channel_bert = create_bert_embedding(df_channel, words_to_cluster_channel)

484502
Training embeddings took 23848.87373805046 seconds


In [218]:
pickle_file('greece_frame_embeddings', greece_bert)
pickle_file('tigray_frame_embeddings', tigray_bert)
pickle_file('rohingya_frame_embeddings', rohingya_bert)
pickle_file('channel_frame_embeddings', channel_bert)

In [219]:
pickle_file('greece_words_to_cluster', words_to_cluster_greece)
pickle_file('tigray_words_to_cluster', words_to_cluster_tigray)
pickle_file('rohingya_words_to_cluster', words_to_cluster_rohingya)
pickle_file('channel_words_to_cluster', words_to_cluster_channel)

In [None]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

bert_corpus = tweet_sentences + words_to_cluster

print(len(bert_corpus))
t0 = time()
document_embeddings = sbert_model.encode(bert_corpus)
print(f'Training embeddings took {time()-t0} seconds')

In [None]:
print(sentence_transformers.__version__)

In [78]:
words_embeddings = document_embeddings[len(tweet_sentences):]
len(words_embeddings)

NameError: name 'document_embeddings' is not defined

In [None]:
dict(zip(vectorizer.get_feature_names(), embeddings.toarray()[0]))

In [136]:
tweet_sentences_greece = [sent for tweet in df_greece['text_alphanum'] for sent in sent_tokenize(tweet)]

In [137]:
word_embeddings_greece = greece_bert[len(tweet_sentences_greece):]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = [word for word in vectorizer.get_feature_names() if word not in words_to_cluster]
vectorizer = TfidfVectorizer(stop_words = stopwords)

embeddings = vectorizer.fit_transform(unique_tweets_df['text_alphanum'])
print(vectorizer.get_feature_names())

In [None]:
from nltk.tokenize import sent_tokenize

tweet_sentences = [sent for tweet in unique_tweets_df['text_alphanum'] for sent in sent_tokenize(tweet)]
len(tweet_sentences)

In [None]:
from time import time
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

bert_corpus = tweet_sentences + words_to_cluster

print(len(bert_corpus))
t0 = time()
document_embeddings = sbert_model.encode(bert_corpus)
print(f'Training embeddings took {time()-t0} seconds')

In [None]:
words_embeddings = document_embeddings[len(tweet_sentences):]
len(words_embeddings)

In [None]:
dict(zip(vectorizer.get_feature_names(), embeddings.toarray()[0]))

## Factor analysis

In [None]:
words_embeddings = load_pickle('greece_bert_embeddings')

In [138]:
words_embeddings = word_embeddings_greece

In [148]:
vector_df.isna().sum()

0

In [140]:
# map every word to its embedding; the dataframe gets one column per word
vector_dict = dict(zip(words_to_cluster_greece, words_embeddings))
vector_df = pd.DataFrame.from_dict(vector_dict)

In [151]:
chi_square_value, p_value=calculate_bartlett_sphericity(vector_df)
chi_square_value, p_value

  statistic = -np.log(corr_det) * (n - 1 - (2 * p + 5) / 6)
  return sc.xlogy(df/2.-1, x) - x/2. - sc.gammaln(df/2.) - (np.log(2)*df)/2.


(inf, nan)

In [141]:
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all,kmo_model=calculate_kmo(vector_df)
kmo_model

nan

In [142]:
kmo_all

array([nan, nan, nan, ..., nan, nan, nan])

# !

In [152]:
from factor_analyzer import FactorAnalyzer
# Create factor analysis object and perform factor analysis
fa = FactorAnalyzer()
fa.fit(vector_df)
eigen_values, vectors = fa.get_eigenvalues()
ev, v = fa.get_eigenvalues()
ev

array([ 6.98170822e+02,  6.15643496e+01,  2.74392388e+01, ...,
       -6.80665489e-15, -7.23782721e-15, -1.31274389e-14])

In [156]:
fa = FactorAnalyzer(8,rotation='varimax')
fa.fit(vector_df)
fa.loadings_

array([[ 0.5591498 ,  0.26232961,  0.1421388 , ...,  0.52135674,
         0.1564291 , -0.08579294],
       [ 0.80202759,  0.2402963 ,  0.42277575, ...,  0.10855167,
         0.14933181,  0.00148477],
       [ 0.45446674,  0.75851588,  0.1387505 , ...,  0.17084086,
         0.00321452,  0.17205969],
       ...,
       [ 0.26801312,  0.4440313 ,  0.03012718, ...,  0.13391058,
         0.17683644, -0.04121899],
       [ 0.65853832,  0.3494775 ,  0.23577034, ...,  0.09016037,
         0.05169822, -0.01761102],
       [-0.01549855,  0.60395185,  0.2936716 , ...,  0.23852288,
        -0.16654519,  0.03617725]])

In [157]:
# fa was fitted on vector_df, so every row of fa.loadings_ belongs to one column
# (word) of vector_df. Take the labels from there; words_to_cluster is a
# different variable than the words_to_cluster_greece list vector_df was built
# from, and zip() would silently mislabel or truncate on a mismatch.
factor_dict = dict(zip(vector_df.columns, fa.loadings_))

factor_df = pd.DataFrame.from_dict(factor_dict)
factor_df

Unnamed: 0,million,right,crisis,stop,help,camp,child,year,back,illegal,...,funded,leverage,vehicle,weaponized,nine,naked,secure,alien,vessel,homeless
0,0.55915,0.802028,0.454467,0.467937,0.696474,0.428724,0.491324,0.56796,0.750028,0.189122,...,0.642741,0.566048,0.533897,0.480772,0.55638,0.089049,0.424473,0.268013,0.658538,-0.015499
1,0.26233,0.240296,0.758516,0.678734,0.310521,0.164318,0.286453,0.203065,0.441203,0.788052,...,0.26276,0.480219,0.319887,0.464563,0.261044,0.587798,0.128603,0.444031,0.349477,0.603952
2,0.142139,0.422776,0.138751,0.087723,0.414351,0.225711,0.30222,0.172113,0.153584,0.274067,...,0.525228,0.346378,0.236265,0.290491,0.242364,0.206138,0.764889,0.030127,0.23577,0.293672
3,0.069198,-0.052656,0.16872,0.083585,0.260511,0.472473,0.369387,0.206922,0.133085,0.063767,...,0.172444,0.295814,0.491304,0.322113,0.078327,-0.024997,0.162073,0.170827,0.318039,0.304332
4,0.095338,0.126932,0.067339,0.414472,0.019238,0.135389,0.062252,0.128646,0.2033,0.299587,...,-0.048517,-0.094704,0.089619,0.049355,0.266255,0.6147,0.171752,0.246698,0.080963,0.083757
5,0.521357,0.108552,0.170841,0.084425,0.001089,0.199444,0.027368,0.52576,0.230387,0.050137,...,0.227349,0.052107,0.029504,0.028232,0.367207,0.040128,0.020652,0.133911,0.09016,0.238523
6,0.156429,0.149332,0.003215,-0.01361,0.021649,-0.128033,0.090705,-0.035445,0.049393,-0.015276,...,0.07077,0.148605,0.219462,0.274342,0.112562,0.057057,0.180282,0.176836,0.051698,-0.166545
7,-0.085793,0.001485,0.17206,-0.127447,0.109134,-0.028697,0.183926,0.091577,0.016447,0.12388,...,0.010893,0.215381,0.072823,0.150118,0.047483,-0.140126,0.061309,-0.041219,-0.017611,0.036177


In [168]:
factor_df_transposed = factor_df.T 
factor_df_transposed[factor_df_transposed[0]>0.6]

Unnamed: 0,0,1,2,3,4,5,6,7
right,0.802028,0.240296,0.422776,-0.052656,0.126932,0.108552,0.149332,0.001485
help,0.696474,0.310521,0.414351,0.260511,0.019238,0.001089,0.021649,0.109134
back,0.750028,0.441203,0.153584,0.133085,0.203300,0.230387,0.049393,0.016447
open,0.744733,0.195558,0.384544,0.104071,0.117539,0.029671,0.262515,-0.139846
news,0.613226,0.363217,0.261500,0.173573,-0.027101,0.243353,0.040545,-0.013139
...,...,...,...,...,...,...,...,...
couple,0.611650,0.216561,0.399804,0.270981,0.043119,0.145314,-0.047949,0.045120
gather,0.787129,0.264692,0.233996,0.273640,0.108069,0.098242,0.100004,-0.055289
grow,0.684127,0.225808,0.238009,0.271731,0.086286,0.230837,0.218085,-0.006097
funded,0.642741,0.262760,0.525228,0.172444,-0.048517,0.227349,0.070770,0.010893


## Clustering frame modeling

In [None]:
from time import time
import umap.umap_ as umap
start = time()
reducer = umap.UMAP(random_state=42,n_components=3)
reduced_embedding = reducer.fit_transform(words_embeddings)
print(f'Duration: {time() - start} seconds')

In [None]:
from sklearn.cluster import DBSCAN
start = time()
cluster_labels = DBSCAN(min_samples=6).fit_predict(reduced_embedding)
print(f'Duration: {time() - start} seconds')


In [None]:
from sklearn.cluster import AffinityPropagation

start = time()
cluster_labels = AffinityPropagation().fit_predict(reduced_embedding)
print(f'Duration: {time() - start} seconds')

In [None]:
from sklearn.cluster import KMeans
k_clusters = KMeans(n_clusters=8, random_state=42).fit_predict(reduced_embedding)

In [None]:
import seaborn

seaborn.scatterplot(x = reducer.embedding_[:, 0],
                y = reducer.embedding_[:, 1],
                hue = cluster_labels, palette ="Paired")

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')

x = reducer.embedding_[:, 0]
y = reducer.embedding_[:, 1]
z = reducer.embedding_[:, 2]

ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")

ax.scatter(x, y, z, c = cluster_labels)

plt.show()

In [None]:
labeled_tweets = pd.DataFrame({'word': words_to_cluster,'label':cluster_labels})

# Create documents per label
docs_per_class = labeled_tweets.groupby(['label'], as_index=False).agg({'word': ' '.join})

words_per_class = dict()
for label,word in zip(docs_per_class['label'],docs_per_class['word']):
    words_per_class[label] = word.split(' ')


In [None]:
# one column per cluster label; shorter clusters are padded with '.'
pd.DataFrame({label: pd.Series(words) for label, words in words_per_class.items()}).fillna('.')

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans, DBSCAN
def kmean_test_n_clusters(data, n_clusters):
    """
    Fits KMeans for every amount of clusters from 1 up to n_clusters and plots
    the inertia and the silhouette score as functions of the amount of clusters.

    The silhouette score is computed from two clusters onwards; the model with
    a single cluster is skipped for it.

    Arguments:
    data -- document vectors as numpy matrices
    n_clusters -- integer that determines the maximum amount of clusters to test

    Returns:
    Nothing; prints progress messages and shows the two plots.
    """
    # NOTE(review): tqdm is not imported in the visible part of this notebook -
    # confirm it is in scope before running
    upper_bound = n_clusters + 1
    cluster_counts = range(1, upper_bound)

    # one fitted model per amount of clusters
    kmeans_per_k = []
    for k in tqdm(cluster_counts):
        kmeans_per_k.append(KMeans(n_clusters=k, random_state=42).fit(data))
    print("clusters done")

    inertias = [fitted.inertia_ for fitted in kmeans_per_k]
    print("inertias done")

    # the first model (k=1) is left out of the silhouette computation
    silhouette_scores = []
    for fitted in tqdm(kmeans_per_k[1:]):
        silhouette_scores.append(silhouette_score(data, fitted.labels_))
    print("silhouettes done")

    fig, (inertia_ax, silhouette_ax) = plt.subplots(2,1, figsize=(8, 3.5))

    inertia_ax.plot(cluster_counts, inertias, "bo-")
    inertia_ax.set_xlabel("$k$", fontsize=14)
    inertia_ax.set_ylabel("Inertia", fontsize=14)

    # silhouette scores start at k=2
    silhouette_ax.plot(range(2, upper_bound), silhouette_scores, "bo-")
    silhouette_ax.set_xlabel("$k$", fontsize=14)
    silhouette_ax.set_ylabel("Silhouette score", fontsize=14)
    plt.show()

In [None]:
kmean_test_n_clusters(reduced_embedding, 30)

In [None]:
# train this model only after the first merging step to save both memory and time
# hand-picked, alphabetically ordered list of words to cluster
words_to_cluster = [
    "accept", "ally", "army", "attack", "attacking", "authority", "benefit",
    "billion", "blackmail", "block", "boat", "bomb", "bombing", "border",
    "break", "build", "burden", "camp", "care", "child", "citizen",
    "city", "civil", "civilian", "clash", "closed", "coast", "community",
    "conflict", "control", "creating", "crime", "criminal", "cross", "crossing",
    "dead", "death", "defend", "desperate", "dictator", "displaced", "door",
    "economic", "economy", "entering", "entry", "family", "fear", "fence",
    "fight", "fighting", "fire", "fled", "flee", "fleeing", "flood",
    "flow", "food", "force", "forced", "foreign", "friend", "game",
    "gate", "government", "guard", "health", "help", "helping", "history",
    "hold", "hope", "host", "hosting", "house", "human", "humanitarian",
    "humanity", "hundred", "illegal", "illegally", "influx", "innocent", "invade",
    "invader", "invading", "invasion", "islamic", "jihadist", "kid", "kill",
    "killed", "killing", "leaving", "legal", "march", "mass", "military",
    "million", "minister", "money", "movement", "national", "number", "official",
    "opening", "order", "peace", "picture", "police", "policy", "population",
    "power", "pressure", "prevent", "protect", "protection", "pushing", "refuge",
    "regime", "region", "respect", "responsibility", "return", "risk", "rule",
    "safety", "school", "security", "shelter", "shooting", "shot", "social",
    "soldier", "solidarity", "solution", "suffering", "support", "supporting", "tension",
    "territory", "terrorist", "thousand", "threat", "travel", "troop", "violence",
    "war", "wave", "weapon", "woman", "work", "worker", "working",
    "zone",
]

print(unique_tweets_df['text_alphanum'].shape)
len(words_to_cluster)