# Massive change testing

In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
import hdbscan
import umap
import re
from gensim.test.utils import datapath
import gensim.downloader as api
from sklearn.ensemble import RandomForestRegressor
from topic_extractor import TopicExtractor
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
wv = api.load('word2vec-google-news-300')

In [3]:
vul_list = ['temperature', 'weather', 'soil', 'employment', 'vaccine', 'education', 'jobs', 'income', 'discrimination', 
'racism', 'ethnicity', 'minorities', 'orientation', 'immigrants', 'disabled', 'women', 'attitudes', 'imports', 'production', 'irrigation',
'economic', 'rain', 'water']
acc_list = ['price', 'cost', 'expensive', 'gdp', 'food', 'distance', 'rural']
sho_list = ['earthquake', 'disaster', 'conflict', 'war', 'politics', 'social', 'storms', 'volcanoes']
ava_list = ['enough', 'aid', 'share', 'amount', 'donations', 'grants', 'market']

vul_df = pd.DataFrame({'pillar': 'vulnerability', 'target_words': vul_list})
acc_df = pd.DataFrame({'pillar': 'access', 'target_words': acc_list})
sho_df = pd.DataFrame({'pillar': 'shocks', 'target_words': sho_list})
ava_df = pd.DataFrame({'pillar': 'availability', 'target_words': ava_list})

pdList = [vul_df, acc_df, sho_df, ava_df]
nom_cluster_words = pd.concat(pdList)
nom_cluster_words['word'] = [np.array(wv.most_similar(row , topn = 10)).T[0] for row in nom_cluster_words['target_words']]

nom_cluster_words = nom_cluster_words.explode('word').reset_index(drop = True)

In [4]:
raw_data = pd.read_csv('Iraq Qual Analyses.csv').loc[:,:'P4.2']
raw_data['text'] = [row for row in raw_data[raw_data.columns[1:]].values]
raw_data = raw_data[10:14]
raw_data.drop(raw_data.columns[1:-1],axis = 1,inplace=True)
raw_data = raw_data.explode('text').dropna()
raw_data

Unnamed: 0,Prompt,text
10,2.4\tWhat would you say are the main reasons w...,"Having no income, when those families do not h..."
10,2.4\tWhat would you say are the main reasons w...,"As I said the poor people, they are poor becau..."
10,2.4\tWhat would you say are the main reasons w...,Economic state i think; Sometimes social probl...
10,2.4\tWhat would you say are the main reasons w...,"I think the leading cause is that, the governm..."
10,2.4\tWhat would you say are the main reasons w...,? I‘d say this goes back the city not having j...
...,...,...
13,2.5 Cause 3:,Having an illness
13,2.5 Cause 3:,Cause 3: Social reasons\nCause 4: Educational ...
13,2.5 Cause 3:,And lastly not having a job
13,2.5 Cause 3:,Cause 3: Not having an active economy\nCause 4...


In [5]:
np.linalg.norm(wv['opportunities'])**2

8.584315002489348

In [6]:
word_list = raw_data.text.str.split(' ').explode()
word_list = word_list.str.replace(',','').str.replace('.','').str.lower()
stop_words = [("don't",.34),('reasons',.34),('foods',.6),('unable',.3),('citizens',.4),('my',.3),('hunger',.3),('cause',.2),('factories',.3),('leads',.3),('expired',.3),('living',.4),('low',.2)]
for stop_word, similarity in stop_words:
    word_list = word_list[[wv.similarity(word.lower(), stop_word)< similarity if word in wv else True for word in word_list]]
print('children' in word_list.to_list())

  word_list = word_list.str.replace(',','').str.replace('.','').str.lower()


True


In [7]:
topic_extractorer = TopicExtractor(wv,0)

In [8]:
topic_extractorer.load_seed_clusters(seed_words=nom_cluster_words['word'],known_labels=nom_cluster_words['pillar'],threshold=7)
topic_extractorer.load_words(word_list,6)

In [9]:
topic_extractorer.view_clusters()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['pillar'] = temp.pillar.replace(np.NaN,'None')


In [10]:
raw_data = pd.read_csv('Iraq Qual Analyses.csv').loc[:,:'P4.2']
responses = raw_data.C2.str.replace(',','').str.replace('.','')
print(responses[10:14])
word_list = responses[10:14].str.split(" ").explode()

10    As I said the poor people they are poor becaus...
11                                         Unemployment
12                                             Poverty 
13                       Neglecting Agriculture fields 
Name: C2, dtype: object



The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [11]:
sample = topic_extractorer.pre_trained.word.sample(200)
sims = np.array([wv.similarity(word1,word2) for i,word1 in enumerate(sample[:-1]) for word2 in sample[i+1:]])
sims.sort()
median_sim = sims[sims.shape[0]//2]

In [12]:
raw_data.columns

Index(['Prompt', 'C1', 'C2', 'C3', 'P5.2', 'P3.2', 'P1', 'C7', 'C8', 'C9',
       'C1.2', 'C2.2', 'C3.2', 'C4', 'C4.2', 'C5', '5.2', 'C6', 'C6.2', 'C7.2',
       'C8.2', 'C9.2', 'C10', 'C10.2', 'P2', 'P3', 'P4', 'P5', 'P1.2', 'P2.2',
       'P4.2'],
      dtype='object')

In [13]:
mental_models = {}
for participant in raw_data.columns[1:]:
    responses = raw_data[participant].str.replace(',','').str.replace('.','').str.lower()
    word_list = responses[10:14].str.split(" ").explode()
    stop_words = [("don't",.34),('reasons',.34),('foods',.6),('unable',.3),('citizens',.4),('my',.3),('hunger',.3),('cause',.2),('factories',.3),('leads',.3),('expired',.3),('living',.4),('low',.2)]
    for stop_word, similarity in stop_words:
        word_list = word_list[[wv.similarity(word.lower(), stop_word)< similarity if word in wv else True for word in word_list]]
    
    overall_clusters = topic_extractorer.get_current_vecs()
    overall_clusters = overall_clusters[overall_clusters.pillar.isna()]
    mental_model = []
    for word in word_list:
        if word in overall_clusters.word.to_list():
            mental_model.append(overall_clusters[overall_clusters.word == word].iloc[0])
    mental_model = pd.DataFrame(mental_model)
    
    coalescence = []
    for label in sorted(mental_model.labels.unique()):
        words = mental_model[mental_model.labels == label].word.to_numpy()
        location = mental_model[mental_model.labels == label].loc[:,"v0":"v299"].mean(axis=0).to_numpy()
        tally = 0
        num_entries = 0
        for i,word1 in enumerate(words[:-1]):
            for word2 in words[i+1:]:
                #print(word1,word2)
                tally += wv.similarity(word1,word2) - median_sim # adjusting for the median similarity between randomly picked words
                num_entries += 1
        if num_entries == 0: num_entries += 1
        coalescence.append({'cluster':label,'strength':tally/num_entries*len(words), 'examples': list(set(words))[:3],'location':location})
    try:
        coalescence_df = pd.DataFrame(coalescence).sort_values(by='strength',ascending=False)
        print(f"{participant}'s mental model:")
        print(coalescence_df)
        mental_models[participant] = coalescence_df
    except Exception:
        print(f"{participant}'s mental model was not detectable")


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



C1's mental model:
   cluster  strength    examples  \
0        9  4.398752  [children]   

                                            location  
0  [0.012939453, 0.0016098022, -0.04321289, 0.177...  
C2's mental model:
   cluster  strength                    examples  \
1        4  0.000000                     [money]   
2       18  0.000000               [agriculture]   
3       27  0.000000              [unemployment]   
0       -1 -0.093495  [grow, fields, neglecting]   

                                            location  
1  [0.15820312, 0.05126953, 0.06640625, 0.2109375...  
2  [-0.034179688, -0.021728516, -0.04296875, -0.0...  
3  [0.3984375, -0.045166016, -0.265625, 0.1982421...  
0  [0.0133463545, 0.23860677, 0.060221355, -0.062...  
C3's mental model:
   cluster  strength    examples  \
1       32  1.759501  [economic]   
0       31  0.000000    [social]   

                                            location  
1  [0.051757812, 0.003753662, -0.125, 0.032226562...  
0  [0


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to Fa

C1.2's mental model:
   cluster  strength                         examples  \
1        0  2.639251                    [opportunity]   
3       25  2.639251                            [job]   
2       18  2.384640      [agricultural, agriculture]   
5       32  1.759501                       [economic]   
0       -1  0.191559  [preparing, economics, sectors]   
4       29  0.000000                         [market]   
6       33  0.000000                      [financial]   

                                            location  
1  [-0.025634766, 0.20410156, 0.044189453, -0.007...  
3  [-0.014587402, -0.048339844, -0.13671875, -0.1...  
2  [-0.060872395, 0.06437174, -0.025227865, 0.007...  
5  [0.051757812, 0.003753662, -0.125, 0.032226562...  
0  [-0.0045700073, 0.09932709, 0.11424255, 0.0610...  
4  [-0.15625, -0.087890625, -0.22949219, -0.23144...  
6  [-0.09277344, -0.024414062, -0.14550781, -0.01...  
C2.2's mental model:
   cluster  strength                   examples  \
3       33


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.




C4.2's mental model:
   cluster  strength               examples  \
1       31  0.000000               [social]   
0       -1 -0.001679  [cultivation, aspect]   

                                            location  
1  [0.099121094, -0.09765625, -0.123535156, 0.163...  
0  [0.22607422, 0.1387024, 0.028686523, 0.0827331...  
C5's mental model:
   cluster  strength                     examples  \
1        0  1.759501              [opportunities]   
3       25  1.759501                        [job]   
2        7  0.000000               [geographical]   
0       -1 -0.437343  [himself, border, location]   

                                            location  
1  [-0.24414062, 0.27734375, -0.079589844, -0.073...  
3  [-0.014587402, -0.048339844, -0.13671875, -0.1...  
2  [0.022583008, 0.07861328, 0.015258789, 0.15039...  
0  [0.013224284, 0.08122762, 0.061604816, 0.05407...  
5.2's mental model:
   cluster  strength examples  \
0       25  2.639251    [job]   

                        


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to Fa

C7.2's mental model:
   cluster  strength                      examples  \
4       30  1.759501                   [political]   
3       25  1.759501                         [job]   
1        0  0.000000               [opportunities]   
2       20  0.000000                    [disaster]   
0       -1 -0.396100  [provider, manpower, active]   

                                            location  
4  [-0.028686523, 0.029296875, -0.0625, 0.3535156...  
3  [-0.014587402, -0.048339844, -0.13671875, -0.1...  
1  [-0.24414062, 0.27734375, -0.079589844, -0.073...  
2  [0.23339844, -0.0006713867, -0.050048828, 0.18...  
0  [-0.06995985, 0.06215922, 0.015842438, 0.06583...  
C8.2's mental model:
   cluster  strength     examples  \
2       26  1.759501    [jobless]   
0       -1  0.000000     [closed]   
1        7  0.000000  [geography]   

                                            location  
2  [0.49609375, 0.026000977, -0.104003906, 0.2158...  
0  [-0.041015625, 0.016723633, 0.21484375, 0


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to Fa

P3's mental model:
   cluster  strength                           examples  \
0       -1  0.307454  [unavailability, response, items]   
1       10  0.000000                           [salary]   
2       29  0.000000                          [markets]   

                                            location  
0  [-0.04616928, 0.2263794, -0.121520996, 0.08703...  
1  [0.3984375, -0.056152344, -0.16894531, 0.18359...  
2  [-0.052001953, -0.08544922, -0.13574219, 0.106...  
P4's mental model:
   cluster  strength      examples  \
0        6  1.759501  [disability]   
1       32  1.759501    [economic]   

                                            location  
0  [0.19726562, -0.1484375, -0.15820312, 0.1875, ...  
1  [0.051757812, 0.003753662, -0.125, 0.032226562...  
P5's mental model:
   cluster  strength         examples  \
0        0  1.759501  [opportunities]   
1       25  1.759501            [job]   

                                            location  
0  [-0.24414062, 0.27734375


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



P4.2's mental model:
   cluster  strength                      examples  \
5       27  5.278502                [unemployment]   
4       10  4.055685            [salaries, salary]   
2        4  1.759501                       [money]   
1        2  0.000000                     [receive]   
3        8  0.000000                       [birds]   
0       -1 -0.143925  [society, skills, seriously]   

                                            location  
5  [0.3984375, -0.045166016, -0.265625, 0.1982421...  
4  [0.24140625, 0.009472656, -0.020996094, 0.2265...  
2  [0.15820312, 0.05126953, 0.06640625, 0.2109375...  
1  [0.052001953, -0.22167969, -0.21191406, 0.0996...  
3  [0.07324219, 0.18261719, -0.33984375, -0.02404...  
0  [-0.05913086, 0.08886719, -0.105773926, 0.0845...  


In [14]:
mm2 = mental_models['P5.2'].query('cluster>-1')
mm1 = mental_models['C2'].query('cluster>-1')

def compare_mental_models(mm1,mm2):
    mm1_filtered = mm1.query('cluster>-1')
    mm2_filtered = mm2.query('cluster>-1')
    if mm1_filtered.empty or mm2_filtered.empty:
        return 0
    return np.mean([max([np.dot(row1.location,row2.location)/np.linalg.norm(row1.location)/np.linalg.norm(row2.location) for _,row2 in mm2_filtered.iterrows()]) for _,row1 in mm1_filtered.iterrows()])

print(compare_mental_models(mm1,mm2))
[[(f"{row1.examples} -> {row2.examples}",np.dot(row1.location,row2.location)/np.linalg.norm(row1.location)/np.linalg.norm(row2.location)) for _,row2 in mm2.iterrows()] for _,row1 in mm1.iterrows()]


0.7698261


[[("['money'] -> ['crop', 'crops', 'wheat']", 0.13513015),
  ("['money'] -> ['received', 'receive']", 0.0568099),
  ("['money'] -> ['chickens', 'ducks', 'geese']", 0.15346186),
  ("['money'] -> ['experience']", 0.11808307),
  ("['money'] -> ['care']", 0.18036431),
  ("['money'] -> ['buy']", 0.31760776),
  ("['money'] -> ['rain']", 0.11101911),
  ("['money'] -> ['financial']", 0.27252272),
  ("['money'] -> ['agriculture']", 0.110064715),
  ("['money'] -> ['medications']", 0.17193604),
  ("['money'] -> ['money']", 1.0),
  ("['money'] -> ['water']", 0.24552587),
  ("['money'] -> ['job']", 0.24213025)],
 [("['agriculture'] -> ['crop', 'crops', 'wheat']", 0.5422642),
  ("['agriculture'] -> ['received', 'receive']", 0.044078436),
  ("['agriculture'] -> ['chickens', 'ducks', 'geese']", 0.20737456),
  ("['agriculture'] -> ['experience']", 0.0825472),
  ("['agriculture'] -> ['care']", 0.120561525),
  ("['agriculture'] -> ['buy']", -0.025091732),
  ("['agriculture'] -> ['rain']", 0.13316438),
  

In [15]:
results = []
for name1, mm1 in mental_models.items():
    row = {'name':name1}
    for name2, mm2 in mental_models.items():
        row[name2] = min(compare_mental_models(mm1, mm2),compare_mental_models(mm2, mm1))
    results.append(row)
    
similarity_df = pd.DataFrame(results).set_index('name')

In [16]:
name_set = set(pd.DataFrame(results).name)
groups = []
for name1, row in similarity_df.iterrows():
    if name1 in name_set:
        group = {name1}
        for name2 in row.index:
            if row[name2]>.5 and name1 != name2 and name2 in name_set:
                print(f'{name1} and {name2} agree')
                group.add(name2)
                name_set.remove(name2)
        name_set.remove(name1)
        groups.append(group)
groups

C2 and C2.2 agree
C2 and P4.2 agree
C3 and C4.2 agree
C3 and C10.2 agree
C3 and P4 agree
P3.2 and C1.2 agree
P3.2 and C3.2 agree
P3.2 and C4 agree
P3.2 and C5 agree
P3.2 and C6.2 agree
P3.2 and C10 agree
C7 and C8 agree
C7 and C9 agree
C7 and 5.2 agree
C7 and C6 agree
C7 and C9.2 agree
C7 and P2 agree
C7 and P5 agree
C7 and P2.2 agree
C7.2 and P1.2 agree


[{'C1'},
 {'C2', 'C2.2', 'P4.2'},
 {'C10.2', 'C3', 'C4.2', 'P4'},
 {'P5.2'},
 {'C1.2', 'C10', 'C3.2', 'C4', 'C5', 'C6.2', 'P3.2'},
 {'P1'},
 {'5.2', 'C6', 'C7', 'C8', 'C9', 'C9.2', 'P2', 'P2.2', 'P5'},
 {'C7.2', 'P1.2'},
 {'C8.2'},
 {'P3'}]

In [17]:
for i, group in enumerate(groups):
    print(f'group {i}:')
    for item in group:
        if len(mental_models[item].query('cluster>-1').index)>0:
            print(f'\t {item}')
            print(mental_models[item].query('cluster>-1'))

group 0:
	 C1
   cluster  strength    examples  \
0        9  4.398752  [children]   

                                            location  
0  [0.012939453, 0.0016098022, -0.04321289, 0.177...  
group 1:
	 C2.2
   cluster  strength       examples  \
3       33  3.519001    [financial]   
1       12  0.000000       [crisis]   
2       18  0.000000  [agriculture]   

                                            location  
3  [-0.09277344, -0.024414062, -0.14550781, -0.01...  
1  [0.11669922, 0.03466797, -0.11816406, 0.4375, ...  
2  [-0.034179688, -0.021728516, -0.04296875, -0.0...  
	 P4.2
   cluster  strength            examples  \
5       27  5.278502      [unemployment]   
4       10  4.055685  [salaries, salary]   
2        4  1.759501             [money]   
1        2  0.000000           [receive]   
3        8  0.000000             [birds]   

                                            location  
5  [0.3984375, -0.045166016, -0.265625, 0.1982421...  
4  [0.24140625, 0.009472656,

In [18]:
compare_mental_models(mental_models['C1.2'],mental_models['C10'])

0.41913477

In [19]:
qual_codes = pd.read_csv('Unprompted Causes Codes.csv').loc[:,'C1':'P4.2']
topicsLU = {}
for participant in qual_codes.columns:
    topic_list = qual_codes[participant].dropna().to_list()
    topicsLU[participant] = topic_list
unique_topics = set()
for topics in topicsLU.values():
    for topic in topics:
        unique_topics.add(topic.strip().lower().replace('.','').replace('isolatin','isolation').replace('isolation','').replace('geogrpahic','geographic').
        replace('unemployement','unemployment').replace('/poverty','').replace('neglecting ','').replace('high cost of food','cost').replace(' conditions',''))
unique_topics

{'agriculture',
 'cost',
 'economic',
 'family planning',
 'food availability',
 'food management',
 'geographic ',
 'government',
 'health',
 'natural causes',
 'personal character',
 'poverty',
 'social reasons',
 'unemployment',
 'unfair society'}

In [20]:
unprompted_clusters = topic_extractorer.get_current_vecs()[topic_extractorer.get_current_vecs().pillar.isna()]
# for label in sorted(unprompted_clusters.labels.unique()):
#     words = unprompted_clusters.query(f'labels=={label}').word.unique()[:3]
#     topic_location = unprompted_clusters.loc[:,"v0":"v299"].mean(axis=0).to_numpy()
#     for topic in unique_topics:
        

In [21]:
latent_vars_map = {
    'protein supply':['meat','animals','chicken','beef','pork','poultry','cow','pig','protein','duck'],
    'energy supply' :['corn','wheat','barley','grain','oats','harvest','crops','agriculture'],
    'food supply': ['meat','animals','chicken','beef','pork','poultry','cow','pig','duck','corn','wheat','barley','grain','oats','harvest','crops','agriculture'],
    'political stability':['riots', 'protests', 'war', 'crisis', 'instability', 'unrest','disruption','coup','assasination','politics','government'],
    'armed conflict': ['war','riots','militia','terrorist','conflict','armed'],
    'floods': ['flood','floods','rain','weather'],
    'droughts':['dry','drought','rain','droughts','arid'],
    'earthquakes':['earthquakes','tremors'],
    'cyclones': ['hurricane','typhoon','cyclone'],
    'disease' : ['illness','sick','flu','vaccine','malaria','aids','disease','medications','meds','medicine'],
    'income': ['income','job','joblessness','unemployment','jobs','money','wage'],
    'prices': ['prices','expensive','cost','costly'],
    'climate' :['rain','weather','season','dry','arid'],
    'inclusivity' : ['disabled','illness','disability'],
}

In [22]:
unprompted_clusters.query('labels > -1')

Unnamed: 0,word,pillar,v0,v1,v2,v3,v4,v5,v6,v7,...,v292,v293,v294,v295,v296,v297,v298,v299,mag,labels
323,money,,0.158203,0.051270,0.066406,0.210938,0.035156,-0.004669,-0.004456,-0.082031,...,-0.060303,-0.125977,0.062988,0.049072,0.124023,-0.080566,-0.056396,-0.078613,6.031805,4
324,money,,0.158203,0.051270,0.066406,0.210938,0.035156,-0.004669,-0.004456,-0.082031,...,-0.060303,-0.125977,0.062988,0.049072,0.124023,-0.080566,-0.056396,-0.078613,6.031805,4
325,money,,0.158203,0.051270,0.066406,0.210938,0.035156,-0.004669,-0.004456,-0.082031,...,-0.060303,-0.125977,0.062988,0.049072,0.124023,-0.080566,-0.056396,-0.078613,6.031805,4
326,money,,0.158203,0.051270,0.066406,0.210938,0.035156,-0.004669,-0.004456,-0.082031,...,-0.060303,-0.125977,0.062988,0.049072,0.124023,-0.080566,-0.056396,-0.078613,6.031805,4
327,money,,0.158203,0.051270,0.066406,0.210938,0.035156,-0.004669,-0.004456,-0.082031,...,-0.060303,-0.125977,0.062988,0.049072,0.124023,-0.080566,-0.056396,-0.078613,6.031805,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,unemployment,,0.398438,-0.045166,-0.265625,0.198242,0.398438,-0.009338,-0.166992,-0.082031,...,-0.102539,-0.015381,0.039551,0.314453,-0.179688,0.164062,0.058838,0.012573,15.340950,27
677,jobless,,0.496094,0.026001,-0.104004,0.215820,0.228516,0.213867,-0.227539,-0.028320,...,-0.021729,-0.251953,0.062012,0.078125,-0.157227,0.251953,-0.040527,0.025391,15.455056,26
678,jobless,,0.496094,0.026001,-0.104004,0.215820,0.228516,0.213867,-0.227539,-0.028320,...,-0.021729,-0.251953,0.062012,0.078125,-0.157227,0.251953,-0.040527,0.025391,15.455056,26
679,jobless,,0.496094,0.026001,-0.104004,0.215820,0.228516,0.213867,-0.227539,-0.028320,...,-0.021729,-0.251953,0.062012,0.078125,-0.157227,0.251953,-0.040527,0.025391,15.455056,26


In [23]:
sample = topic_extractorer.pre_trained.word.sample(1000)
sims = np.array([wv.similarity(word1,word2) for i,word1 in enumerate(sample[:-1]) for word2 in sample[i+1:]])
sims.sort()
median_sim = sims[sims.shape[0]//2]
coalescence = []
for label in sorted(unprompted_clusters.labels.unique()):
    words = unprompted_clusters[unprompted_clusters.labels == label].word.to_numpy()
    tally = 0
    num_entries = 0
    for i,word1 in enumerate(words[:-1]):
        for word2 in words[i+1:]:
            #print(word1,word2)
            tally += wv.similarity(word1,word2) - median_sim # adjusting for the median similarity between randomly picked words
            num_entries += 1
    if num_entries == 0: num_entries += 1
    coalescence.append({'cluster':label,'strength':tally/num_entries*len(words), 'examples': list(set(words))[:3]})
coalescence_df = pd.DataFrame(coalescence).sort_values(by='strength',ascending=False)
coalescence_df

Unnamed: 0,cluster,strength,examples
18,25,31.796455,[job]
1,0,12.258999,"[opportunity, opportunities, experience]"
20,27,9.715584,[unemployment]
5,4,7.949114,[money]
24,32,7.065879,[economic]
15,18,6.520283,"[agricultural, agriculture]"
25,33,6.182644,[financial]
16,19,5.619564,"[crop, crops, wheat]"
3,2,5.514252,"[receive, receiving, obtain]"
10,10,4.750502,"[salaries, salary]"


In [24]:
for key in latent_vars_map:
    coalescence_df[key] = [max(0,np.mean([((wv.similarity(word1,word2) - median_sim)/(1-median_sim)) for word2 in latent_vars_map[key] for word1 in words1])) for words1 in coalescence_df.examples]
coalescence_df

Unnamed: 0,cluster,strength,examples,protein supply,energy supply,food supply,political stability,armed conflict,floods,droughts,earthquakes,cyclones,disease,income,prices,climate,inclusivity
18,25,31.796455,[job],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338995,0.0,0.0,0.032694
1,0,12.258999,"[opportunity, opportunities, experience]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018104,0.0,0.0,0.0
20,27,9.715584,[unemployment],0.0,0.034321,0.0,0.072793,0.038521,0.097461,0.082664,0.077937,0.043921,0.026523,0.436022,0.011958,0.034313,0.134235
5,4,7.949114,[money],0.00979,0.0,0.004593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231398,0.126873,0.0,0.0
24,32,7.065879,[economic],0.0,0.074815,0.001991,0.116053,0.008262,0.084514,0.051772,0.034559,0.041371,0.0,0.242611,0.014831,0.031305,0.0
15,18,6.520283,"[agricultural, agriculture]",0.168442,0.403144,0.286338,0.012277,0.0,0.090378,0.164445,0.0,0.003401,0.0,0.088432,0.0,0.070659,0.0
25,33,6.182644,[financial],0.0,0.0,0.0,0.034291,0.0,0.0,0.0,0.0,0.031186,0.0,0.116196,0.0,0.0,0.0
16,19,5.619564,"[crop, crops, wheat]",0.137234,0.533502,0.325388,0.0,0.0,0.184479,0.229797,0.0,0.090899,0.027929,0.008572,0.0,0.148771,0.0
3,2,5.514252,"[receive, receiving, obtain]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006181
10,10,4.750502,"[salaries, salary]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26297,0.0527,0.0,0.020607


In [35]:
final_matrix = coalescence_df.copy()
for i,cluster in enumerate(final_matrix.cluster):
    row = final_matrix[final_matrix.cluster == cluster].iloc[0]
    if all(row.loc['protein supply':'inclusivity']<.1) and row.strength > 1:
        new_col = [0]*len(final_matrix)
        new_col[i] = 1
        final_matrix[cluster] = new_col
final_matrix

Unnamed: 0,cluster,strength,examples,protein supply,energy supply,food supply,political stability,armed conflict,floods,droughts,...,cyclones,disease,income,prices,climate,inclusivity,0,2,31,7
18,25,31.796455,[job],0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.338995,0.0,0.0,0.032694,0,0,0,0
1,0,12.258999,"[opportunity, opportunities, experience]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018104,0.0,0.0,0.0,1,0,0,0
20,27,9.715584,[unemployment],0.0,0.034321,0.0,0.072793,0.038521,0.097461,0.082664,...,0.043921,0.026523,0.436022,0.011958,0.034313,0.134235,0,0,0,0
5,4,7.949114,[money],0.00979,0.0,0.004593,0.0,0.0,0.0,0.0,...,0.0,0.0,0.231398,0.126873,0.0,0.0,0,0,0,0
24,32,7.065879,[economic],0.0,0.074815,0.001991,0.116053,0.008262,0.084514,0.051772,...,0.041371,0.0,0.242611,0.014831,0.031305,0.0,0,0,0,0
15,18,6.520283,"[agricultural, agriculture]",0.168442,0.403144,0.286338,0.012277,0.0,0.090378,0.164445,...,0.003401,0.0,0.088432,0.0,0.070659,0.0,0,0,0,0
25,33,6.182644,[financial],0.0,0.0,0.0,0.034291,0.0,0.0,0.0,...,0.031186,0.0,0.116196,0.0,0.0,0.0,0,0,0,0
16,19,5.619564,"[crop, crops, wheat]",0.137234,0.533502,0.325388,0.0,0.0,0.184479,0.229797,...,0.090899,0.027929,0.008572,0.0,0.148771,0.0,0,0,0,0
3,2,5.514252,"[receive, receiving, obtain]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.006181,0,1,0,0
10,10,4.750502,"[salaries, salary]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.26297,0.0527,0.0,0.020607,0,0,0,0
