In [4]:
from comparewikilist import similar
import pandas as pd
import numpy as np

def get_df(store):
    """Build a DataFrame of similarity results for every unordered pair of topics.

    Each topic in ``store`` is compared against all topics that come after it
    via ``similar(topic, later_topics, verbose=0).iterate()``, so every pair
    appears exactly once.

    Parameters
    ----------
    store : sequence of str
        Topic (article) titles to compare. Must support ``len`` and slicing.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns 'Topic 1', 'Topic 2', 'Probability'
        (cast to int) and 'Similar?', indexed 0..n-1. When ``store`` holds
        fewer than two topics the frame has these columns and zero rows.
    """
    print(store)
    columns = ['Topic 1', 'Topic 2', 'Probability', 'Similar?']

    # Gather every result row first and build the frame once at the end;
    # concatenating inside the loop re-copies all earlier rows on each pass.
    rows = []
    for i, article1 in enumerate(store):
        article2list = store[i+1:]
        if len(article2list) == 0:
            # The last topic has nothing left to be compared with, so skip
            # the (pointless) similar() call for it.
            continue
        rows.extend(similar(article1, article2list, verbose=0).iterate())

    # Passing `columns` explicitly keeps the schema even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    df['Probability'] = df['Probability'].astype('int')

    return df

# The old get_df function (below) is not time efficient
This is because it retrieves the Wikipedia article store[i] again every time it makes a new comparison <br>
Since article store[i] stays fixed for each iteration of the j loop, the new code from comparewikilist does not repeatedly retrieve the article and extract its top 40 words, but instead stores them for re-use <br>
Only when a new store[i] is required is the article retrieved and processed again <br>
## New code (above) is more time efficient and does not require a second loop

In [13]:
# from comparewiki import similar
# import pandas as pd
# import numpy as np

# def get_df(store):
#     print(store)
#     # Build dataframe based on all permutations of topics
#     for i in range(len(store)):
#         for j in range(i+1, len(store)):
#             if i == 0 and j == 1:
#                 df = pd.DataFrame(pd.DataFrame(np.array(similar(store[i],store[j], verbose=0).ans())).transpose())
#             else:
#                 df = pd.concat([df, pd.DataFrame(np.array(similar(store[i],store[j], verbose=0).ans())).transpose()], axis=0)

#     df.columns = ['Topic 1', 'Topic 2', 'Probability', 'Similar?']
#     df.index = range(df.shape[0])
            
#     return df

In [14]:
# Topics to compare: ten political leaders followed by nine philosophers/thinkers
store = [
    'Alexander the Great',
    'Qin Shi Huang',
    'Augustus',
    'Charlemagne',
    'Genghis Khan',
    'Adolf Hitler',
    'Joseph Stalin',
    'Winston Churchill',
    'Mao Zedong',
    'Nelson Mandela',
    'Confucius',
    'Socrates',
    'Plato',
    'Aristotle',
    'Adam Smith',
    'Immanuel Kant',
    'Karl Marx',
    'Friedrich Nietzsche',
    'Sigmund Freud',
]

In [15]:
# Compute the pairwise similarity for every combination of topics in `store`
# (political leaders and philosophers), then display the resulting frame
politics_vs_philos = get_df(store)
politics_vs_philos

['Alexander the Great', 'Qin Shi Huang', 'Augustus', 'Charlemagne', 'Genghis Khan', 'Adolf Hitler', 'Joseph Stalin', 'Winston Churchill', 'Mao Zedong', 'Nelson Mandela', 'Confucius', 'Socrates', 'Plato', 'Aristotle', 'Adam Smith', 'Immanuel Kant', 'Karl Marx', 'Friedrich Nietzsche', 'Sigmund Freud']


Unnamed: 0,Topic 1,Topic 2,Probability,Similar?
0,Alexander the Great,Qin Shi Huang,67,Yes
1,Alexander the Great,Augustus,98,Yes
2,Alexander the Great,Charlemagne,52,Yes
3,Alexander the Great,Genghis Khan,65,Yes
4,Alexander the Great,Adolf Hitler,40,No
...,...,...,...,...
166,Immanuel Kant,Friedrich Nietzsche,100,Yes
167,Immanuel Kant,Sigmund Freud,100,Yes
168,Karl Marx,Friedrich Nietzsche,73,Yes
169,Karl Marx,Sigmund Freud,61,Yes


In [16]:
# Order the topic pairs from most to least similar
# (Probability is cast to int before sorting so the ordering is numeric)
politics_vs_philos = (
    politics_vs_philos
    .assign(Probability=lambda frame: frame['Probability'].astype('int'))
    .sort_values(by='Probability', ascending=False)
)

In [17]:
# Save as CSV
# The index is named so it is written under an 'Index' header, and '|' is the
# field separator; the commented-out read_csv cell below expects both.
politics_vs_philos.index.name = 'Index'
politics_vs_philos.to_csv('politics_vs_philos.csv', sep='|')

In [18]:
# politics_vs_philos = pd.read_csv('politics_vs_philos.csv', sep='|', index_col='Index')

In [19]:
# Preview the first five rows of the frame (highest probabilities after the sort above)
politics_vs_philos.head()

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
170,Friedrich Nietzsche,Sigmund Freud,100,Yes
149,Socrates,Sigmund Freud,100,Yes
79,Genghis Khan,Sigmund Freud,100,Yes
80,Adolf Hitler,Joseph Stalin,100,Yes
82,Adolf Hitler,Mao Zedong,100,Yes


In [20]:
# First 50 pairs (in the current sort order) that were labelled similar
politics_vs_philos.loc[politics_vs_philos['Similar?'].eq('Yes')].head(50)

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
170,Friedrich Nietzsche,Sigmund Freud,100,Yes
149,Socrates,Sigmund Freud,100,Yes
79,Genghis Khan,Sigmund Freud,100,Yes
80,Adolf Hitler,Joseph Stalin,100,Yes
82,Adolf Hitler,Mao Zedong,100,Yes
94,Joseph Stalin,Mao Zedong,100,Yes
36,Augustus,Genghis Khan,100,Yes
35,Augustus,Charlemagne,100,Yes
117,Mao Zedong,Confucius,100,Yes
26,Qin Shi Huang,Confucius,100,Yes


In [21]:
# Last 50 pairs (in the current sort order) that were labelled not similar
politics_vs_philos.loc[politics_vs_philos['Similar?'].eq('No')].tail(50)

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
32,Qin Shi Huang,Karl Marx,31,No
7,Alexander the Great,Mao Zedong,31,No
57,Charlemagne,Confucius,31,No
88,Adolf Hitler,Adam Smith,30,No
73,Genghis Khan,Plato,30,No
44,Augustus,Plato,28,No
124,Mao Zedong,Friedrich Nietzsche,28,No
58,Charlemagne,Socrates,28,No
56,Charlemagne,Nelson Mandela,27,No
30,Qin Shi Huang,Adam Smith,27,No


In [22]:
# Every pair in which 'Adolf Hitler' appears as either topic
politics_vs_philos.loc[
    politics_vs_philos[['Topic 1', 'Topic 2']].eq('Adolf Hitler').any(axis=1)
]

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
80,Adolf Hitler,Joseph Stalin,100,Yes
82,Adolf Hitler,Mao Zedong,100,Yes
37,Augustus,Adolf Hitler,79,Yes
92,Adolf Hitler,Sigmund Freud,68,Yes
66,Genghis Khan,Adolf Hitler,65,Yes
81,Adolf Hitler,Winston Churchill,61,Yes
91,Adolf Hitler,Friedrich Nietzsche,60,Yes
52,Charlemagne,Adolf Hitler,49,No
83,Adolf Hitler,Nelson Mandela,45,No
89,Adolf Hitler,Immanuel Kant,44,No


In [23]:
# Every pair in which 'Winston Churchill' appears as either topic
politics_vs_philos.loc[
    politics_vs_philos[['Topic 1', 'Topic 2']].eq('Winston Churchill').any(axis=1)
]

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
93,Joseph Stalin,Winston Churchill,77,Yes
81,Adolf Hitler,Winston Churchill,61,Yes
39,Augustus,Winston Churchill,56,Yes
68,Genghis Khan,Winston Churchill,49,No
115,Winston Churchill,Sigmund Freud,47,No
105,Winston Churchill,Mao Zedong,46,No
6,Alexander the Great,Winston Churchill,46,No
54,Charlemagne,Winston Churchill,45,No
114,Winston Churchill,Friedrich Nietzsche,42,No
106,Winston Churchill,Nelson Mandela,40,No


In [30]:
# Pairs whose probability lies strictly between 40 and 60 (both bounds excluded)
politics_vs_philos.query('40 < Probability < 60')

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
71,Genghis Khan,Confucius,59,Yes
163,Adam Smith,Friedrich Nietzsche,59,Yes
125,Mao Zedong,Sigmund Freud,58,Yes
153,Plato,Karl Marx,58,Yes
156,Aristotle,Adam Smith,58,Yes
130,Nelson Mandela,Adam Smith,57,Yes
39,Augustus,Winston Churchill,56,Yes
145,Socrates,Adam Smith,55,Yes
42,Augustus,Confucius,55,Yes
95,Joseph Stalin,Nelson Mandela,54,Yes


In [32]:
# Pairs whose probability lies strictly between 30 and 40 (both bounds excluded)
politics_vs_philos.query('30 < Probability < 40')

Unnamed: 0_level_0,Topic 1,Topic 2,Probability,Similar?
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
126,Nelson Mandela,Confucius,38,No
12,Alexander the Great,Aristotle,38,No
90,Adolf Hitler,Karl Marx,37,No
132,Nelson Mandela,Karl Marx,37,No
5,Alexander the Great,Joseph Stalin,36,No
10,Alexander the Great,Socrates,36,No
21,Qin Shi Huang,Adolf Hitler,35,No
70,Genghis Khan,Nelson Mandela,35,No
9,Alexander the Great,Confucius,35,No
77,Genghis Khan,Karl Marx,34,No


In [24]:
# Summary statistics of the similarity probability across all topic pairs
politics_vs_philos['Probability'].describe()

count    171.000000
mean      52.403509
std       28.484171
min       12.000000
25%       28.000000
50%       46.000000
75%       73.000000
max      100.000000
Name: Probability, dtype: float64

In [25]:
# Count how many pairs carry each 'Similar?' label
politics_vs_philos['Similar?'].value_counts()

No     92
Yes    79
Name: Similar?, dtype: int64

In [26]:
# Inspect a single pair in detail; per the recorded output, verbose=2 also prints both word lists.
# NOTE(review): `.ans()` with two topic strings is the call style of the older comparewiki module
# (see the commented-out cell above) — confirm the `similar` imported from comparewikilist
# supports it after a fresh Restart & Run All.
similar('Adolf Hitler','Karl Marx', verbose=2).ans()

Probability of topics being related is 37%
Count is 345 and sum is 128.0847447073091
['hitler.n.01', 'german.n.01', 'kershaw.n.01', 'war.n.01', 'hitlers.n.01', 'party.n.01', 'national.n.01', 'germany.n.01', 'nazi.n.01', 'were.n.01', 'world.n.01', 'von.n.01', '1999.n.01', 'adolf.n.01', 'had.n.01', '1960.n.01', 'shirer.n.01', 'york.n.01', 'socialist.n.01', 'london.n.01', 'military.n.01', 'political.n.01', 'press.n.01', 'bullock.n.01', 'union.n.01', 'reich.n.01', 'holocaust.n.01', 'army.n.01', 'against.n.01', 'identifiers.n.01', 'university.n.01', 'movement.n.01', 'history.n.01', 'government.n.01', 'munich.n.01', 'jews.n.01', 'fascist.n.01', 'nsdap.n.01', 'battle.n.01', 'power.n.01']


['marx.n.01', 'karl.n.01', 'engels.n.01', 'marxs.n.01', 'social.n.01', 'volume.n.01', 'political.n.01', 'economic.n.01', 'communist.n.01', 'philosophy.n.01', 'works.n.01', 'class.n.01', 'history.n.01', 'international.n.01', 'press.n.01', 'york.n.01', 'wheen.n.01', 'german.n.01', 'theory.n.01', 'society.n.01

['Adolf Hitler',
 'Karl Marx',
 37,
 'No',
 ['hitler.n.01',
  'german.n.01',
  'kershaw.n.01',
  'war.n.01',
  'hitlers.n.01',
  'party.n.01',
  'national.n.01',
  'germany.n.01',
  'nazi.n.01',
  'were.n.01',
  'world.n.01',
  'von.n.01',
  '1999.n.01',
  'adolf.n.01',
  'had.n.01',
  '1960.n.01',
  'shirer.n.01',
  'york.n.01',
  'socialist.n.01',
  'london.n.01',
  'military.n.01',
  'political.n.01',
  'press.n.01',
  'bullock.n.01',
  'union.n.01',
  'reich.n.01',
  'holocaust.n.01',
  'army.n.01',
  'against.n.01',
  'identifiers.n.01',
  'university.n.01',
  'movement.n.01',
  'history.n.01',
  'government.n.01',
  'munich.n.01',
  'jews.n.01',
  'fascist.n.01',
  'nsdap.n.01',
  'battle.n.01',
  'power.n.01'],
 ['marx.n.01',
  'karl.n.01',
  'engels.n.01',
  'marxs.n.01',
  'social.n.01',
  'volume.n.01',
  'political.n.01',
  'economic.n.01',
  'communist.n.01',
  'philosophy.n.01',
  'works.n.01',
  'class.n.01',
  'history.n.01',
  'international.n.01',
  'press.n.01',
  'yo

In [27]:
# Per the recorded output, constructing the object with verbose=2 prints the probability,
# the count/sum and both word lists; the bare expression then shows the object's repr.
# NOTE(review): the recorded repr is comparewiki.similar, not the comparewikilist.similar
# imported in the first cell — confirm which class is in scope on a fresh kernel.
similar('Joseph Stalin','Nelson Mandela', verbose=2)

Probability of topics being related is 54%
Count is 291 and sum is 157.3901746822728
['service.n.01', 'stalin.n.01', '1991.n.01', 'montefiore.n.01', 'khlevniuk.n.01', 'conquest.n.01', 'soviet.n.01', 'kotkin.n.01', 'war.n.01', 'were.n.01', 'had.n.01', 'stalins.n.01', 'union.n.01', 'russian.n.01', 'volkogonov.n.01', 'party.n.01', 'lenin.n.01', 'although.n.01', 'communist.n.01', 'government.n.01', 'been.n.01', 'state.n.01', 'germany.n.01', 'georgian.n.01', 'civil.n.01', 'revolution.n.01', 'army.n.01', 'lenins.n.01', 'central.n.01', 'red.n.01', 'during.n.01', 'many.n.01', 'between.n.01', 'being.n.01', 'bolshevik.n.01', 'german.n.01', 'bolsheviks.n.01', 'according.n.01', 'moscow.n.01', 'soviets.n.01']


['mandela.n.01', 'sampson.n.01', 'meredith.n.01', '1994.n.01', 'lodge.n.01', 'south.n.01', 'smith.n.01', 'nelson.n.01', 'anc.n.01', 'african.n.01', '1986.n.01', 'benson.n.01', 'mandelas.n.01', 'africa.n.01', 'had.n.01', '1988.n.01', 'were.n.01', 'meer.n.01', 'although.n.01', 'president.n.01'

<comparewiki.similar at 0x1ab85a560c8>

In [28]:
# Per the recorded output, verbose=1 prints only the probability and the count/sum (no word lists).
# NOTE(review): the recorded repr is comparewiki.similar, not the comparewikilist.similar
# imported in the first cell — confirm which class is in scope on a fresh kernel.
similar('Joseph Stalin','Nelson Mandela', verbose=1)

Probability of topics being related is 54%
Count is 291 and sum is 157.3901746822728


<comparewiki.similar at 0x1ab88b728c8>