In [1]:
import wikionly #script name is wikionly (no summary), class name is wiki
import re as re
import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet
import math

#Input two Wikipedia articles to compute similarity percentage
class similar:
    """Estimate how related two Wikipedia articles are.

    The most common words of each article (as reported by
    ``wikionly.wiki(...).commonwords(40)``) are turned into WordNet synset
    names (``<word>.n.01``) and compared pairwise with ``path_similarity``.
    The per-pair scores are averaged into a percentage clamped to 0-100.

    Attributes set after construction:
        text1, text2: the article titles passed in.
        wiki1, wiki2: the ``wikionly.wiki`` objects for both titles.
        wiki1list, wiki2list: synset names built from the common words.
        sum, count: accumulated score and number of scored word pairs.
        pct: final percentage (0 when no pair could be scored).
    """

    # Suffix selecting the first noun sense of a word in WordNet.
    _FIRST_SENSE = '.n.01'
    # Matches synset names whose word ends in digits (years, plain numbers);
    # such names are excluded from scoring.
    _NUMERIC_SYNSET = re.compile(r"\d+\.n\.01")
    # (rank limit, exponent weight): a pair gets the first weight for which
    # BOTH words rank within the limit in their own article.
    _RANK_WEIGHTS = ((10, 4.5), (20, 2.5))
    _DEFAULT_WEIGHT = 1.5

    def __init__(self, text1, text2, verbose=1):
        """To start, assign var = comparewiki.similar('arg1','arg2', verbose=1).

        arg1 and arg2 are names of the wikipedia articles.
        verbose=1 prints the probability score and mathematical calculation.
        verbose=2 additionally prints array of words for each article
        verbose=0 disables any logs.
        To get values in a list for storage, use .ans(). To print the 40
        common words used for comparison, use .words()

        Raises:
            TypeError: if either article name is not a string.
        """
        # Validate before doing any work: previously an error was printed and
        # construction then crashed with AttributeError on self.wiki1.
        if not isinstance(text1, str):
            raise TypeError('Error! The first argument is not a string format!')
        if not isinstance(text2, str):
            raise TypeError('Error! The second argument is not a string format!')

        self.text1 = text1
        self.text2 = text2
        self.verbose = verbose  # Verbose/log level of detail
        self.wn = nltk.corpus.wordnet  # the corpus reader

        # Run internal wikipedia python file for processing for both titles
        self.wiki1 = wikionly.wiki(text1)
        self.wiki2 = wikionly.wiki(text2)

        # Compute the percentage; this also builds wiki1list / wiki2list
        self.percent(self.wiki1, self.wiki2, self.verbose)

        # words() prints by itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None").
        if self.verbose == 2:
            self.words()

    def _synset_or_none(self, name):
        """Return the WordNet synset called *name*, or None if lookup fails."""
        try:
            return self.wn.synset(name)
        except Exception:
            # NOTE(review): this also hides a missing WordNet corpus; the
            # original code swallowed that case in the same way.
            return None

    def _lookup_senses(self, names):
        """Map every non-numeric synset name to its (1st, 2nd) sense synsets.

        Numeric names are left out of the mapping so callers can skip them.
        Each distinct name is looked up only once.
        """
        senses = {}
        for name in names:
            if name in senses or self._NUMERIC_SYNSET.search(name):
                continue
            second_name = name.replace('n.01', 'n.02')
            senses[name] = (self._synset_or_none(name),
                            self._synset_or_none(second_name))
        return senses

    @classmethod
    def _rank_weight(cls, rank1, rank2):
        """Return the exponent weight for a pair of 1-based word ranks."""
        for limit, weight in cls._RANK_WEIGHTS:
            if rank1 <= limit and rank2 <= limit:
                return weight
        return cls._DEFAULT_WEIGHT

    @staticmethod
    def _pair_score(syn_a, syn_b, weight, offset):
        """Score two synsets, or return None when no score can be computed.

        score = s * exp(weight * s) + 10 * log(offset + s), where s is the
        WordNet path similarity of the two synsets.
        """
        if syn_a is None or syn_b is None:
            return None
        closeness = syn_a.path_similarity(syn_b)
        if closeness is None:
            return None
        return (closeness * math.exp(weight * closeness)
                + 10 * math.log(offset + closeness))

    #Retrieve top 40 common words from wiki page, append .n.01 for NLTK usage
    def percent(self, input1, input2, verbose):
        """Compute, store and return the similarity percentage.

        The three parameters are kept for backward compatibility only: the
        method reads ``self.wiki1``, ``self.wiki2`` and ``self.verbose``.

        Returns:
            int: percentage in 0..100, also stored in ``self.pct``. It is 0
            when not a single word pair could be scored.
        """
        self.wiki1list = [key + self._FIRST_SENSE
                          for key in self.wiki1.commonwords(40)]
        self.wiki2list = [key + self._FIRST_SENSE
                          for key in self.wiki2.commonwords(40)]

        senses = self._lookup_senses(self.wiki1list + self.wiki2list)

        # count and sum for calculating similarity
        self.count = 0
        self.sum = 0

        # Words ranked 1-10 and 11-20 in both articles get a larger weight.
        for rank1, word1 in enumerate(self.wiki1list, start=1):
            if word1 not in senses:  # numeric word, never scored
                continue
            first1, second1 = senses[word1]
            for rank2, word2 in enumerate(self.wiki2list, start=1):
                if word2 not in senses:  # numeric word, never scored
                    continue
                first2, second2 = senses[word2]
                weight = self._rank_weight(rank1, rank2)

                # Similarity of the first definitions of both words.
                # NOTE(review): 0.885 here vs 0.89 below is kept exactly as
                # in the original formula - confirm it is intentional.
                primary = self._pair_score(first1, first2, weight, 0.885)
                if primary is None:
                    # WordNet cannot compare the words: identical words
                    # still count as a perfect match (similarity 1).
                    if word1 == word2:
                        self.sum += math.exp(weight) + 10 * math.log(1.89)
                        self.count += 1
                    continue

                # Words have many meanings, so the second definitions are
                # compared too and the best of the four scores is used, e.g.
                # money.n.01 may match value.n.02 better than value.n.01.
                alternates = (
                    self._pair_score(second1, first2, weight, 0.89),
                    self._pair_score(first1, second2, weight, 0.89),
                    self._pair_score(second1, second2, weight, 0.89),
                )
                if any(score is None for score in alternates):
                    # NOTE(review): original behaviour preserved - when a
                    # second sense is unavailable the pair is skipped
                    # entirely instead of falling back to `primary`.
                    continue

                self.sum += max(primary, *alternates)
                self.count += 1

        # Report the result, clamped to the 0-100 range
        if self.count != 0:
            self.pct = min(100, max(0, round(self.sum / self.count * 100)))
            if self.verbose >= 1:
                print('Probability of topics being related is ' + str(self.pct) + '%')
                print('Count is ' + str(self.count) + ' and sum is ' + str(self.sum))
        else:
            # Always define pct so the return below and ans() cannot fail
            # with AttributeError.
            self.pct = 0
            if self.verbose >= 1:
                print('No relation index can be calculated as words are all foreign')

        return self.pct

    #Print out list of common words for both Wiki articles
    def words(self):
        """Print the synset-name lists of both articles; returns None."""
        print(self.wiki1list)
        print('\n')
        print(self.wiki2list)

    #Outputs list of results [Article 1, Article 2, Percentage, Yes/No] that can be put into a dataframe
    def ans(self):
        """Return ``[text1, text2, pct, 'Yes'|'No']`` for storage.

        The verdict is 'Yes' when pct is above 49. With verbose == 2 the two
        word lists are appended as well. The list is also kept in
        ``self.listans``.
        """
        verdict = 'Yes' if self.pct > 49 else 'No'
        self.listans = [self.text1, self.text2, self.pct, verdict]

        if self.verbose == 2:
            self.listans.append(self.wiki1list)
            self.listans.append(self.wiki2list)

        return self.listans

    def help(self):
        """Print a short usage description."""
        print("To start, assign var = comparewiki.similar('arg1','arg2', verbose=1). arg1 and arg2 are names of the wikipedia articles, while verbose=1 prints the probability score and mathematical calculation. verbose=2 additionally prints array of words for each article, and verbose=0 disables any logs. To get values in a list for storage, use .ans(). To get the 40 common words for comparison, use .words()")

In [2]:
aa = similar('Armin van Buuren','Tiesto')
aa.ans()

Probability of topics being related is 95%
Count is 511 and sum is 486.3398758941804


['Armin van Buuren', 'Tiesto', 95, 'Yes']

In [3]:
ab = similar('Armin van Buuren','paul van dyk')

Probability of topics being related is 100%
Count is 466 and sum is 574.5343295379461


In [4]:
ac = similar('Paul van dyk','Tiesto')

Probability of topics being related is 66%
Count is 443 and sum is 294.19174429833765


In [5]:
ad = similar('Martin Garrix','Tiesto')

Probability of topics being related is 66%
Count is 420 and sum is 278.9390123955344


In [6]:
ae = similar('Tiesto','Martin Garrix')

Probability of topics being related is 66%
Count is 420 and sum is 278.93901239553435


In [7]:
af = similar('vulpix','pikachu')

Probability of topics being related is 100%
Count is 406 and sum is 442.15721907345363


In [8]:
ag = similar('Need For Speed Underground 2','dota 2')

Probability of topics being related is 97%
Count is 303 and sum is 293.2367336913429


In [9]:
ah = similar('beyonce','dota 2')

Probability of topics being related is 51%
Count is 332 and sum is 170.9624639542081


In [10]:
ai = similar('Beyonce','Rihanna')

Probability of topics being related is 100%
Count is 444 and sum is 601.5158411768153


In [11]:
aj = similar('cupcakes','rihanna')

Probability of topics being related is 9%
Count is 361 and sum is 33.01681238158996


In [12]:
ak = similar('donald trump','rihanna')

Probability of topics being related is 12%
Count is 382 and sum is 44.33359196834941


In [13]:
al = similar('Banksy','Van Gogh')

Probability of topics being related is 70%
Count is 284 and sum is 200.17005019197885


In [14]:
am = similar('Banksy','Piet Mondrian')

Probability of topics being related is 53%
Count is 423 and sum is 222.29305752246182


In [15]:
an = similar('Nadir Afonso','Banksy')

Probability of topics being related is 62%
Count is 403 and sum is 250.78266744300456


In [16]:
ao = similar('DBS Bank','goldman sachs')

Probability of topics being related is 57%
Count is 290 and sum is 164.2372419713452


In [17]:
ap = similar('OCBC Bank','goldman sachs')

Probability of topics being related is 45%
Count is 291 and sum is 130.9037201603778


In [18]:
aq = similar('Citi Bank','goldman sachs')

Probability of topics being related is 79%
Count is 347 and sum is 274.1558126704726


In [19]:
ar = similar('OCBC Bank','Citi Bank')

Probability of topics being related is 69%
Count is 308 and sum is 213.73383084398023


In [20]:
as_ = similar('Malaysia','Singapore')

Probability of topics being related is 100%
Count is 363 and sum is 468.4736211684941


In [21]:
at = similar('Lee Kuan Yew','Lee Hsien Loong')

Probability of topics being related is 100%
Count is 577 and sum is 587.4734112926852


In [22]:
au = similar('Lee Kuan Yew','Mahathir Mohamad')

Probability of topics being related is 56%
Count is 417 and sum is 235.37218048841336


In [23]:
av = similar('Mahathir Mohamad','Lee Hsien Loong')

Probability of topics being related is 60%
Count is 450 and sum is 268.7364013255993


In [24]:
aw = similar('Mahathir Mohamad','Tharman Shanmugaratnam')

Probability of topics being related is 45%
Count is 414 and sum is 187.77477535562153


In [25]:
ax = similar('Malaysia','Finland')

Probability of topics being related is 100%
Count is 276 and sum is 400.2913561256497


In [26]:
ay = similar('Japanese language','Chinese language')

Probability of topics being related is 100%
Count is 407 and sum is 753.1391764922389


In [27]:
az = similar('Indian cuisine','Czech cuisine')

Probability of topics being related is 92%
Count is 276 and sum is 254.37678860068877


In [28]:
ba = similar('Amsterdam','Utrecht')

Probability of topics being related is 100%
Count is 341 and sum is 674.2563667646681


In [29]:
bb = similar('Amsterdam','London')

Probability of topics being related is 100%
Count is 309 and sum is 417.13302747924024


In [30]:
bc = similar('Amsterdam','Sydney')

Probability of topics being related is 100%
Count is 404 and sum is 519.7419962886257


In [31]:
bd = similar('Angela Merkel','Berlin')

Probability of topics being related is 56%
Count is 421 and sum is 234.95015383478332


In [32]:
be = similar('Angela Merkel','London')

Probability of topics being related is 26%
Count is 382 and sum is 97.95924354468332


In [33]:
bf = similar('Adolf Hitler','Berlin')

Probability of topics being related is 56%
Count is 379 and sum is 212.67261588852023


In [34]:
bg = similar('Adolf Hitler','London')

Probability of topics being related is 25%
Count is 343 and sum is 85.12139632497757


In [35]:
bh = similar('Adolf Hitler','Donald Trump')

Probability of topics being related is 22%
Count is 346 and sum is 77.29692631067833


In [36]:
bi = similar('Angela Merkel','Donald Trump')

Probability of topics being related is 20%
Count is 381 and sum is 74.51851168072163


In [37]:
bj = similar('Amsterdam','Tiesto')

Probability of topics being related is 28%
Count is 352 and sum is 97.39534634503472


In [38]:
bk = similar('Amsterdam','Pikachu')

Probability of topics being related is 13%
Count is 305 and sum is 39.79759661778893


In [39]:
bl = similar('Amsterdam','Dota 2')

Probability of topics being related is 27%
Count is 241 and sum is 65.93178606949283


In [40]:
bm = similar('Amsterdam','Need For Speed Underground 2')

Probability of topics being related is 17%
Count is 320 and sum is 55.76880844755138


In [41]:
bo = similar('Amsterdam','Beyonce')

Probability of topics being related is 26%
Count is 352 and sum is 92.54215814160068


In [42]:
bp = similar('Amsterdam','Rihanna')

Probability of topics being related is 20%
Count is 321 and sum is 63.818221327972324


In [43]:
bq = similar('Amsterdam','Cupcakes')

Probability of topics being related is 16%
Count is 289 and sum is 45.36412234617406


In [44]:
br = similar('Amsterdam','Singapore')

Probability of topics being related is 74%
Count is 339 and sum is 252.5312192688828


In [2]:
bs = similar('Amsterdam','Donald Trump')

Probability of topics being related is 21%
Count is 307 and sum is 63.14353331827822


In [3]:
bt = similar('Amsterdam','Van Gogh')

Probability of topics being related is 70%
Count is 226 and sum is 157.19175849634664


In [4]:
bu = similar('Obama','Goldman Sachs')

Probability of topics being related is 63%
Count is 524 and sum is 330.11948572559135


In [5]:
bv = similar('Obama','Nickelback')

Probability of topics being related is 0%
Count is 638 and sum is -2.642348164905025


In [6]:
bw = similar('21 Guns (band)','Nickelback')

Probability of topics being related is 64%
Count is 374 and sum is 238.5787280544532


In [7]:
bx = similar('21 Pilots','Nickelback')

Probability of topics being related is 98%
Count is 511 and sum is 499.01141769108483


In [8]:
by = similar('21 Pilots','Cobra Starship')

Probability of topics being related is 100%
Count is 393 and sum is 459.3512917641034


In [9]:
bz = similar('My Chemical Romance','Linkin Park')

Probability of topics being related is 100%
Count is 325 and sum is 471.88573151408707


In [10]:
ca = similar('My Chemical Romance','Imagine Dragons')

Probability of topics being related is 100%
Count is 342 and sum is 479.0128555982487


In [11]:
cb = similar('My Chemical Romance','Jolin Tsai')

Probability of topics being related is 48%
Count is 409 and sum is 194.73976793210213


In [12]:
cc = similar('Jay Chou','Jolin Tsai')

Probability of topics being related is 93%
Count is 555 and sum is 514.783025771669


In [13]:
cd = similar('Paramore','Nickelback')

Probability of topics being related is 100%
Count is 420 and sum is 494.5227264262206


In [14]:
ce = similar('Paramore','Kelly Clarkson')

Probability of topics being related is 95%
Count is 439 and sum is 417.3814388364541


In [15]:
cf = similar('Python (programming language)','Java (programming language)')

Probability of topics being related is 72%
Count is 599 and sum is 429.0715403159629


In [16]:
cg = similar('HTML','CSS')

Probability of topics being related is 100%
Count is 293 and sum is 536.7343627269529


In [17]:
ch = similar('Microsoft','Apple (company)')

Probability of topics being related is 83%
Count is 276 and sum is 227.76846922189756


In [18]:
ci = similar('Johnny Depp','Michael Bay')

Probability of topics being related is 48%
Count is 362 and sum is 174.3002489012302


In [19]:
cj = similar('Johnny Depp','Jennifer Lawrence')

Probability of topics being related is 75%
Count is 380 and sum is 285.6707896102245


In [20]:
ck = similar('SAS (software)','SPSS')

Probability of topics being related is 86%
Count is 383 and sum is 327.5233825374809


In [21]:
cl = similar('saudi arabia','israel')

Probability of topics being related is 97%
Count is 440 and sum is 428.0399802437399


In [22]:
cm = similar('European migrant crisis','World War II')

Probability of topics being related is 36%
Count is 343 and sum is 123.97747053161865


In [23]:
cn = similar('Great Britain','World War II')

Probability of topics being related is 57%
Count is 462 and sum is 264.85063698299575


In [24]:
co = similar('Singapore','World War II')

Probability of topics being related is 41%
Count is 422 and sum is 173.45757342485018


In [25]:
cp = similar('Iceland','World War II')

Probability of topics being related is 36%
Count is 322 and sum is 115.16066836106272


In [26]:
cq = similar('Egypt','World War II')

Probability of topics being related is 76%
Count is 402 and sum is 306.39916566339974


In [27]:
cr = similar('Mueller Report','Russia')

Probability of topics being related is 17%
Count is 481 and sum is 81.89119722331667


In [28]:
cs = similar('Mueller Report','China')

Probability of topics being related is 20%
Count is 505 and sum is 99.82816582343789


In [29]:
ct = similar('Facebook','Linkedin')

Probability of topics being related is 100%
Count is 214 and sum is 389.62525227484116


In [30]:
cu = similar('Facebook','Google')

Probability of topics being related is 100%
Count is 334 and sum is 372.42853758792154


In [31]:
cv = similar('Chemistry','Physics')

Probability of topics being related is 51%
Count is 552 and sum is 281.9426915939084


In [32]:
cw = similar('Biology','Physics')

Probability of topics being related is 55%
Count is 551 and sum is 300.70321249895807


In [33]:
cx = similar('Chemistry','Biology')

Probability of topics being related is 53%
Count is 485 and sum is 256.84843314172775


In [34]:
cy = similar('Chemistry','Periodic Table')

Probability of topics being related is 65%
Count is 420 and sum is 273.6041684846002


In [35]:
cz = similar('Physics','Periodic Table')

Probability of topics being related is 17%
Count is 477 and sum is 80.38187655856315


In [36]:
da = similar('Biology','Periodic Table')

Probability of topics being related is 13%
Count is 418 and sum is 55.28261150003667


In [37]:
db = similar('DNA','Biology')

Probability of topics being related is 43%
Count is 398 and sum is 171.20913151410127


In [38]:
dc = similar('DNA','Chemistry')

Probability of topics being related is 40%
Count is 398 and sum is 159.63128569324357


In [39]:
dd = similar('DNA','Physics')

Probability of topics being related is 18%
Count is 450 and sum is 82.1725744516854


In [40]:
de = similar('coffee','tea')

Probability of topics being related is 40%
Count is 364 and sum is 144.78125137184156


In [41]:
df = similar('coffee','milk')

Probability of topics being related is 77%
Count is 324 and sum is 248.281231851635


In [42]:
dg = similar('apple','orange (fruit)')

Probability of topics being related is 85%
Count is 377 and sum is 320.41958022931817


In [43]:
dh = similar('apple','orange (colour)')

Probability of topics being related is 35%
Count is 358 and sum is 124.16508274153114


In [44]:
di = similar('love','hate')

Probability of topics being related is 59%
Count is 553 and sum is 325.7764438535962


In [45]:
dj = similar('Housing and Development Board','Government Technology Agency')

Probability of topics being related is 100%
Count is 484 and sum is 726.8472301748425


In [46]:
dk = similar('Information technology','Government Technology Agency')

Probability of topics being related is 51%
Count is 461 and sum is 236.56240815081927


In [47]:
dl = similar('Simcity 4','The Sims')

Probability of topics being related is 100%
Count is 230 and sum is 463.9831426179662


In [48]:
dm = similar('Simcity 4','Cities Skylines')

Probability of topics being related is 100%
Count is 279 and sum is 651.0968467958069


In [49]:
dn = similar('Pokemon','Cities Skylines')

Probability of topics being related is 96%
Count is 308 and sum is 294.9798926984053


In [50]:
do = similar('Counter-Strike','Cities Skylines')

Probability of topics being related is 100%
Count is 292 and sum is 308.35308941266754


In [51]:
dp = similar('Counter-Strike','Fortnite')

Probability of topics being related is 100%
Count is 290 and sum is 293.5552671021677


In [52]:
dq = similar('Counter-Strike','Overwatch (video game)')

Probability of topics being related is 100%
Count is 313 and sum is 366.24059488709463


In [53]:
dr = similar('Overcooked','Overwatch (video game)')

Probability of topics being related is 100%
Count is 311 and sum is 502.01156316544757


In [54]:
ds = similar('Overcooked','League of Legends')

Probability of topics being related is 100%
Count is 363 and sum is 433.0443079690143


In [55]:
dt = similar('Simcity 4','Periodic Table')

Probability of topics being related is 16%
Count is 325 and sum is 51.588672977084926


In [56]:
du = similar('Only Girl (In the World)',"Crazy in Love")

Probability of topics being related is 100%
Count is 374 and sum is 624.9633662053797


In [57]:
dv = similar('A Girl like Me (Rihanna album)',"Dangerously in Love")

Probability of topics being related is 100%
Count is 388 and sum is 684.0309687945253


In [2]:
dw = similar('Christianity',"Buddhism")

Probability of topics being related is 45%
Count is 533 and sum is 239.97524432231438


In [3]:
dx = similar('Christianity',"Islam")

Probability of topics being related is 75%
Count is 560 and sum is 420.39791529711846


In [4]:
dy = similar('Buddhism',"Islam")

Probability of topics being related is 79%
Count is 381 and sum is 300.2454279457024


In [5]:
dz = similar('Neural Network',"Logistic regression")

Probability of topics being related is 46%
Count is 354 and sum is 163.00086527859443


In [6]:
ea = similar('Reddit',"Logistic regression")

Probability of topics being related is 3%
Count is 224 and sum is 7.145414660396291


In [7]:
eb = similar('Reddit',"Facebook")

Probability of topics being related is 100%
Count is 214 and sum is 348.8019630013946


In [8]:
ec = similar('Reddit',"Twitter")

Probability of topics being related is 100%
Count is 228 and sum is 239.12350112048333


In [9]:
ed = similar('Reddit',"Quora")

Probability of topics being related is 100%
Count is 213 and sum is 311.9104888637269


In [10]:
ee = similar('Google',"Quora")

Probability of topics being related is 63%
Count is 332 and sum is 208.45242575081596
