In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Show DataFrames in full: no cap on rows, columns, or cell text width.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [2]:
# Texts to vectorise. The first entry is the search query and the remaining
# entries are the documents it is scored against: the similarity cell
# compares row 0 of the TF-IDF matrix with rows 1 onwards, so the order of
# the entries in this list matters.
corpora = [
    # Query (index 0)
    "corona vaccine fake news",
    # Document 1 (index 1)
    "WHO is gathering the latest international multilingual scientific findings and knowledge on COVID-19. The global literature cited in the WHO COVID-19 database is updated daily (Monday through Friday) from searches of bibliographic databases, hand searching, and the addition of other expert-referred scientific articles. This database represents a comprehensive multilingual source of current literature on the topic. While it may not be exhaustive, new research is added regularly.",
    # Document 2 (index 2)
    "A COVID-19 vaccine candidate made of tiny artificial particles could be more powerful than other leading varieties at triggering a protective immune response. When the team injected mice with the nanoparticle vaccine, the animals produced virus-blocking antibodies at levels comparable to or greater than those produced by people who had recovered from COVID-19. Mice that received the vaccine produced about ten times more of these antibodies than did rodents vaccinated only with the spike protein, on which many COVID-19 vaccine candidates rely.",
    # Document 3 (index 3)
    "The rise of fake news in the American popular consciousness is one of the remarkable growth stories in recent years—a dizzying climb to make any Silicon Valley unicorn jealous. Just a few years ago, the phrase was meaningless. Today, according to a new Pew Research Center study, Americans rate it as a larger problem than racism, climate change, or terrorism.",
    # Document 4 (index 4)
    "\"Falsehood flies, and the Truth comes limping after it,\" Jonathan Swift once wrote. It was hyperbole three centuries ago. But it is a factual description of social media, according to an ambitious and first-of-its-kind study published Thursday in Science. The massive new study analyzes every major contested news story in English across the span of Twitter’s existence—some 126,000 stories, tweeted by 3 million users, over more than 10 years—and finds that the truth simply cannot compete with hoax and rumor. By every common metric, falsehood consistently dominates the truth on Twitter, the study finds: Fake news and false rumors reach more people, penetrate deeper into the social network, and spread much faster than accurate stories.",
    # Document 5 (index 5)
    "The anti-vaccination movement has gained traction online in recent years, and campaigners opposed to vaccination have moved their focus to making claims relating to the coronavirus. First, a video containing inaccurate claims about coronavirus vaccine trials, made by osteopath Carrie Madej, that has proved popular on social media. Carrie \'s video makes a false claim that the vaccines will change recipients\' DNA (which carries genetic information).\"The Covid-19 vaccines are designed to make us into genetically modified organisms.\" She also claims—without any evidence—that vaccines will \"hook us all up to an artificial intelligence interface\"."
]

In [3]:
# Learn the vocabulary and IDF weights from every text in `corpora`, and
# encode each text as one row of TF-IDF weights.
tvec = TfidfVectorizer()
transformed_weights = tvec.fit_transform(raw_documents=corpora)
# Mapping of each vocabulary term to its column index in the matrix.
print(tvec.vocabulary_)

{'corona': 57, 'vaccine': 246, 'fake': 79, 'news': 152, 'who': 256, 'is': 114, 'gathering': 92, 'the': 220, 'latest': 123, 'international': 112, 'multilingual': 148, 'scientific': 199, 'findings': 84, 'and': 19, 'knowledge': 121, 'on': 155, 'covid': 60, '19': 3, 'global': 95, 'literature': 127, 'cited': 43, 'in': 106, 'database': 63, 'updated': 241, 'daily': 62, 'monday': 143, 'through': 226, 'friday': 89, 'from': 90, 'searches': 200, 'of': 154, 'bibliographic': 30, 'databases': 64, 'hand': 99, 'searching': 201, 'addition': 9, 'other': 164, 'expert': 77, 'referred': 186, 'articles': 25, 'this': 223, 'represents': 191, 'comprehensive': 52, 'source': 207, 'current': 61, 'topic': 232, 'while': 255, 'it': 115, 'may': 136, 'not': 153, 'be': 29, 'exhaustive': 75, 'new': 151, 'research': 192, 'added': 8, 'regularly': 187, 'candidate': 35, 'made': 128, 'tiny': 229, 'artificial': 26, 'particles': 166, 'could': 59, 'more': 144, 'powerful': 172, 'than': 218, 'leading': 124, 'varieties': 249, 'at'

In [4]:
# Show the dense TF-IDF matrix: one row per text, one column per vocabulary
# term. Row labels are derived from the number of rows in the matrix (row 0
# is the query, the rest are numbered documents) rather than hard-coded, so
# the cell keeps working when texts are added to or removed from the corpus.
display(pd.DataFrame(
    transformed_weights.toarray(),
    columns=tvec.get_feature_names_out(),
    index=['Query'] + [f'Document_{i}' for i in range(1, transformed_weights.shape[0])],
))

Unnamed: 0,000,10,126,19,about,according,accurate,across,added,addition,after,ago,all,also,ambitious,american,americans,an,analyzes,and,animals,anti,antibodies,any,are,articles,artificial,as,at,be,bibliographic,blocking,but,by,campaigners,candidate,candidates,cannot,carrie,carries,center,centuries,change,cited,claim,claims,climate,climb,comes,common,comparable,compete,comprehensive,consciousness,consistently,containing,contested,corona,coronavirus,could,covid,current,daily,database,databases,deeper,description,designed,did,dizzying,dna,dominates,english,every,evidence,exhaustive,existence,expert,factual,fake,false,falsehood,faster,few,findings,finds,first,flies,focus,friday,from,gained,gathering,genetic,genetically,global,greater,growth,had,hand,has,have,hoax,hook,hyperbole,immune,in,inaccurate,information,injected,intelligence,interface,international,into,is,it,its,jealous,jonathan,just,kind,knowledge,larger,latest,leading,levels,limping,literature,made,madej,major,make,makes,making,many,massive,may,meaningless,media,metric,mice,million,modified,monday,more,moved,movement,much,multilingual,nanoparticle,network,new,news,not,of,on,once,one,online,only,opposed,or,organisms,osteopath,other,over,particles,penetrate,people,pew,phrase,popular,powerful,problem,produced,protective,protein,proved,published,racism,rate,reach,received,recent,recipients,recovered,referred,regularly,relating,rely,remarkable,represents,research,response,rise,rodents,rumor,rumors,science,scientific,searches,searching,she,silicon,simply,social,some,source,span,spike,spread,stories,story,study,swift,team,ten,terrorism,than,that,the,their,these,this,those,three,through,thursday,times,tiny,to,today,topic,traction,trials,triggering,truth,tweeted,twitter,unicorn,up,updated,us,users,vaccinated,vaccination,vaccine,vaccines,valley,varieties,video,virus,was,when,which,while,who,will,with,without,wrote,years
Query,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.640461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Document_1,0.0,0.0,0.0,0.157903,0.0,0.0,0.0,0.0,0.11404,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157903,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.093515,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157903,0.11404,0.11404,0.22808,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.11404,0.093515,0.0,0.11404,0.0,0.0,0.11404,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.067655,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.236854,0.078951,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.11404,0.0,0.0,0.0,0.22808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.22808,0.0,0.0,0.078951,0.0,0.11404,0.202966,0.135311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.11404,0.0,0.0,0.0,0.11404,0.093515,0.0,0.0,0.0,0.0,0.0,0.0,0.22808,0.11404,0.11404,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292129,0.0,0.0,0.11404,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11404,0.187029,0.0,0.0,0.0,0.0,0.0
Document_2,0.0,0.0,0.0,0.207336,0.08186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.199655,0.0,0.0,0.0,0.08186,0.0,0.199655,0.08186,0.0,0.099828,0.0,0.069112,0.0,0.099828,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.207336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08186,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.099828,0.0,0.0,0.08186,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.199655,0.0,0.0,0.0,0.16372,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.118447,0.059224,0.0,0.0,0.0,0.099828,0.0,0.08186,0.0,0.0,0.08186,0.0,0.099828,0.0,0.08186,0.0,0.0,0.0,0.099828,0.0,0.299483,0.099828,0.099828,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.099828,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.099828,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.099828,0.099828,0.0,0.207336,0.069112,0.255722,0.0,0.099828,0.0,0.099828,0.0,0.0,0.0,0.099828,0.099828,0.059224,0.0,0.0,0.0,0.0,0.099828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099828,0.0,0.276448,0.0,0.0,0.099828,0.0,0.099828,0.0,0.099828,0.08186,0.0,0.08186,0.0,0.16372,0.0,0.0,0.0
Document_3,0.0,0.0,0.0,0.0,0.0,0.118009,0.0,0.0,0.0,0.0,0.0,0.118009,0.0,0.0,0.0,0.143911,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.118009,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.118009,0.0,0.0,0.0,0.143911,0.143911,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099631,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099631,0.099631,0.0,0.143911,0.0,0.143911,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118009,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099631,0.099631,0.0,0.170753,0.0,0.0,0.143911,0.0,0.0,0.0,0.118009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.143911,0.118009,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.143911,0.143911,0.0,0.0,0.118009,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.118009,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118009,0.0,0.118009,0.0,0.0,0.0,0.143911,0.099631,0.0,0.294917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170753,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143911,0.0,0.0,0.0,0.118009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199262
Document_4,0.082005,0.082005,0.082005,0.0,0.0,0.067245,0.082005,0.082005,0.0,0.0,0.082005,0.067245,0.0,0.0,0.082005,0.0,0.0,0.067245,0.082005,0.340639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.113546,0.0,0.0,0.0,0.082005,0.0,0.0,0.0,0.082005,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.082005,0.0,0.082005,0.0,0.0,0.082005,0.0,0.082005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.082005,0.0,0.0,0.0,0.0,0.082005,0.082005,0.16401,0.0,0.0,0.082005,0.0,0.082005,0.056773,0.067245,0.16401,0.082005,0.0,0.0,0.16401,0.067245,0.082005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.0,0.082005,0.0,0.097301,0.0,0.0,0.0,0.0,0.0,0.0,0.067245,0.056773,0.17032,0.082005,0.0,0.082005,0.0,0.082005,0.0,0.0,0.0,0.0,0.0,0.082005,0.0,0.0,0.0,0.082005,0.0,0.0,0.0,0.0,0.082005,0.0,0.0,0.067245,0.082005,0.0,0.082005,0.0,0.0,0.134491,0.0,0.0,0.082005,0.0,0.0,0.082005,0.056773,0.113546,0.0,0.145951,0.04865,0.082005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.0,0.082005,0.067245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.0,0.0,0.082005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082005,0.082005,0.082005,0.0,0.0,0.0,0.0,0.0,0.082005,0.134491,0.082005,0.0,0.082005,0.0,0.082005,0.134491,0.082005,0.201736,0.082005,0.0,0.0,0.0,0.113546,0.056773,0.294094,0.0,0.0,0.0,0.0,0.082005,0.0,0.082005,0.0,0.0,0.04865,0.0,0.0,0.0,0.0,0.0,0.246015,0.082005,0.16401,0.0,0.0,0.0,0.0,0.082005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067245,0.0,0.0,0.0,0.0,0.0,0.067245,0.0,0.082005,0.056773
Document_5,0.0,0.0,0.0,0.06401,0.075817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092459,0.092459,0.0,0.0,0.0,0.075817,0.0,0.06401,0.0,0.092459,0.0,0.075817,0.092459,0.0,0.075817,0.0,0.0,0.0,0.0,0.0,0.0,0.06401,0.092459,0.0,0.0,0.0,0.184917,0.092459,0.0,0.0,0.075817,0.0,0.092459,0.277376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092459,0.0,0.0,0.184917,0.0,0.06401,0.0,0.0,0.0,0.0,0.0,0.0,0.092459,0.0,0.0,0.092459,0.0,0.0,0.0,0.092459,0.0,0.0,0.0,0.0,0.0,0.075817,0.0,0.0,0.0,0.0,0.0,0.075817,0.0,0.092459,0.0,0.0,0.092459,0.0,0.092459,0.092459,0.0,0.0,0.0,0.0,0.0,0.184917,0.092459,0.0,0.092459,0.0,0.0,0.054852,0.092459,0.092459,0.0,0.092459,0.092459,0.0,0.075817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075817,0.092459,0.0,0.075817,0.092459,0.092459,0.0,0.0,0.0,0.0,0.075817,0.0,0.0,0.0,0.092459,0.0,0.0,0.092459,0.092459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054852,0.0,0.0,0.092459,0.0,0.092459,0.0,0.092459,0.092459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075817,0.0,0.0,0.0,0.0,0.0,0.092459,0.0,0.0,0.0,0.0,0.0,0.075817,0.092459,0.0,0.0,0.0,0.092459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092459,0.0,0.0,0.075817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192031,0.189476,0.092459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27426,0.0,0.0,0.092459,0.092459,0.0,0.0,0.0,0.0,0.0,0.092459,0.0,0.184917,0.0,0.0,0.184917,0.06401,0.277376,0.0,0.0,0.184917,0.0,0.0,0.0,0.075817,0.0,0.0,0.184917,0.0,0.092459,0.0,0.06401


In [5]:
# Cosine similarity between the query (row 0) and every document (rows 1+).
# The result is a single-row frame labelled 'Similarity' with one column per
# document. Column labels are derived from the number of document rows
# instead of a hard-coded list, so a change in corpus size does not raise a
# shape-mismatch error, and `similarity` is bound once (to the DataFrame)
# rather than first to an array and then to a frame.
similarity = pd.DataFrame(
    cosine_similarity(transformed_weights[0], transformed_weights[1:]),
    index=['Similarity'],
    columns=[f'Document_{i}' for i in range(1, transformed_weights.shape[0])],
)

print('[Sort Similarities for queries in descending order]')
# Order the columns by the values in the 'Similarity' row, highest first
# (axis=1 sorts columns directly, replacing the transpose round-trip).
display(similarity.sort_values(by='Similarity', axis=1, ascending=False))

[Sort Similarities for queries in descending order]


Unnamed: 0,Document_2,Document_3,Document_4,Document_5,Document_1
Similarity,0.122577,0.088353,0.07552,0.028382,0.0
