In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora, models, similarities
import numpy as np
import ast

In [2]:
## Load the claim/snippet pairs, keeping only the two identifiers and the two
## content columns; the content columns are renamed to *_bow for the next steps.

df_bow = (
    pd.read_csv('data/02_bow_snippets_claims.csv')
    .loc[:, ['snippet_id', 'claim_id', 'snippet_content', 'claim_content']]
    .rename(columns={'snippet_content': 'snippet_bow', 'claim_content': 'claim_bow'})
)

In [3]:
## The term lists were stored as strings in the CSV: parse them back into
## Python lists, for the snippets and for the claims.

df_bow['snippet_bow'] = df_bow['snippet_bow'].apply(ast.literal_eval)
df_bow['claim_bow'] = df_bow['claim_bow'].apply(ast.literal_eval)

   snippet_id                                        snippet_bow
0         968  [past, week, republican, members_congress, nom...
1         969  [european_union, long_criticized, east_europea...
   claim_id                                     claim_bow
0         9        [taxes, spent, so_called, 'war_terror]
1        18  [says, north_korea, agreed_denuclearization]


In [16]:
## Load the chosen dictionary and use it to convert every list of terms
## (snippets and claims) into the dictionary's bag-of-words representation.

dictionary = corpora.Dictionary.load('dictionaries/02_30.0.dict')

df_bow['snippet_bow'] = df_bow['snippet_bow'].apply(dictionary.doc2bow)
df_bow['claim_bow'] = df_bow['claim_bow'].apply(dictionary.doc2bow)

In [17]:
## Load the chosen topic model and run it on the bag-of-words vectors of the
## snippets and of the claims; each cell then holds the model's output for that document.

lda = models.LdaModel.load('topic_models/03_30.0_200.lda')

df_bow['snippet_bow'] = df_bow['snippet_bow'].apply(lambda bow: lda[bow])
df_bow['claim_bow'] = df_bow['claim_bow'].apply(lambda bow: lda[bow])

In [31]:
## For each claim/snippet pair, compute the similarity between the claim's and
## the snippet's topic vectors, using one gensim MatrixSimilarity index per claim
## (the index is built from all snippets attached to that claim).
##
## Changes compared with the previous version of this cell:
##  - rows are selected with a mask computed on df_bow itself; the old mask used
##    df_merge, which is only created at the end of this cell, so the cell could
##    not run on a fresh kernel;
##  - Series.as_matrix() (removed in pandas 1.0) is replaced by Series.tolist();
##  - every score is recorded together with its (claim_id, snippet_id) pair and
##    merged back on both keys, instead of relying on two independently sorted
##    lists and a merge on snippet_id alone;
##  - when the snippets of a claim carry no topic at all, or the claim has no
##    topic left after filtering, the score is set to 0.0 instead of failing.
##
## NOTE(review): claim topics whose id is above the largest snippet topic id are
## dropped before the query reaches the index. If the index normalises the query
## (believed to be gensim's default - to be confirmed), this raises the score
## compared with a cosine over all topics. Passing num_features=lda.num_topics to
## MatrixSimilarity would make the filter unnecessary, but it changes the scores,
## so confirm before switching.
## NOTE(review): (claim_id, snippet_id) pairs are assumed to be unique in df_bow.

similarity_records = []

# Claims and snippets are visited in descending id order, as before, so that
# df_similarities keeps the same row order.
for claim_id in sorted(df_bow['claim_id'].unique(), reverse=True):
    df_ = (
        df_bow.loc[df_bow['claim_id'] == claim_id, ['snippet_id', 'snippet_bow', 'claim_bow']]
        .sort_values(by='snippet_id', ascending=False)
    )
    # The first row's claim vector is used for the whole group.
    claim_bow = df_['claim_bow'].iloc[0]
    snippet_ids = df_['snippet_id'].tolist()
    snippet_bow = df_['snippet_bow'].tolist()

    # Topic ids present in at least one snippet of this claim.
    topic_ids = [x[0] for lst in snippet_bow for x in lst]
    max_ = max(topic_ids) if topic_ids else -1
    # Restrict the claim vector to the topic ids the index can represent.
    claim_query = [x for x in claim_bow if x[0] <= max_]

    if topic_ids and claim_query:
        lda_index = similarities.MatrixSimilarity(snippet_bow)
        sims_lda = lda_index[claim_query]
    else:
        # Nothing to compare: no snippet topic, or no usable claim topic.
        sims_lda = np.zeros(len(snippet_bow), dtype=np.float32)

    for snippet_id, score in zip(snippet_ids, sims_lda):
        similarity_records.append(
            {'claim_id': claim_id, 'snippet_id': snippet_id, 'similarity_score': score}
        )

df_similarities = pd.DataFrame(
    similarity_records, columns=['claim_id', 'snippet_id', 'similarity_score']
)

df_merge = pd.merge(df_bow, df_similarities, on=['claim_id', 'snippet_id'])

In [34]:
## Keep only the pairs whose similarity score is at least 0.65, so that the
## retained snippets are relevant to their claim.

df_filter = df_merge.loc[df_merge['similarity_score'].ge(0.65)]

Unnamed: 0,claim_id,snippet_id,snippet_bow,claim_bow,similarity_score
72,9,1414,"[(46, 0.09136364), (51, 0.09136364), (52, 0.09...","[(91, 0.25125), (108, 0.25125), (115, 0.251250...",0.738715
265,18,2517,"[(84, 0.30574563), (166, 0.088644795), (171, 0...","[(176, 0.66833335)]",0.662764


In [38]:
## Load the full dataset and join it with the selected snippets, which adds
## the similarity score to the rows of the snippets that passed the filter.

df_snippets_2 = pd.read_csv('data/00_dataset.csv')
df_final = df_snippets_2.merge(df_filter.loc[:, ['snippet_id', 'similarity_score']])

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,claim_id,snippet_content,snippet_date,snippet_id,snippet_pagenum,snippet_title,snippet_url,claim_content,claim_date,claim_label,claim_tag,claim_url,similarity_score
0,72,1414,9,The swamp has declared war on ... comes to how...,17639,1414,6,Nancy Pelosi Archives - Empower Texans,https://empowertexans.com/tag/nancy-pelosi/,Most of our taxes are spent on the so-called ‘...,17645,False,— PolitiFact National,/truth-o-meter/statements/2018/apr/24/ro-khann...,0.738715
1,265,2517,18,Trump Says North Korea Agreed to Denuclearize....,17639,2517,1,Navy Releases Service Details for Hero …,https://www.military.com/daily-news/2018/04/18...,Says North Korea has agreed to denuclearization.,17643,False,— PolitiFact National,/truth-o-meter/statements/2018/apr/22/donald-t...,0.662764


In [44]:
## Write the selected snippets, their claim data and the similarity score to a
## CSV file (listed columns only, without the index).

df_final.to_csv(
    'data/06_relevent_snippets.csv',
    columns=[
        'claim_id', 'snippet_content', 'snippet_date', 'snippet_id',
        'snippet_pagenum', 'snippet_title', 'snippet_url',
        'claim_content', 'claim_date', 'claim_label',
        'claim_tag', 'claim_url', 'similarity_score',
    ],
    index=False,
)