In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora, models, similarities
import numpy as np
import ast

In [2]:
# Load the pre-tokenised bag-of-words tables for snippets and claims.
df_bow_snippets = pd.read_csv('datasets/bag_of_word_snippets.csv')
# Claims are collapsed to one row per claim_id via groupby().first().
df_bow_claims = (
    pd.read_csv('datasets/bag_of_word_claims.csv')
    .groupby('claim_id')
    .first()
    .reset_index()
)
# Quick look at the first two rows of each table.
print(df_bow_snippets.head(2))
print(df_bow_claims.head(2))

   Unnamed: 0  snippet_id                                    snippet_content
0           0         968  ['past', 'week', 'republican', 'members_congre...
1           1         969  ['european_union', 'long_criticized', 'east_eu...
   claim_id  Unnamed: 0                                      claim_content
0         9           0     ['taxes', 'spent', 'so_called', "'war_terror"]
1        18         134  ['says', 'north_korea', 'agreed_denuclearizati...


In [3]:
def _parse_token_list(cell):
    """Convert a stringified token list read from CSV back into a Python list.

    Values that are already lists are returned unchanged, so re-running this
    cell does not fail on already-parsed columns. Anything else is handed to
    ``ast.literal_eval`` and raises exactly as before if it is not a valid
    literal.
    """
    if isinstance(cell, list):
        return cell
    return ast.literal_eval(cell)


# The CSVs store each token list as its repr (e.g. "['past', 'week', ...]").
df_bow_snippets['snippet_content'] = df_bow_snippets['snippet_content'].apply(_parse_token_list)
df_bow_claims['claim_content'] = df_bow_claims['claim_content'].apply(_parse_token_list)

In [4]:
# Read the labelled (claim, snippet) relevance pairs and preview the first two rows.
df_groundtruth = pd.read_csv(
    'datasets/relevance_discovery_groundtruth.csv',
)
df_groundtruth.head(n=2)

Unnamed: 0.1,Unnamed: 0,snippet_id,groundtruth_label,claim_id,snippet_date,claim_content,snippet_content
0,0,887759,True,4229,16577,We spend more money on lobbying than we do on ...,22/05/2015 · Will London's mayor put the brake...
1,1,887760,False,4229,16577,We spend more money on lobbying than we do on ...,22/05/2015 · ... Facebook for First Draft upda...


In [5]:
# Keep only the id and token columns, renaming the token columns to *_bow.
df_bow_snippets = df_bow_snippets[['snippet_id', 'snippet_content']].rename(
    columns={'snippet_content': 'snippet_bow'}
)
df_bow_claims = df_bow_claims[['claim_id', 'claim_content']].rename(
    columns={'claim_content': 'claim_bow'}
)

# From the ground truth only the two ids and the label are kept.
df_groundtruth = df_groundtruth.loc[:, ['claim_id', 'snippet_id', 'groundtruth_label']]

In [6]:
# Attach the snippet tokens (shared column: snippet_id), then the claim tokens
# (shared column: claim_id). Both are inner joins, as with a bare pd.merge.
df_merge = (
    df_groundtruth
    .merge(df_bow_snippets, on='snippet_id')
    .merge(df_bow_claims, on='claim_id')
)
print(len(df_merge))
df_merge.head(2)

534


Unnamed: 0,claim_id,snippet_id,groundtruth_label,snippet_bow,claim_bow
0,4229,887759,True,"[london, mayor, put, brakes, better, ways, spo...","[spend, money, lobbying, campaigns]"
1,4229,887760,False,"[facebook, first, draft, updates, liberals, sp...","[spend, money, lobbying, campaigns]"


In [9]:
# For every vocabulary size (largest first) add four columns to df_merge:
# snippet/claim bag-of-words vectors built with that size's dictionary, and
# their TF-IDF weighted versions.
for nb_word in np.arange(10000, 40000, 5000)[::-1]:
    # Suffix shared by the dictionary file name and every derived column.
    # The division is intentionally left as written so the file and column
    # names stay exactly what the rest of the notebook (and the files on disk) expect.
    size_tag = str(nb_word/1000)
    snippet_bow_col = 'snippet_bow_'+size_tag
    claim_bow_col = 'claim_bow_'+size_tag

    dictionary = corpora.Dictionary.load('dictionaries/factchecking_'+size_tag+'.dict')
    #tfidf = models.TfidfModel.load('topic_models/'+str(nb_word/1000)+'.tfidf')

    df_merge[snippet_bow_col] = df_merge['snippet_bow'].apply(dictionary.doc2bow)
    # The TF-IDF model is fitted on the snippet vectors present in df_merge.
    # Series.tolist() replaces Series.as_matrix(), which no longer exists in pandas >= 1.0.
    tfidf = models.TfidfModel(df_merge[snippet_bow_col].tolist())

    df_merge[claim_bow_col] = df_merge['claim_bow'].apply(dictionary.doc2bow)
    df_merge['claim_tfidf_'+size_tag] = df_merge[claim_bow_col].apply(lambda bow: tfidf[bow])

    df_merge['snippet_tfidf_'+size_tag] = df_merge[snippet_bow_col].apply(lambda bow: tfidf[bow])
    # NOTE(review): the string below is disabled topic-model code kept for reference.
    # If it is re-enabled, the '.lda' files are loaded through models.RpModel, which
    # looks like a copy/paste slip (presumably models.LdaModel) -- confirm before use.
    """
    for nb_topic in np.arange(100,350,50):
        
        lsi = models.LsiModel.load('topic_models/'+str(nb_word/1000)+'_'+str(nb_topic)+'.lsi')
        rp = models.RpModel.load('topic_models/'+str(nb_word/1000)+'_'+str(nb_topic)+'.rp')
        lda = models.RpModel.load('topic_models/'+str(nb_word/1000)+'_'+str(nb_topic)+'.lda')
        
        df_merge['claim_lsi_'+str(nb_word/1000)+'_'+str(nb_topic)] = \
        df_merge['claim_tfidf_'+str(nb_word/1000)].apply(lambda x: lsi[x])
        df_merge['claim_rp_'+str(nb_word/1000)+'_'+str(nb_topic)] = \
        df_merge['claim_bow_'+str(nb_word/1000)].apply(lambda x: rp[x])
        df_merge['claim_lda_'+str(nb_word/1000)+'_'+str(nb_topic)] = \
        df_merge['claim_bow_'+str(nb_word/1000)].apply(lambda x: lda[x])
        
        df_merge['snippet_lsi_'+str(nb_word/1000)+'_'+str(nb_topic)] = \
        df_merge['snippet_tfidf_'+str(nb_word/1000)].apply(lambda x: lsi[x])
        df_merge['snippet_rp_'+str(nb_word/1000)+'_'+str(nb_topic)] = \
        df_merge['snippet_bow_'+str(nb_word/1000)].apply(lambda x: rp[x])
        df_merge['snippet_lda_'+str(nb_word/1000)+'_'+str(nb_topic)] = \
        df_merge['snippet_bow_'+str(nb_word/1000)].apply(lambda x: lda[x])
    """    

In [14]:
# Distinct claim ids, largest first.
claim_ids = sorted(df_merge['claim_id'].unique(), reverse=True)
print(claim_ids)

[14203, 13683, 11160, 5779, 4338, 4229, 1085, 292, 187, 121]


In [15]:
# Column store for the final similarity table. 'snippet_id' is listed in
# (claim_id, snippet_id) descending order.
# Series.tolist() replaces Series.as_matrix(), which no longer exists in pandas >= 1.0.
dic_similarities = {
    'snippet_id': (
        df_merge[['claim_id','snippet_id']]
        .sort_values(by=['claim_id','snippet_id'], ascending=False)['snippet_id']
        .tolist()
    )
}



In [16]:
# For every vocabulary size, score each snippet against its claim twice:
# once on raw bag-of-words vectors and once on TF-IDF vectors. Scores are
# appended claim by claim (claim_ids order), snippets in descending snippet_id.
for nb_word in np.arange(10000, 40000, 5000)[::-1]:
    # Same suffix as used when the per-size columns were created; the division
    # is left as written so the column names match.
    size_tag = str(nb_word/1000)
    # NOTE(review): the string below is disabled topic-model code kept for reference.
    # If it is re-enabled, the '.lda' files are loaded through models.RpModel, which
    # looks like a copy/paste slip (presumably models.LdaModel), and it still calls
    # the removed Series.as_matrix() -- confirm/fix before use.
    """
    for nb_topic in np.arange(100,350,50):
        lsi = models.LsiModel.load('topic_models/'+str(nb_word/1000)+'_'+str(nb_topic)+'.lsi')
        rp = models.RpModel.load('topic_models/'+str(nb_word/1000)+'_'+str(nb_topic)+'.rp')
        lda = models.RpModel.load('topic_models/'+str(nb_word/1000)+'_'+str(nb_topic)+'.lda')
    
        dic_similarities['lsi'+str(nb_word/1000)+'_'+str(nb_topic)] = []
        dic_similarities['rp'+str(nb_word/1000)+'_'+str(nb_topic)] = []
        dic_similarities['lda'+str(nb_word/1000)+'_'+str(nb_topic)] = []
        print("Beginning nb topic = "+str(nb_topic))
        for claim_id in claim_ids:
            df_ = df_merge[df_merge['claim_id']==claim_id][['snippet_id','snippet_bow_'+str(nb_word/1000),
                                                                        'snippet_tfidf_'+str(nb_word/1000),
                                                                    'claim_bow_'+str(nb_word/1000),
                                                                        'claim_tfidf_'+str(nb_word/1000)]]\
            .sort_values(by=['snippet_id'],ascending=False)
            claim_bow = list(df_['claim_bow_'+str(nb_word/1000)].as_matrix())[0]
            claim_tfidf = list(df_['claim_tfidf_'+str(nb_word/1000)].as_matrix())[0]
        
            snippet_bow = list(df_['snippet_bow_'+str(nb_word/1000)].as_matrix())
            snippet_tfidf = list(df_['snippet_tfidf_'+str(nb_word/1000)].as_matrix())
        
            lsi_index = similarities.MatrixSimilarity(lsi[snippet_tfidf])
            rp_index = similarities.MatrixSimilarity(rp[snippet_bow])
            lda_index = similarities.MatrixSimilarity(lda[snippet_bow])
        
            sims_lsi = lsi_index[lsi[claim_tfidf]]
            sims_rp = rp_index[rp[claim_bow]]
            sims_lda = lda_index[lda[claim_bow]]
        
            for i in range(0,len(snippet_bow)):
                dic_similarities['lsi'+str(nb_word/1000)+'_'+str(nb_topic)].append(sims_lsi[i])
                dic_similarities['rp'+str(nb_word/1000)+'_'+str(nb_topic)].append(sims_rp[i])
                dic_similarities['lda'+str(nb_word/1000)+'_'+str(nb_topic)].append(sims_lda[i])
    """
    snippet_bow_col = 'snippet_bow_'+size_tag
    snippet_tfidf_col = 'snippet_tfidf_'+size_tag
    claim_bow_col = 'claim_bow_'+size_tag
    claim_tfidf_col = 'claim_tfidf_'+size_tag

    dic_similarities['bow'+size_tag] = []
    dic_similarities['tfidf'+size_tag] = []
    for claim_id in claim_ids:
        # Rows of this claim only, snippets in descending snippet_id order.
        df_ = (
            df_merge.loc[
                df_merge['claim_id']==claim_id,
                ['snippet_id', snippet_bow_col, snippet_tfidf_col, claim_bow_col, claim_tfidf_col],
            ]
            .sort_values(by=['snippet_id'],ascending=False)
        )
        # The claim vector is taken from the first row of the group.
        claim_bow = df_[claim_bow_col].iloc[0]
        claim_tfidf = df_[claim_tfidf_col].iloc[0]
        # Series.tolist() replaces Series.as_matrix(), which no longer exists in pandas >= 1.0.
        snippet_bow = df_[snippet_bow_col].tolist()
        snippet_tfidf = df_[snippet_tfidf_col].tolist()

        # NOTE(review): no num_features is passed, so gensim sizes each index from the
        # snippet vectors alone -- confirm a claim can never contain a term id beyond that.
        index_bow = similarities.MatrixSimilarity(snippet_bow)
        index_tfidf = similarities.MatrixSimilarity(snippet_tfidf)
        sims_bow = index_bow[claim_bow]
        sims_tfidf = index_tfidf[claim_tfidf]

        # One score per snippet, in the same order as the rows of df_.
        dic_similarities['bow'+size_tag].extend(sims_bow[:len(snippet_bow)])
        dic_similarities['tfidf'+size_tag].extend(sims_tfidf[:len(snippet_bow)])

df_similarities = pd.DataFrame(dic_similarities)

In [17]:
# Join the ground-truth labels onto the similarity scores (shared column:
# snippet_id) and write the combined table to disk.
df_results_final = (
    df_merge[['snippet_id','groundtruth_label']]
    .merge(df_similarities, on='snippet_id')
)
df_results_final.to_csv('datasets/similarity_results_2.csv')