In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora
from gensim import models
import numpy as np
import ast

In [2]:
## Load the bag-of-words documents table produced by the previous step

df_doc = pd.read_csv(filepath_or_buffer='data/09_bow_documents.csv')

Index(['Unnamed: 0', 'claim_id', 'document_content', 'similarity_score',
       'claim_label', 'claim_content'],
      dtype='object')

In [3]:
## Both content columns hold string representations of term lists;
## parse each string back into a Python list

for term_column in ('document_content', 'claim_content'):
    df_doc[term_column] = df_doc[term_column].apply(ast.literal_eval)

In [4]:
## for each lexicon, we transform lists of terms into lists of indexes
## we build tfidf models that we export

def _export_tfidf(content_col, col_prefix, file_prefix, lexicon_sizes):
    """Add bow / tf-idf columns to ``df_doc`` and save one TfidfModel per lexicon.

    For every lexicon size (processed from largest to smallest) this loads
    ``dictionaries/09_<file_prefix>_<size>.dict``, converts the term lists in
    ``df_doc[content_col]`` to bag-of-words vectors, fits a TfidfModel on
    them, stores both representations as new ``df_doc`` columns and writes
    the model to ``topic_models/10_<file_prefix>_<size>.tfidf``.

    Parameters
    ----------
    content_col : str
        Column of ``df_doc`` holding the lists of terms.
    col_prefix : str
        Prefix of the created columns (``<col_prefix>_bow_<size>`` and
        ``<col_prefix>_tfidf_<size>``).
    file_prefix : str
        Tag used in the dictionary and model file names.
    lexicon_sizes : iterable of int
        Lexicon sizes; ``str(size/1000)`` is the suffix used in column and
        file names.
    """
    for nb_word in sorted(list(lexicon_sizes), reverse=True):
        # Same suffix expression as the rest of the pipeline, so column and
        # file names stay compatible with the saved dictionaries.
        size_key = str(nb_word/1000)
        bow_col = col_prefix+'_bow_'+size_key
        tfidf_col = col_prefix+'_tfidf_'+size_key

        lexicon = corpora.Dictionary.load('dictionaries/09_'+file_prefix+'_'+size_key+'.dict')
        df_doc[bow_col] = df_doc[content_col].apply(lexicon.doc2bow)

        # Series.as_matrix() was removed in pandas 1.0; tolist() yields the
        # same list of bag-of-words vectors.
        tfidf = models.TfidfModel(df_doc[bow_col].tolist())
        df_doc[tfidf_col] = df_doc[bow_col].apply(lambda bow: tfidf[bow])
        tfidf.save('topic_models/10_'+file_prefix+'_'+size_key+'.tfidf')


_export_tfidf('document_content', 'document', 'doc', np.arange(500,20000,500))
_export_tfidf('claim_content', 'claim', 'claim', np.arange(60,186,10))

In [5]:
## for each lexicon, for each number of topics,
## for each topic modeling technique, we build models
## and export these models

# Fixed seed so that re-running the cell can rebuild the same models.
RANDOM_SEED = 0


def _export_topic_models(col_prefix, file_prefix, lexicon_sizes, topic_counts,
                         seed=RANDOM_SEED):
    """Fit and save LSI, RP and LDA models for every lexicon / topic count.

    LSI is fitted on the ``<col_prefix>_tfidf_<size>`` column of ``df_doc``;
    RP and LDA are fitted on the ``<col_prefix>_bow_<size>`` column. Each
    model is written to
    ``topic_models/10_<file_prefix>_<size>_<nb_topic>.<lsi|rp|lda>``.

    Parameters
    ----------
    col_prefix : str
        Prefix of the bow / tf-idf columns to read from ``df_doc``.
    file_prefix : str
        Tag used in the dictionary and model file names.
    lexicon_sizes : iterable of int
        Lexicon sizes, processed from largest to smallest;
        ``str(size/1000)`` is the suffix used in column and file names.
    topic_counts : iterable of int
        Numbers of topics to fit for each lexicon.
    seed : int
        Seed passed to LdaModel and used to seed NumPy's global RNG.
    """
    # RpModel is given no seed argument here, so NumPy's global RNG is seeded too.
    # NOTE(review): confirm the installed gensim draws RP/LSI randomness
    # from np.random; otherwise only LDA is made repeatable.
    np.random.seed(seed)

    for nb_word in sorted(list(lexicon_sizes), reverse=True):
        size_key = str(nb_word/1000)
        dictionary = corpora.Dictionary.load('dictionaries/09_'+file_prefix+'_'+size_key+'.dict')

        # The corpora only depend on the lexicon, so build them once per
        # lexicon instead of once per model. Series.as_matrix() was removed
        # in pandas 1.0; tolist() yields the same list of vectors.
        bow_corpus = df_doc[col_prefix+'_bow_'+size_key].tolist()
        tfidf_corpus = df_doc[col_prefix+'_tfidf_'+size_key].tolist()
        model_stem = 'topic_models/10_'+file_prefix+'_'+size_key+'_'

        for nb_topic in topic_counts:
            lsi = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=nb_topic)
            lsi.save(model_stem+str(nb_topic)+'.lsi')

            rp = models.RpModel(bow_corpus, num_topics=nb_topic)
            rp.save(model_stem+str(nb_topic)+'.rp')

            lda = models.LdaModel(bow_corpus, id2word=dictionary,
                                  num_topics=nb_topic, random_state=seed)
            lda.save(model_stem+str(nb_topic)+'.lda')


_export_topic_models('document', 'doc', np.arange(500,20000,500), np.arange(20,310,20))
_export_topic_models('claim', 'claim', np.arange(60,186,10), np.arange(10,60,10))

In [7]:
## Write the frame, including the bow / tf-idf columns added above, to disk

df_doc.to_csv(path_or_buf='data/10_tfidf.csv')