In [8]:
import pprint
from gensim import corpora
from gensim import models
from gensim import similarities
import pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
# CSV export of the Reddit askscience posts that serves as the corpus
file = "data/2019-11-02_reddit-data-askscience_scrubbed.csv"

# NLTK's English stop-word list; these words are filtered out of every
# document and every query before vectorisation
stop_words = stopwords.words("english")

In [37]:
def convert_csv_to_list(file):
    """Load the posts CSV and turn its "combined" column into token lists.

    Every post is tokenized with ``word_tokenize``, lower-cased, stripped of
    tokens that are not purely alphabetic, and filtered against the
    module-level ``stop_words`` collection.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts; the CSV must contain a
        "combined" column holding the text of each post.

    Returns
    -------
    list of list of str
        One token list per CSV row, in row order. A row whose "combined"
        cell is missing produces an empty list, so positions in the result
        keep lining up with row positions in the file.

    Raises
    ------
    KeyError
        If the CSV has no "combined" column.
    """
    posts_df = pd.read_csv(file)

    # Empty cells come back from read_csv as float NaN, which word_tokenize
    # cannot process. Map them to "" rather than dropping the row: document
    # numbers reported by the similarity index are positions in this list.
    corpus_text = [
        "" if pd.isna(value) else str(value)
        for value in posts_df["combined"].tolist()
    ]

    # Built once up front: testing membership against the original list is a
    # linear scan repeated for every token of every post.
    stop_word_set = set(stop_words)

    processed_corpus = []
    for text in corpus_text:
        lowered = (token.lower() for token in word_tokenize(text))
        processed_corpus.append(
            [
                token
                for token in lowered
                if token.isalpha() and token not in stop_word_set
            ]
        )

    return processed_corpus

In [38]:
def train_model(corpus):
    """Fit a TF-IDF model on a tokenized corpus and index it for similarity.

    Parameters
    ----------
    corpus : list of list of str
        One list of tokens per document.

    Returns
    -------
    tuple
        ``(index, dictionary, tfidf)``: the ``SparseMatrixSimilarity`` index
        over the TF-IDF-weighted corpus, the ``corpora.Dictionary`` built
        from ``corpus``, and the fitted ``models.TfidfModel``.
    """
    # Vocabulary built from the corpus; its size is the feature dimension
    vocabulary = corpora.Dictionary(corpus)
    feature_count = len(vocabulary)

    # Bag-of-words representation of every document
    bow_documents = list(map(vocabulary.doc2bow, corpus))

    # Fit TF-IDF on the bag-of-words corpus
    tfidf = models.TfidfModel(bow_documents)

    # Similarity index over the TF-IDF-weighted documents
    weighted_documents = tfidf[bow_documents]
    similarity_index = similarities.SparseMatrixSimilarity(
        weighted_documents, num_features=feature_count
    )

    return similarity_index, vocabulary, tfidf

In [39]:
def test_model(test_title, index, dictionary, model):   
    # tokenize words and convert to lower case
    tokenized_list = word_tokenize(test_title)

    # convert to lower case
    tokenized_list = [w.lower() for w in tokenized_list]

    # get the alphabetic words
    words = [word for word in tokenized_list if word.isalpha()]

    #get rid of stop words 
    words = [w for w in words if not w in stop_words]

    # bag of words representation of the query
    query_bow = dictionary.doc2bow(words)
    
    # create the similarities scores
    sims_reddit = index[model[query_bow]]

    # put scores in a dict
    sim_scores = dict()
    for document_number, score in sorted(enumerate(sims_reddit)):
        sim_scores[document_number] = score
    # sort the scores
    sorted_sim_scores = sorted(sim_scores.items(), key=lambda kv: kv[1], reverse = True)
    
    
    return sorted_sim_scores[:10]

## Test the functions on a sample query

In [40]:
# Sample free-text query used to exercise the similarity search below
input_str = "scientist working oxygen"

In [41]:
# Tokenized corpus: one list of cleaned tokens per row of the CSV
list1 = convert_csv_to_list(file)

In [42]:
# Build the similarity index, the dictionary and the TF-IDF model from the corpus
index, dictionary, model = train_model(list1)

In [43]:
# Show the best-matching documents as (document_number, score) pairs, highest score first
test_model(input_str, index, dictionary, model)

[(806, 0.3506022),
 (243, 0.2874382),
 (267, 0.27009386),
 (273, 0.26514146),
 (8, 0.23303704),
 (872, 0.174325),
 (945, 0.15344457),
 (50, 0.13729197),
 (638, 0.1320878),
 (129, 0.12332527)]