This notebook walks through the simplest end-to-end (E2E) workflow for the FSM. The results can then be evaluated using the subjective, offline evaluation scale.

In [1]:
# enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# import needed external libraries
import os
import sys
import pandas as pd
import string, re

import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import sklearn

# Make the `src` package importable: add the directory two levels above the
# current working directory to the module search path, unless it is already there
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# import custom functions
from src.functions import prep

In [3]:
# load the comment and article-metadata CSV exports into dataframes
# (no index column is specified, so rows keep their default positional labels,
# which the later similarity look-ups via .loc rely on)
df_comments = pd.read_csv(filepath_or_buffer='../../src/data/csv_exports/q_comments.csv')
df_metadata = pd.read_csv(filepath_or_buffer='../../src/data/csv_exports/metadata.csv')

In [4]:
# pull the title of every article out of the metadata table
titles = df_metadata.loc[:, 'article_title']

In [5]:
# pull the text of every comment out of the comments table
q_comments = df_comments.loc[:, 'comment_text']

In [6]:
# define preprocessing steps
def pre_vec(q_list):
    """Preprocess each text in `q_list` into a single space-joined string.

    Every item is passed through `prep.tokenize`, `prep.lowercase` and
    `prep.stem`, in that order, and the resulting tokens are joined with
    single spaces so the output can be fed to a text vectorizer.
    Stop-word removal (`prep.rem_stop`) is currently not applied.

    Parameters
    ----------
    q_list : iterable
        Raw texts to preprocess (presumably strings; whatever
        `prep.tokenize` accepts).

    Returns
    -------
    list of str
        One preprocessed string per input item, in input order.
    """
    return [
        ' '.join(prep.stem(prep.lowercase(prep.tokenize(raw_text))))
        for raw_text in q_list
    ]

In [7]:
# run the preprocessing pipeline over the article titles, then the comment texts
titles_pre_vec = pre_vec(q_list=titles)
qc_pre_vec = pre_vec(q_list=q_comments)

In [8]:
# create a TF-IDF vectorizer with scikit-learn's default settings
tfidf = text.TfidfVectorizer()

In [9]:
# fit vectorizer to comments text, transform comments text
# (the vectorizer is fitted on the comments only, not on the titles)
qc_tfidf = tfidf.fit_transform(qc_pre_vec)

# transform article titles with the already-fitted vectorizer, so titles and
# comments share the same feature columns and can be compared directly
titles_tfidf = tfidf.transform(titles_pre_vec)

In [10]:
# cosine similarity between every comment and every article title:
# one row per comment, one column per article
sim_counts = pd.DataFrame(data=cosine_similarity(X=qc_tfidf, Y=titles_tfidf))

In [11]:
# For each article (a column of sim_counts) whose best-matching comment has a
# cosine similarity score > 0.8, print that score, the comment's row from
# df_comments and the article's row from df_metadata.
# The last value printed is the number of such articles.

count = 0

for i in sim_counts:
    scores_for_article = sim_counts[i]
    best_score = scores_for_article.max()

    # skip articles without a sufficiently similar comment
    if not best_score > 0.8:
        continue

    print(best_score)
    print(df_comments.loc[scores_for_article.idxmax()])
    print(df_metadata.loc[i], '\n\n')

    count += 1

print(count)

0.8048369462785332
Unnamed: 0                              24038
article_num                              4293
comment_id                              26575
commenter_name                       duckjibe
comment_published    2017-10-04T11:45:28.000Z
comment_text                         Hublot ?
comment_likes                               0
comment_flag                                0
parent_id                                   0
Name: 4983, dtype: object
article_id                                                         719
article_cat                                              Not Specified
article_title        The Hublot Atelier: The Hublot for when your o...
article_author                                            Felix Scholz
article_published                            2012-10-03 09:00:00-04:00
article_modified                             2017-02-20 21:01:21-05:00
comm_count                                                         -42
word_count                                  