In [1]:
%load_ext autoreload
%autoreload 2

# import needed external libraries
import os
import sys
import pandas as pd
import string, re

import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import sklearn

# Make the directory two levels above this notebook importable, so that
# `from src... import ...` resolves.
two_levels_up = os.path.join(os.pardir, os.pardir)
module_path = os.path.abspath(two_levels_up)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.functions import prep

### Import Data

In [3]:
# Load the CSV exports. The paths are relative, so they resolve against the
# kernel's working directory.
# NOTE(review): rows of df_comments show an `Unnamed: 0` column, presumably an
# index that was saved into q_comments.csv — confirm before relying on it.
# NOTE(review): df_trivia and df_brands are not referenced by any cell shown in
# this notebook view — confirm whether they are still needed.
df_comments = pd.read_csv('../../src/data/csv_exports/q_comments.csv')

df_trivia = pd.read_csv('../../src/data/csv_exports/trivia.csv')

df_metadata = pd.read_csv('../../src/data/csv_exports/metadata.csv')

df_brands = pd.read_csv('../../src/data/csv_exports/brands.csv')

In [4]:
# Article titles from the metadata export; these are preprocessed and vectorized further down.
titles = df_metadata['article_title']

In [5]:
# Text of each comment loaded from q_comments.csv.
q_comments = df_comments['comment_text']

In [None]:
# Commented-out export: writes the comments whose text contains a '?' to q_comments.csv.
# df_comments[df_comments['comment_text'].str.contains('\?')].to_csv('../../src/data/csv_exports/q_comments.csv')

In [None]:
1. Tokenize
2. Set to lowercase
3. Remove stop words
4. Stem
5. Vectorizer fit to trivia questions
6. Transform question-comments using vectorizer
7. Transform titles using vectorizer
8. Calculate cosine similarity for question-comments and a. trivia questions; b. article titles
9. Evaluate pairs with perfect similarity scores

## Prep to Vectorize

In [68]:
def pre_vec(q_list, remove_stopwords=False):
    """Turn raw texts into space-joined, stemmed token strings for a vectorizer.

    Each item is passed through ``prep.tokenize``, ``prep.lowercase`` and
    ``prep.stem`` in that order, and the resulting tokens are joined with a
    single space.

    Parameters
    ----------
    q_list : iterable
        Texts to process; each item is handed to ``prep.tokenize`` as-is.
    remove_stopwords : bool, default False
        When True, ``prep.rem_stop`` is applied after lowercasing and before
        stemming. It is off by default, so calling ``pre_vec(q_list)`` skips
        stop-word removal entirely.

    Returns
    -------
    list of str
        One processed string per input item, in input order.
    """
    # Named distinctly from the function so the local does not shadow `pre_vec`.
    processed = []

    for q in q_list:
        tokens = prep.lowercase(prep.tokenize(q))
        if remove_stopwords:
            tokens = prep.rem_stop(tokens)
        processed.append(' '.join(prep.stem(tokens)))

    return processed

In [69]:
# Run titles and comments through the same preprocessing so their tokens are comparable.
titles_pre_vec = pre_vec(titles)
qc_pre_vec = pre_vec(q_comments)

## Vectorize

### TF-IDF

In [70]:
# Built via the imported `text` module, the same way the CountVectorizer in this notebook is.
tfidf = text.TfidfVectorizer()

In [71]:
# Learn the TF-IDF vocabulary and weights from the comments only, then map the
# titles onto that same vocabulary. Title terms that never occur in a comment
# are dropped by transform().
qc_tfidf = tfidf.fit_transform(qc_pre_vec)

titles_tfidf = tfidf.transform(titles_pre_vec)

In [72]:
# Inspect the fitted term -> column-index mapping: number of terms and container type.
vocab_tfidf = tfidf.vocabulary_
print(len(vocab_tfidf), type(vocab_tfidf))

20105 <class 'dict'>


### Counts

In [73]:
# Raw term-count vectorizer with default settings.
counts = text.CountVectorizer()

In [74]:
# Vocabulary is learned from the comments only; titles are mapped onto it.
qc_counts = counts.fit_transform(qc_pre_vec)

titles_counts = counts.transform(titles_pre_vec)

In [75]:
# Inspect the fitted term -> column-index mapping: number of terms and container type.
vocab = counts.vocabulary_
print(len(vocab), type(vocab))

20105 <class 'dict'>


### Word2Vec

## Cosine Similarities

In [76]:
# Pairwise cosine similarity: one row per comment, one column per title.
# NOTE(review): despite the name `sim_counts`, this is computed from the TF-IDF
# matrices; qc_counts / titles_counts are not used here — confirm which
# representation is intended.
sim_counts = pd.DataFrame(cosine_similarity(qc_tfidf, titles_tfidf))

In [31]:
# Summary statistics of the similarity scores, one column per title.
sim_counts.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6491,6492,6493,6494,6495,6496,6497,6498,6499,6500
count,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,...,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0,25640.0
mean,0.010701,0.009367,0.004046,0.008931,0.004237,0.004214,0.004676,0.006201,0.012577,0.003802,...,0.007945,0.005524,0.006441,0.002857,0.004846,0.001029,0.003441,0.000733,0.010069,0.000633
std,0.028899,0.021999,0.016515,0.027461,0.02312,0.015202,0.01566,0.020567,0.030597,0.017281,...,0.020198,0.018077,0.025199,0.017155,0.018722,0.011582,0.014452,0.014012,0.03895,0.007694
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.389429,0.453517,0.399001,0.551729,0.524797,0.368106,0.405661,0.294826,0.465648,0.498994,...,0.380016,0.308118,0.474864,0.612766,0.452153,0.522702,0.34907,1.0,0.586449,0.242491


In [80]:
count = 0

# Walk the similarity matrix one title (column) at a time and report every
# title whose best-matching comment scores above 0.8.
for title_idx, scores in sim_counts.items():
    best_score = scores.max()
    if best_score > 0.8:
        print(best_score)
        print(df_comments.loc[scores.idxmax()])
        print(df_metadata.loc[title_idx], '\n\n')

        count += 1

print(count)

0.8048369462785332
Unnamed: 0                              24038
article_num                              4293
comment_id                              26575
commenter_name                       duckjibe
comment_published    2017-10-04T11:45:28.000Z
comment_text                         Hublot ?
comment_likes                               0
comment_flag                                0
parent_id                                   0
Name: 4983, dtype: object
article_id                                                         719
article_cat                                              Not Specified
article_title        The Hublot Atelier: The Hublot for when your o...
article_author                                            Felix Scholz
article_published                            2012-10-03 09:00:00-04:00
article_modified                             2017-02-20 21:01:21-05:00
comm_count                                                         -42
word_count                                  

In [82]:
# Spot-check a single title by its index label.
titles[4428]

'The Plural Of Rolex'

In [84]:
# Row label of the comment with the highest similarity to title 4428.
sim_counts[4428].idxmax()

6531

In [87]:
# Full record of comment 6531, the best match reported for title 4428.
df_comments.loc[6531]

Unnamed: 0                                                       31635
article_num                                                       4428
comment_id                                                       34647
commenter_name                                               multanemo
comment_published                             2017-12-28T13:52:03.000Z
comment_text         The plural of Rolex at an AD when you ask for ...
comment_likes                                                        2
comment_flag                                                         0
parent_id                                                            0
Name: 6531, dtype: object

In [None]:
# NOTE(review): `qt_tfidf` is not defined in any cell shown in this notebook
# view; unless it is defined elsewhere, this raises NameError on a fresh kernel.
sim_tf = pd.DataFrame(cosine_similarity(qc_tfidf, qt_tfidf))

In [None]:
# Summary statistics of the similarity scores, one column per compared item.
sim_tf.describe()

In [None]:
# For every column of `sims`, record the comment with the highest score and
# that score. max_c[k] and max_c_scores[k] both refer to the k-th column.
# NOTE(review): `sims` is not defined in any cell shown in this notebook view —
# confirm which similarity frame is meant.
max_c = []
max_c_scores = []

for col in sims.columns:
    scores = sims[col]  # look the column up once instead of twice
    max_c.append(q_comments[scores.idxmax()])
    max_c_scores.append(scores.max())

In [None]:
# Print each entry of `trivia_questions` followed by the comment stored at the
# same position in `max_c`, separated by blank lines.
for q_pos, best_comment in enumerate(max_c):
    print(trivia_questions[q_pos])
    print(best_comment)
    print('\n')