In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from numpy import linalg as LA
import json
import math
from sklearn.decomposition import PCA
import re
from numpy import linalg as LA
import pickle

In [31]:
def tokenize(text):
    """Returns a list of words that make up the text.
    
    Note: for simplicity, lowercase everything.
    Requirement: Use Regex to satisfy this function
    
    Arguments
    =========
    text: String
    stem_words: boolean
    
    Returns
    =======
    List
    
    """
    regex = r"([a-zA-Z]+)"
    tokens = re.findall(regex, text.lower())
    return tokens

In [51]:
#doc_by_vocab = np.empty([len(data), n_feats])

def build_vectorizer(stop_words, max_df=1.0, min_df=8, norm='l2', max_features=None, tokenizer=None):
    """Returns a TfidfVectorizer object with the above preprocessing properties.
    
    Note: This function may log a deprecation warning. This is normal, and you
    can simply ignore it.
    
    Params: {max_features: Integer,
             max_df: Float,
             min_df: Float,
             norm: String,
             stop_words: String}
    Returns: TfidfVectorizer
    """
    # YOUR CODE HERE
    return TfidfVectorizer(stop_words=stop_words,
                           max_df=max_df, min_df=min_df, 
                           max_features=max_features, norm=norm,
                          tokenizer = tokenize)

In [33]:
with open('final_data4.json', encoding = 'utf-8') as json_file:
    data = json.load(json_file)

In [52]:
tfidf_vec = build_vectorizer("english")

In [53]:
doc_by_vocab = tfidf_vec.fit_transform([d['transcript'] for d in data]).toarray()

In [54]:
index_to_vocab = {i:v for i, v in enumerate(tfidf_vec.get_feature_names())}

In [55]:
doc_by_vocab.shape

(3232, 10451)

In [78]:
pca = PCA(.95)
pca_model = pca.fit(doc_by_vocab)

In [79]:
svd_data = pca.transform(doc_by_vocab)

In [80]:
svd_data.shape

(3232, 2350)

In [88]:
svd_norms = np.zeros(len(svd_data))
for i in range(len(svd_data)):
    svd_norms[i] = LA.norm(svd_data[i,:])

In [91]:
with open('transcript_idf_values.json', encoding = 'utf-8') as json_file:
    idf_values = json.load(json_file)

In [84]:
def search_svd_method(pca_model, input_svd_data, input_svd_norms, index_to_data, input_idf, query, tokenize_method):
    n_docs = input_svd_data.shape[0]
    n_types = len(index_to_data)
    n_principal_components = input_svd_data.shape[1]
    q_vector = np.zeros(n_types)
    tokens = tokenize_method(query)
    for t in set(tokens):
        index = index_to_data.get(t)
        idf = input_idf.get(t)
        if (index is not None) & (idf is not None):
            doc_freq = tokens.count(t)
            q_vector[index] = doc_freq*idf
    q_vector_transformed = pca_model.transform(q_vector.reshape(1,-1))
    sim = []
    for doc_id in range(n_docs):
        cos_sim = get_cos_similarity(q_vector_transformed, doc_id, input_svd_data, input_svd_norms)
        sim.append((cos_sim, doc_id))
    return sim

In [104]:
def get_cos_similarity(q_vec, doc_id, input_svd_data, input_svd_norms):
    d_vec = input_svd_data[doc_id,:]
    d_norm = input_svd_norms[doc_id]
    q_norm = LA.norm(q_vec)
    return q_vec.dot(d_vec)/(d_norm*q_norm)

In [105]:
def get_top_k(results, k, input_data):
    results.sort(key = lambda x: x[0], reverse = True)
    output = []
    for i in range(min(k, len(results))):
        doc_id = results[i][1]
        doc_info = {'title' : input_data[doc_id].get('title'), 'url': input_data[doc_id].get('url'),
                    'date':input_data[doc_id].get('date'), 'score':input_data[doc_id].get('score'),
                    'source':input_data[doc_id].get('source'), 'summary':input_data[doc_id].get('summary')}
        output.append(doc_info)
    return output

In [110]:
query = 'pet'
results = search_svd_method(pca_model, svd_data, svd_norms, index_to_vocab, idf_values, query, tokenize)

In [111]:
results

[(array([0.01788311]), 0),
 (array([0.0660759]), 1),
 (array([0.04628294]), 2),
 (array([-0.08357987]), 3),
 (array([0.04707628]), 4),
 (array([0.04521946]), 5),
 (array([0.07700284]), 6),
 (array([0.06924151]), 7),
 (array([0.06013429]), 8),
 (array([-0.05911186]), 9),
 (array([-0.06358188]), 10),
 (array([-0.03905428]), 11),
 (array([-0.00177065]), 12),
 (array([0.01349015]), 13),
 (array([-0.04053888]), 14),
 (array([0.01504541]), 15),
 (array([0.00311692]), 16),
 (array([-0.02106413]), 17),
 (array([-0.03839811]), 18),
 (array([-0.0344315]), 19),
 (array([6.46476482e-05]), 20),
 (array([-0.02150321]), 21),
 (array([-0.10463592]), 22),
 (array([-0.00758377]), 23),
 (array([-0.00305172]), 24),
 (array([0.02122186]), 25),
 (array([-0.0480276]), 26),
 (array([-0.04116129]), 27),
 (array([-0.01056529]), 28),
 (array([-0.03425234]), 29),
 (array([-0.06484983]), 30),
 (array([0.04479503]), 31),
 (array([0.02458493]), 32),
 (array([0.00397783]), 33),
 (array([0.05573812]), 34),
 (array([0.

In [112]:
get_top_k(results, 20, data)

[{'title': 'A lady I know found her birth family, and your not going to believe the details!',
  'url': 'https://www.hoosiertimes.com/herald_times_online/news/local/bloomington-woman-reunites-with-the-family-she-thought-had-abandoned/article_05992a47-4ee5-57f4-9104-6143068befad.html',
  'date': '2019-05-01',
  'score': 0,
  'source': 'reddit.com/r/UpliftingNews',
  'summary': 'Winds WSW at 10 to 20 mph..  Tonight  Cloudy skies this evening will become partly cloudy after midnight.'},
 {'title': 'Indiana woman reunites with the family she thought had abandoned her 50 years ago',
  'url': 'https://www.hoosiertimes.com/herald_times_online/news/local/bloomington-woman-reunites-with-the-family-she-thought-had-abandoned/article_05992a47-4ee5-57f4-9104-6143068befad.html',
  'date': '2019-05-01',
  'score': 16,
  'source': 'reddit.com/r/UpliftingNews',
  'summary': 'Winds WSW at 10 to 20 mph..  Tonight  Cloudy skies this evening will become partly cloudy after midnight.'},
 {'title': 'How Comm

In [65]:
filename = 'pca_model.sav'
pickle.dump(pca, open(filename, 'wb'))

In [66]:
loaded_model = pickle.load(open(filename, 'rb'))

In [83]:
test = np.zeros(10451).reshape(1,-1)
test
pca_model.transform(test)

array([[ 7.38491966e-02, -3.49955693e-02, -6.83526914e-02, ...,
         9.03511900e-04, -5.01425849e-04, -6.41483561e-05]])

In [68]:
test.shape

(3232, 2350)

In [None]:
get