## Part II - Latent Semantic Analysis to Grab Related Articles Given a Query

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Change the process working directory to '/home/jovyan/' before the project
# imports in the next cell.
# NOTE(review): presumably this is what makes the local `library` package
# importable -- confirm. The hardcoded absolute path ties the notebook to this
# particular environment.
from os import chdir
chdir('/home/jovyan/')

In [5]:
import library.db_helper as db
import library.functions as fy

### Grab Text from the Page table

In [6]:
# Fetch only the `text` column of the `page` table. The resulting frame `X`
# is the corpus: `X['text']` is what the TF-IDF vectorizer is fitted on.
query = '''
SELECT text
FROM page
'''

X = db.query_to_dataframe(query)

In [7]:
# Spot-check five randomly sampled rows of the corpus (unseeded, so the rows
# shown differ between runs).
X.sample(5)

Unnamed: 0,text
1768,visionplus is a financial software application...
808,léon bottou is a researcher best known for his...
72,this article may be in need of reorganization ...
1782,this article has multiple issues please help i...
2409,expert choiceindustrycomputer softwarefounded ...


In [8]:
# Load every column of the `page` table; `page['title']` is later indexed by
# row position to turn similarity rankings into article titles.
# NOTE(review): neither this query nor the text-only query used to build `X`
# has an ORDER BY, yet rows of `page` are looked up by positions computed from
# `X` -- confirm both queries return rows in the same order.
query = '''
SELECT * 
FROM page 
'''

page = db.query_to_dataframe(query)

### Setup Latent Semantic Analysis - Article Modeling

![](http://interactive.blockdiag.com/image?compression=deflate&encoding=base64&src=eJxLyslPzk7JTExXqOZSUFAqSixXSEksSVRS0LVTUCpILCpOhTCTc1IT85SsQWogTLBoSGpRrq5LfnJpbmpeiUJMnm9iSVFmBUQZDjmwvuAwFwjDM68ktaigKBWkICi1uDSnpBiouxYAYKwuOg)

#### 1. Vectorize the Search Term and the Corpus (X['text'])

In [9]:
def tfidf_vectorizer(search_query, min_df=1):
    """Build TF-IDF vectors for the corpus and for one search query.

    A new ``TfidfVectorizer`` (with English stop words removed) is fitted on
    the module-level ``X['text']`` column; the same fitted vectorizer is then
    used to transform ``search_query`` so both results share one vocabulary.

    Parameters
    ----------
    search_query : str
        The query text to vectorize.
    min_df : int or float, default 1
        Passed through unchanged to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    tuple
        ``(doc_term_matrix, search_query_vec)``: the TF-IDF matrix for the
        corpus and the single-row TF-IDF matrix for the query.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, stop_words='english')
    corpus_matrix = vectorizer.fit_transform(X['text'])
    # The query is wrapped in a list because transform expects an iterable of
    # documents.
    query_matrix = vectorizer.transform([search_query])
    return corpus_matrix, query_matrix

In [10]:
# Try the vectorizer on a sample query; the result is the pair
# (corpus matrix, query vector).
tfidf_vectorizer('oracle', min_df=2)

(<2449x34118 sparse matrix of type '<class 'numpy.float64'>'
 	with 702977 stored elements in Compressed Sparse Row format>,
 <1x34118 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>)

#### 2. Use SVD to reduce dimensionality for the sparse document matrix

In [11]:
def SVD_lsa (search_query, n_components=300, min_df=1, random_state=None):
    """Project the TF-IDF corpus and a search query into an LSA space.

    The corpus and query are first vectorized with ``tfidf_vectorizer``. A
    ``TruncatedSVD`` is then fitted on the corpus matrix, and the same fitted
    decomposition is applied to the query so both live in one reduced space.

    Parameters
    ----------
    search_query : str
        The query text.
    n_components : int, default 300
        Number of SVD components to keep.
    min_df : int or float, default 1
        Forwarded to ``tfidf_vectorizer``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``TruncatedSVD``. The SVD solver is randomized, so pass
        an integer to get identical embeddings on every run. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(lsa_doc_term, search_query_lsa)``: the reduced corpus matrix and
        the reduced single-row query matrix.
    """
    doc_term_matrix, search_query_vec = tfidf_vectorizer(search_query, min_df=min_df)

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    lsa_doc_term = svd.fit_transform(doc_term_matrix)
    # Reuse the decomposition fitted on the corpus; refitting on the query
    # would put it in a different space.
    search_query_lsa = svd.transform(search_query_vec)

    return lsa_doc_term, search_query_lsa

In [12]:
# Try the LSA projection on a sample query, keeping 400 components.
SVD_lsa ('oracle', min_df=2, n_components=400)

(array([[ 0.22806814, -0.06748126, -0.10427892, ..., -0.03173541,
         -0.00188687,  0.01658263],
        [ 0.26614059, -0.01391332, -0.0579378 , ..., -0.00394924,
         -0.02921368, -0.02485575],
        [ 0.26572805,  0.08273001, -0.2099025 , ...,  0.00738858,
          0.02448877,  0.00697621],
        ..., 
        [ 0.09133903, -0.04684297, -0.01136924, ...,  0.01823165,
          0.0355533 , -0.00489408],
        [ 0.15998329, -0.08759892,  0.08040765, ..., -0.01623633,
         -0.0209607 ,  0.00490432],
        [ 0.18419962, -0.09739457,  0.15918732, ...,  0.02118281,
         -0.04171515,  0.0096374 ]]),
 array([[  3.62049605e-02,  -2.54471588e-02,  -3.46700288e-02,
          -6.19762523e-02,  -5.46172656e-02,   1.06033842e-01,
          -6.17681939e-02,  -3.70162169e-02,   3.73106627e-02,
           1.81111981e-01,  -1.30278453e-02,  -4.49196707e-02,
           3.91873698e-01,   5.30837360e-01,   3.49651205e-01,
           1.60584131e-01,  -1.43791584e-01,  -2.24848194

#### 3. Apply sklearn's cosine_similarity to return article matches for the given search term

In [13]:
X.shape # corpus dimensions: (number of rows, number of columns)

(2449, 1)

In [14]:
def grab_related_articles (search_query, n_results=5, n_components = 300, min_df=1):
    """Return the titles of the articles most similar to a search query.

    The corpus and the query are embedded with ``SVD_lsa``, every document is
    scored against the query by cosine similarity, and the best-scoring
    distinct titles are collected from the module-level ``page`` frame.

    Parameters
    ----------
    search_query : str
        The query text.
    n_results : int, default 5
        Maximum number of distinct titles to return.
    n_components : int, default 300
        Forwarded to ``SVD_lsa``.
    min_df : int or float, default 1
        Forwarded to ``SVD_lsa``.

    Returns
    -------
    list
        Up to ``n_results`` distinct titles, ordered from most to least
        similar. Fewer are returned when the corpus does not contain
        ``n_results`` distinct titles.

    Notes
    -----
    Titles are looked up by row position, so this assumes that row ``i`` of
    ``page`` is the same article as row ``i`` of the corpus that was
    vectorized -- TODO confirm the two queries return rows in the same order.
    """
    lsa_doc_term, search_query_lsa = SVD_lsa(search_query, n_components=n_components, min_df=min_df)
    cos_sim_arr = cosine_similarity(lsa_doc_term, search_query_lsa).ravel()

    # Sort once, most similar first, instead of re-sorting on every step.
    ranked_indices = np.argsort(cos_sim_arr)[::-1]

    related_articles = []
    seen_titles = set()
    # Walk the ranking and keep the first occurrence of each title. A list
    # (rather than list(set(...))) preserves the similarity order, and the
    # loop ends on its own once the corpus is exhausted, so asking for more
    # distinct titles than exist cannot hang.
    for title in page['title'].iloc[ranked_indices]:
        if len(related_articles) >= n_results:
            break
        if title not in seen_titles:
            seen_titles.add(title)
            related_articles.append(title)
    return related_articles

In [15]:
# Example: request ten distinct article titles related to "oracle".
grab_related_articles("oracle", n_results=10, n_components=400, min_df=2)

['oracle application server',
 'oracle soa suite',
 'oracle fusion middleware',
 'oracle corporation',
 'oracle fusion architecture',
 'oracle enterprise resource planning cloud',
 'oracle policy automation',
 'oracle reports',
 'oracle fusion applications',
 'oracle applications']