In [2]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json

from nltk.corpus import stopwords
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS
# Instantiate spaCy's English language class.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook.
nlp = English()
# NLTK's English stop-word list, stored as a set.
# NOTE(review): `stop` is not referenced by any visible cell either; the
# TfidfVectorizer is configured with stop_words='english' instead.
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [3]:
# Connect to the MongoDB server on this machine and grab handles to the
# `wikipedia` database and the collection holding the article documents.
client = pymongo.MongoClient(host='localhost', port=27017)
wiki_db = client['wikipedia']
wiki_col = wiki_db['my_collection']

In [4]:
# Sanity check: databases on the server and collections inside `wikipedia`.
# list_database_names()/list_collection_names() replace database_names()/
# collection_names(), which were deprecated in PyMongo 3.7 and removed in 4.0.
client.list_database_names(), wiki_db.list_collection_names()

(['local', 'myWiki', 'wikipedia'], ['my_collection'])

In [5]:
# Number of documents in the collection (empty filter = match everything).
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents() is its replacement.
wiki_col.count_documents({})

5785

In [6]:
# Query with no filter, i.e. request every document in the collection.
# NOTE(review): a pymongo cursor is presumably exhausted after one full
# iteration, so this cell must be re-run before the cursor is consumed again.
cursor = wiki_col.find()

In [7]:
# Load every document into a DataFrame (one row per document).
# The query is issued inside this cell rather than consuming the shared
# `cursor`: a cursor can only be iterated once, so re-running this cell on an
# already-consumed cursor would silently produce an empty DataFrame.
wiki_df = pd.DataFrame(list(wiki_col.find()))

In [8]:
# Class balance: number of rows per top-level category.
wiki_df['main_cat'].value_counts()

Business software    4184
Machine learning     1601
Name: main_cat, dtype: int64

In [9]:
# Preview the first rows to see which fields each document carries.
wiki_df.head()

Unnamed: 0,_id,article,content,main_cat,page_id,sub_cat
0,5ae72dda023fe31d68a69cf4,Business software,software make business business sell softwar...,Business software,1037763,Business software
1,5ae72ddb023fe31d68a69cf5,AccuSystems,multiple issue orphan date february notabili...,Business software,41270069,Business software
2,5ae72ddb023fe31d68a69cf6,Active policy management,active policy management business orient ent...,Business software,5211212,Business software
3,5ae72ddb023fe31d68a69cf7,Alexandria (library software),use alexandria alexandria browser base softw...,Business software,28502793,Business software
4,5ae72ddb023fe31d68a69cf8,Alteryx,infobox company name alteryx inc logo altery...,Business software,44133735,Business software


In [10]:
# Keep a single row per Wikipedia page (first occurrence wins).
# Re-bind instead of mutating in place so the step reads as an explicit
# transformation. The original index labels are kept on purpose (no
# reset_index): later cells reuse wiki_df.index to align derived frames.
wiki_df = wiki_df.drop_duplicates(subset=['page_id'])

## Use TF-IDF to vectorize words

In [11]:
# TF-IDF encode the article text. min_df=20 drops terms that appear in fewer
# than 20 articles; scikit-learn's built-in English stop words are removed.
tfidf_vectorizer = TfidfVectorizer(min_df=20, stop_words='english')

# Sparse article-by-term matrix; rows follow the row order of wiki_df.
article_term_matrix_sps = tfidf_vectorizer.fit_transform(wiki_df.content)

# Dense, labelled copy for inspection (rows = articles, columns = terms).
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
article_term_matrix_df = pd.DataFrame(article_term_matrix_sps.toarray(),
                                       index=wiki_df.index,
                                       columns=tfidf_vectorizer.get_feature_names_out())

In [12]:
# Preview the first rows of the dense TF-IDF frame.
article_term_matrix_df.head()


Unnamed: 0,aa,aaai,aaron,ab,abandon,abbrev,abbreviate,abbreviation,ability,able,...,zero,zhang,zhou,zip,zoho,zone,zoo,zoom,zope,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# pd.concat([wiki_df.article, wiki_df.content, article_term_matrix_df], axis=1).sample(4)

## Use SVD to reduce the number of features

In [14]:
from sklearn.decomposition import TruncatedSVD

In [15]:
# Number of latent dimensions kept by the decomposition.
n_components = 500
# TruncatedSVD's default solver is randomized, so pin the seed: without it the
# components (and every similarity score derived from them) change between runs.
SVD = TruncatedSVD(n_components=n_components, random_state=42)
# Column labels for the reduced space: component_1 ... component_<n_components>.
component_names = ["component_{}".format(i) for i in range(1, n_components + 1)]

In [16]:
# Fit on the sparse TF-IDF matrix rather than its dense DataFrame copy:
# TruncatedSVD works on sparse input directly (same rows, same column order),
# which avoids pushing a dense articles-by-terms array through the solver.
# It also keeps fit and transform inputs consistent, since search queries are
# transformed from the vectorizer's sparse output, not from a labelled frame.
svd_matrix = SVD.fit_transform(article_term_matrix_sps)

In [17]:
# Total share of variance retained: sum of the per-component
# explained-variance ratios of the fitted SVD.
sum(SVD.explained_variance_ratio_)

0.64156935538408566

In [18]:
# Per-article coordinates in the reduced SVD space, indexed like the TF-IDF frame.
svd_df = pd.DataFrame(svd_matrix,
                      index=article_term_matrix_df.index,
                      columns=component_names)
# Index-aligned assignment: attaches each article's title to its row.
svd_df['article'] = wiki_df.article

# Term loadings per component. After the transpose, rows are vocabulary terms
# and columns are components.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
vocabulary_expression = pd.DataFrame(SVD.components_,
                                     index=component_names,
                                     columns=tfidf_vectorizer.get_feature_names_out()).T

In [19]:
# Preview the reduced representation: one row per article, one column per
# SVD component, plus the article title.
svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_492,component_493,component_494,component_495,component_496,component_497,component_498,component_499,component_500,article
0,0.407152,-0.061202,-0.050746,0.106423,0.128751,-0.037402,-0.258355,-0.062226,0.006932,-0.082674,...,0.028478,-0.004628,0.027605,0.006093,-0.00717,-0.026615,0.029868,0.009895,0.011481,Business software
1,0.385392,-0.062778,-0.175276,0.279622,-0.052387,-0.061371,0.028299,-0.030311,-0.041162,0.07293,...,0.016704,0.009657,-0.005092,-0.02354,0.016636,-0.015383,0.007208,0.007222,-0.01823,AccuSystems
2,0.18541,-0.030271,-0.017597,0.069564,0.0995,0.038695,-0.052429,0.024344,-0.044205,-0.063128,...,-0.010909,0.019818,-0.005692,-0.004664,0.05621,-0.04075,-0.003651,0.030794,0.032386,Active policy management
3,0.221834,-0.038207,-0.036519,-0.018732,0.003068,-0.001906,0.008677,-0.014487,-0.001963,-0.012935,...,-0.015089,0.03812,0.025127,-0.034232,0.024791,-0.002765,-0.0053,-0.042532,0.003596,Alexandria (library software)
4,0.266758,-0.037781,-0.054639,0.228771,-0.04075,-0.050478,0.05769,-0.01886,-0.067791,0.042618,...,-0.003931,-0.017374,0.000178,-0.021236,-0.011524,0.003003,0.000365,0.000252,-0.025003,Alteryx


In [20]:
# Add an absolute-loading column for each of the first ten components so that
# terms can be ranked by strength of association regardless of sign.
for component_number in range(1, 11):
    source_column = 'component_{}'.format(component_number)
    vocabulary_expression['abs_' + source_column] = vocabulary_expression[source_column].abs()

In [21]:
# Seven terms with the largest absolute loading on component 1.
vocabulary_expression['abs_component_1'].sort_values(ascending=False).head(7)

software      0.291182
company       0.180182
management    0.168654
category      0.168381
game          0.140527
com           0.136635
http          0.134051
Name: abs_component_1, dtype: float64

## Create a function to search for the top 5 related articles

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
def search_for_pages(search_terms, top_n=5):
    '''
    Takes search terms and returns the articles within the wikipedia corpus
    that relate most closely to those search terms, based on cosine similarity.

    The query is encoded with the fitted TF-IDF vectorizer, projected into the
    fitted SVD space and compared against every article vector. Relies on the
    notebook-level objects `tfidf_vectorizer`, `SVD` and `svd_df`.

    Params
    ------
    search_terms: str
    A string of words

    top_n: int, default 5
    Number of best-matching articles to return

    Returns
    -------
    A DataFrame with the columns 'article' and 'cosine_sim' holding the `top_n`
    articles with the highest cosine similarities, best match first. Rows keep
    their index labels from `svd_df`.

    '''
    # The vectorizer takes an iterable of documents, so wrap the single query.
    query_tfidf = tfidf_vectorizer.transform([search_terms])
    query_svd = SVD.transform(query_tfidf)

    # Compare against the component columns only. The result has one column
    # (one query), so flatten it to a single score per article.
    article_vectors = svd_df.drop('article', axis=1)
    similarities = cosine_similarity(article_vectors, query_svd).ravel()

    # Build a small two-column result instead of copying the whole component
    # frame on every call; svd_df itself is never modified.
    # NOTE(review): a query with no in-vocabulary terms presumably scores 0
    # against every article, making the returned rows arbitrary -- confirm.
    scored = svd_df[['article']].assign(cosine_sim=similarities)
    return scored.sort_values('cosine_sim', ascending=False).head(top_n)

In [24]:
# Example: query with a multi-sentence passage about investor apps.
search_for_pages('There are two types of investor apps: Native investor apps and HTML5 investor apps. Most investor apps offer access to public company content such as stock quotes, corporate materials')

Unnamed: 0,article,cosine_sim
2898,Investor application,0.732956
287,Zuora,0.419735
107,Fundamental Analysis Software,0.395689
854,Moody's Corporation,0.37797
1644,FatKat (investment software),0.365525


In [25]:
# Example: query with a short two-word topic.
search_for_pages('Artificial intelligence')

Unnamed: 0,article,cosine_sim
4990,AAAI Conference on Artificial Intelligence,0.814546
4993,Conference on Artificial General Intelligence,0.68528
5677,Jürgen Schmidhuber,0.645105
4996,Dartmouth workshop,0.638792
5002,International Joint Conference on Artificial I...,0.625315
