In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
import pymongo

In [86]:
import re
def cleaner(text):
    '''
    Cleans raw text using regex.
    '''
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    return text

In [87]:
def wiki_df(host, port=27016):
    '''
    Connects to a MongoDB and creates a combined dataframe of the ML and BS docs. 
    Host may change if MongoDB AWS instance is rebooted.
    '''
    client = pymongo.MongoClient(host, port)
    
    BS_collection = client['wiki']['BS_new']
    ML_collection = client['wiki']['ML_new']
    
    BS_docs = list(BS_collection.find())
    ML_docs = list(ML_collection.find())
    
    BS_df = pd.DataFrame(BS_docs, index = [x['pageid'] for x in BS_docs]).drop(['_id', 'pageid'], axis=1)
    ML_df = pd.DataFrame(ML_docs, index = [x['pageid'] for x in ML_docs]).drop(['_id', 'pageid'], axis=1)
    
    all_df = pd.concat([BS_df, ML_df])
    
    all_df['clean_text'] = all_df['text'].apply(cleaner)
    
    return all_df

In [None]:
pipe = Pipeline([('tfidf', TfidfVectorizer(min_df=2, stop_words='english')),
                 ('SVD', TruncatedSVD(n_components=500)),])
    
original_fit = pipe.fit_transform(all_df['clean_text'])


def similarity(query, df=all_df, original_fit=original_fit, pipe=pipe, max_results=10):
    '''
    Takes a search query and returns a DataFrame with the top related Wikipedia ML and BS articles.
    '''
    component_names = ["component_"+str(i+1) for i in range(500)]
    
    lsa_df = pd.DataFrame(original_fit, index = all_df['clean_text'].index, columns = component_names)

    fitted_query = pipe.transform([query])
    
    lsa_df['cosine_sim'] = cosine_similarity(lsa_df, fitted_query)
    
    top_similar = lsa_df[['cosine_sim']].sort_values('cosine_sim', ascending=False).head(max_results)
    
    top_similar['title'] = [df.loc[x]['title'] for x in top_similar.index]
    
    return top_similar

In [88]:
all_df = wiki_df('54.200.188.121')

In [96]:
all_df.head()

Unnamed: 0,text,title,clean_text
22847264,"Application retirement, also called applicatio...",Application retirement,application retirement also called application...
317400,This article relies too much on references to ...,WebObjects,this article relies too much on references to ...
6708405,This article does not cite any sources. Please...,Zoo Tycoon 2: Marine Mania,this article does not cite any sources please ...
22479089,"HubSpot, Inc.TypePublicTraded&#160;asNYSE:&#16...",HubSpot,hubspot inctypepublictradedasnysehubsindustrys...
2305988,This article needs additional citations for ve...,FitNesse,this article needs additional citations for ve...


In [102]:
similarity('zoo tycoon', all_df)

Unnamed: 0,cosine_sim,title
2309926,0.960124,Zoo Tycoon (2001 video game)
13564180,0.958232,Zoo Tycoon 2 DS
11463651,0.951826,Zoo Tycoon
6708405,0.944178,Zoo Tycoon 2: Marine Mania
9069467,0.931415,Blue Fang Games
6106017,0.927113,Zoo Tycoon 2: Dino Danger Pack
6453530,0.919433,Zoo Tycoon 2: African Adventure
2967411,0.914157,Zoo Tycoon 2
39745663,0.907927,Zoo Tycoon (2013 video game)
4456386,0.906346,Zoo Tycoon DS


In [103]:
similarity('bayes', all_df, max_results=15)

Unnamed: 0,cosine_sim,title
87339,0.569956,Naive Bayes classifier
24455245,0.554892,Averaged one-dependence estimators
22212276,0.429424,Ensemble learning
54033657,0.388875,Labeled data
48643701,0.381272,Out-of-bag error
4195092,0.360906,Relevance vector machine
55075082,0.360284,BigDL
1307911,0.354979,Bootstrap aggregating
42579971,0.317765,Inductive probability
12114107,0.305726,Logistic model tree


In [105]:
similarity('tiger')

Unnamed: 0,cosine_sim,title
9080971,0.441363,Designer's World
37420509,0.417014,Reminders (Apple)
51758248,0.347361,AnswerDash
19174035,0.334583,IGG Software
23759637,0.329715,List of animals in Zoo Tycoon 2
51796761,0.310422,Booyami
6454658,0.278799,Zoo Tycoon 2: Endangered Species
2967411,0.238399,Zoo Tycoon 2
6086099,0.227326,OmniPlan
3558553,0.218685,Heart of Africa


In [107]:
similarity('artificial intelligence')

Unnamed: 0,cosine_sim,title
38629606,0.692122,Conference on Artificial General Intelligence
36494971,0.684455,Mexican International Conference on Artificial...
4420730,0.665359,Marcus Hutter
3358541,0.644081,European Conference on Artificial Intelligence
1124646,0.615157,Dartmouth workshop
37652061,0.593001,Stochastic neural analog reinforcement calculator
2614944,0.552343,International Joint Conference on Artificial I...
13428111,0.502356,Ben Goertzel
405484,0.492614,Jürgen Schmidhuber
49051616,0.486996,Shane Legg
