In [2]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [3]:
# Connection settings for the Postgres server that holds the `pages` table.
db_settings = {
    'host': 'postgres',
    'user': 'postgres',
    'database': 'postgres',
}
connection = pg2.connect(**db_settings)
# RealDictCursor returns each row as a dict keyed by column name, which lets
# pandas build named columns directly from the fetched rows.
cursor = connection.cursor(cursor_factory=RealDictCursor)

In [4]:
# Run the query for every column of every row in `pages`; the rows are
# fetched from the cursor in a later cell.
# NOTE(review): there is no ORDER BY, so row order (and therefore the
# positional index of each article further down) is whatever the server
# returns — confirm this is acceptable.
cursor.execute("""SELECT * FROM pages""")

In [5]:
# Materialise all fetched rows into a DataFrame, then release the cursor.
# NOTE(review): only the cursor is closed here; `connection` is not closed in
# the cells visible in this notebook — confirm whether it should be.
df = pd.DataFrame(cursor.fetchall())
cursor.close()

In [6]:
# Build the tf-idf document-term matrix from the page text.
# English stop words are removed and terms that occur in fewer than
# 7 documents are ignored.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=7)
doc_matrix = tfidf_vectorizer.fit_transform(df['page_text'])

In [7]:
# Dense, labelled view of the tf-idf matrix (one row per page, one column per
# vocabulary term) for inspection.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new name and fall back so the
# cell runs on either side of that change.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
# toarray() yields a plain ndarray instead of the legacy numpy.matrix that
# todense() returns.
tfdf = pd.DataFrame(doc_matrix.toarray(), columns=feature_names)

In [8]:
# Preview the first rows of the dense tf-idf table.
tfdf.head()

Unnamed: 0,aaai,aaron,ab,abandoned,abb,abbreviated,abbreviation,abc,abilities,ability,...,zk,znumber,zone,zones,zoo,zoom,zos,zsum,zur,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.012862,0.0,0.0,0.0,0.0,0.0,0.035401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Number of latent components to keep in the reduced space.
n_components = 10
# TruncatedSVD uses a randomized solver by default, so pin random_state to
# make the component values (and every ranking derived from them)
# reproducible across kernel restarts.
SVD = TruncatedSVD(n_components=n_components, random_state=42)
# Column labels component_1 ... component_<n_components> for the reduced matrix.
component_names = ['component_{}'.format(i) for i in range(1, n_components + 1)]

In [10]:
# Fit the SVD on the sparse tf-idf matrix itself rather than on its dense
# DataFrame copy. Later cells call SVD.transform on sparse vectorizer output,
# so fitting on the same representation keeps fit and transform consistent,
# avoids the feature-name mismatch warning newer scikit-learn emits when a
# model fitted on a DataFrame is handed an unnamed matrix, and skips the
# memory cost of the dense input.
svd_matrix = SVD.fit_transform(doc_matrix)

In [11]:
# Fraction of the total variance attributed to each retained component.
SVD.explained_variance_ratio_

array([ 0.01261873,  0.02707777,  0.01199098,  0.01156954,  0.009957  ,
        0.00811692,  0.00645685,  0.00596301,  0.00540565,  0.00491853])

In [12]:
# Combined fraction of variance explained by all retained components.
sum(SVD.explained_variance_ratio_)

0.10407499186148052

In [13]:
# Wrap the reduced matrix in a DataFrame with one named column per component.
svd_df = pd.DataFrame(svd_matrix, columns=component_names)

In [14]:
# Attach each page title to its component row. `.values` drops df's index,
# so titles are assigned by position and rely on svd_df rows being in the
# same order as df rows.
svd_df['article'] = df['page_title'].values

In [15]:
# Preview the component scores alongside their article titles.
svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,article
0,0.163497,-0.027543,0.274572,0.068424,0.063024,-0.056614,-0.259646,-0.107149,0.025732,0.163337,data exploration
1,0.126187,0.002246,0.222394,0.135766,0.004412,-0.052737,-0.048694,-0.069047,0.129668,0.059332,list of datasets for machine learning research
2,0.306683,0.098547,0.49521,0.358147,-0.046084,-0.119613,0.017965,-0.096993,0.322916,0.016598,machine learning
3,0.298169,0.062502,0.374157,0.341144,-0.171761,-0.089531,0.140898,-0.0387,0.352937,-0.002627,outline of machine learning
4,0.08671,0.037911,0.123212,0.100944,-0.038818,-0.024713,0.05597,0.054479,-0.047953,-0.058698,singular statistical model


In [16]:
# Ten articles with the largest score on the first component.
(
    svd_df[['component_1', 'article']]
    .sort_values(by='component_1', ascending=False)
    .head(10)
)

Unnamed: 0,component_1,article
920,0.692897,forwardbackward algorithm
967,0.646946,examples of markov chains
987,0.576982,models of dna evolution
468,0.57653,probit model
756,0.573151,cmaes
2128,0.571834,bonita bpm
886,0.571734,partial least squares regression
1140,0.571006,generalized distributive law
256,0.565583,bidirectional associative memory
317,0.565467,multimodal learning


In [17]:
# Example query, vectorised as a single document with the fitted tf-idf model.
search_term = 'machine learning search term'
search_term2 = tfidf_vectorizer.transform([search_term])

In [18]:
# Project the vectorised query into the fitted SVD component space.
test = SVD.transform(search_term2)

In [19]:
# Component vector of the first article (row 0), used as a sanity check below.
arr = svd_df[component_names].values[0]

In [20]:
# Number of article rows in the component table.
svd_df[component_names].shape[0]

2654

In [21]:
# Euclidean distance between the projected query and the first article.
np.sqrt(np.square(test - arr).sum())

0.48238916464728276

In [22]:
import re


def cleaner(message):
    """Normalise free text before it is passed to the tf-idf vectorizer.

    Steps, applied in order:

    1. Each run of periods is replaced by a single space.
    2. The text is lower-cased and every character other than ``a-z``,
       ``0-9`` and the space is deleted. Hyphenated words are therefore
       fused ("error-driven" -> "errordriven"), and tabs/newlines are removed
       rather than converted to spaces.
    3. Each run of digits is replaced by the token ``'NUMBER '``.
    4. Runs of whitespace are collapsed to one space.

    Leading and trailing spaces are not stripped.

    Parameters
    ----------
    message : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    # Raw strings keep the regex escapes (\. \d \s) out of Python's own
    # string-escape processing, which warns about them in non-raw literals.
    message = re.sub(r'\.+', ' ', message)
    message = re.sub(r'[^a-z0-9 ]', '', message.lower())
    message = re.sub(r'\d+', 'NUMBER ', message)
    message = re.sub(r'\s+', ' ', message)
    return message

In [23]:
def evaluate_distance(search_term, n_results=5):
    """Rank articles by Euclidean distance to a free-text query in SVD space.

    The query is normalised with ``cleaner``, vectorised with the fitted
    ``tfidf_vectorizer``, projected with the fitted ``SVD`` model and compared
    against every row of ``svd_matrix``.

    Parameters
    ----------
    search_term : str
        Raw query text.
    n_results : int, optional
        How many of the closest articles to return. Defaults to 5, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``article`` and ``distance``, sorted by ascending distance and
        keeping the row labels of ``svd_df``.

    Notes
    -----
    Uses the notebook-level objects ``tfidf_vectorizer``, ``SVD``,
    ``svd_matrix`` and ``svd_df``. Row ``i`` of ``svd_matrix`` is paired with
    row ``i`` of ``svd_df``, so the two must be in the same order.
    NOTE(review): a query with no in-vocabulary terms presumably projects to
    the zero vector, in which case the ranking is just distance from the
    origin — confirm whether that case should be reported to the caller.
    """
    query = cleaner(search_term)

    # The vectorizer takes an iterable of documents; a one-element list makes
    # the whole query a single document (it is still tokenised into words).
    query_tfidf = tfidf_vectorizer.transform([query])
    query_vec = SVD.transform(query_tfidf)

    # Broadcasting subtracts the single query row from every article row, and
    # the row-wise norm gives all distances in one vectorised call.
    distances = np.linalg.norm(svd_matrix - query_vec, axis=1)

    # Only the title column is needed, so avoid copying the whole frame.
    ranked = svd_df[['article']].assign(distance=distances)
    return ranked.sort_values('distance', ascending=True).head(n_results)

In [24]:
# Closest articles to the example query defined earlier in the notebook.
evaluate_distance(search_term)

Unnamed: 0,article,distance
919,errordriven learning,0.050469
145,offline learning,0.092495
91,inferential theory of learning,0.09355
14,apprenticeship learning,0.111927
1042,ian goodfellow,0.115343


In [25]:
# Closest articles to the query 'neural network'.
evaluate_distance('neural network')

Unnamed: 0,article,distance
380,encog,0.181131
309,lernmatrix,0.226026
333,physical neural network,0.226032
353,spiking neural network,0.231035
265,computational neurogenetic modeling,0.246159
