In [1]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
# Open a session against the Postgres service; host, user and database are
# all literally 'postgres' here.
connection = pg2.connect(
    host='postgres',
    user='postgres',
    database='postgres',
)
# RealDictCursor yields rows keyed by column name, which is what lets the
# fetched rows turn into a DataFrame with named columns further down.
cursor = connection.cursor(cursor_factory=RealDictCursor)

In [3]:
# Pull every column of every row from the `pages` table.
cursor.execute("SELECT * FROM pages")

In [4]:
# Materialize the whole result set as a DataFrame, then release the cursor.
# NOTE(review): `connection` itself is not closed in the visible cells —
# confirm it is closed later, or close it once loading is finished.
df = pd.DataFrame(cursor.fetchall())
cursor.close()

In [5]:
# TF-IDF over the page bodies: English stop words are dropped and a term must
# occur in at least 7 documents to be kept in the vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=7)
doc_matrix = tfidf_vectorizer.fit_transform(df.loc[:, 'page_text'])

In [6]:
# Dense, column-labelled view of the TF-IDF matrix (for inspection only).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back on older installs.
# toarray() is used instead of todense() so the frame is built from a plain
# ndarray rather than the discouraged np.matrix type.
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
tfdf = pd.DataFrame(doc_matrix.toarray(), columns=feature_names)

In [7]:
# Peek at the first five documents' TF-IDF weights.
tfdf.head(n=5)

Unnamed: 0,aaai,aaron,ab,abandoned,abb,abbreviated,abbreviation,abc,abilities,ability,...,zk,znumber,zone,zones,zoo,zoom,zos,zsum,zur,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.012862,0.0,0.0,0.0,0.0,0.0,0.035401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Number of latent components to keep.
n_components = 10
# TruncatedSVD's default solver is randomized; pin random_state so the
# components, and every ranking derived from them, are reproducible on
# Restart & Run All.
SVD = TruncatedSVD(n_components=n_components, random_state=42)
# Column labels component_1 .. component_<n_components> for the projected frame.
component_names = ["component_{}".format(i + 1) for i in range(n_components)]

In [9]:
# Fit on the sparse TF-IDF matrix rather than its dense DataFrame copy:
# it holds the same values, avoids a documents x vocabulary dense array, and
# keeps the fit input consistent with the sparse query vectors that are later
# passed to SVD.transform (fitting on a DataFrame makes newer scikit-learn
# warn about missing feature names at transform time).
svd_matrix = SVD.fit_transform(doc_matrix)

In [10]:
# Share of the total variance attributed to each fitted component.
SVD.explained_variance_ratio_

array([ 0.01261873,  0.02707777,  0.011991  ,  0.0115697 ,  0.00995693,
        0.00811719,  0.00645556,  0.00596053,  0.00541973,  0.00492011])

In [11]:
# Combined share of variance captured by all fitted components.
sum(SVD.explained_variance_ratio_)

0.10408725355154724

In [105]:
# One row per document, one labelled column per SVD component.
svd_df = pd.DataFrame(data=svd_matrix, columns=component_names)

In [106]:
# Attach page titles positionally (raw values, so no index alignment occurs).
svd_df = svd_df.assign(article=df['page_title'].values)

In [107]:
# Preview the projected documents together with their titles.
svd_df.head(n=5)

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,article
0,0.163497,-0.027542,0.274604,0.068514,0.062726,-0.056424,-0.262628,-0.106796,0.00985,0.148719,data exploration
1,0.126187,0.002246,0.222427,0.135866,0.004355,-0.05283,-0.049377,-0.069391,0.12051,0.052665,list of datasets for machine learning research
2,0.306683,0.098547,0.495258,0.358161,-0.046012,-0.120519,0.018436,-0.098835,0.32072,0.014679,machine learning
3,0.298169,0.062502,0.374178,0.341274,-0.171688,-0.090679,0.141812,-0.038542,0.346027,-0.009102,outline of machine learning
4,0.08671,0.037911,0.123177,0.100964,-0.03888,-0.025062,0.056273,0.054523,-0.047988,-0.065581,singular statistical model


In [15]:
# Ten articles with the largest loading on the first component.
(
    svd_df.loc[:, ['component_1', 'article']]
    .sort_values(by='component_1', ascending=False)
    .iloc[:10]
)

Unnamed: 0,component_1,article
920,0.692897,forwardbackward algorithm
967,0.646946,examples of markov chains
987,0.576982,models of dna evolution
468,0.57653,probit model
756,0.573151,cmaes
2128,0.571834,bonita bpm
886,0.571734,partial least squares regression
1140,0.571006,generalized distributive law
256,0.565583,bidirectional associative memory
317,0.565467,multimodal learning


In [40]:
# Example query, vectorized as a single document with the fitted vocabulary.
search_term = 'machine learning search term'
search_term2 = tfidf_vectorizer.transform([search_term])

In [63]:
# Project the TF-IDF query vector into the fitted SVD component space.
test = SVD.transform(search_term2)

In [64]:
# Component vector of the first document (row 0), titles excluded.
arr = svd_df.loc[:, component_names].values[0]

In [78]:
# Number of documents in the projected frame.
svd_df.loc[:, component_names].shape[0]

2654

In [69]:
# Euclidean distance between the projected query and the first document.
np.sqrt(np.square(test - arr).sum())

0.48819259520308489

In [111]:
def evaluate_distance(search_term, top_n=5):
    """Rank articles by Euclidean distance to a query in SVD component space.

    The query is vectorized with the fitted ``tfidf_vectorizer``, projected
    with the fitted ``SVD`` model, and compared against every row of
    ``svd_matrix``.

    Parameters
    ----------
    search_term : str
        Free-text query; it is passed to the vectorizer as one document.
    top_n : int, default 5
        Maximum number of closest articles to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``article`` and ``distance``, sorted by ascending distance and
        carrying the index of ``svd_df``; at most ``top_n`` rows.
    """
    # Wrap the query so the vectorizer receives exactly one document.
    search_tfidf = tfidf_vectorizer.transform(pd.Series(search_term))
    svd_search_vec = SVD.transform(search_tfidf)

    # Broadcasting subtracts the single query row from every article row, so
    # all distances come from one vectorized call instead of a Python loop.
    distances = np.linalg.norm(svd_matrix - svd_search_vec, axis=1)

    # Only the title column is needed; avoid copying the whole component frame.
    ranked = svd_df[['article']].assign(distance=distances)
    return ranked.sort_values('distance', ascending=True).head(top_n)

In [112]:
# Closest articles to the example query stored in `search_term` earlier in the notebook.
evaluate_distance(search_term)

Unnamed: 0,article,distance
919,errordriven learning,0.049287
91,inferential theory of learning,0.091418
145,offline learning,0.09291
14,apprenticeship learning,0.106668
1042,ian goodfellow,0.115747


In [113]:
# Same lookup for a second ad-hoc query string.
evaluate_distance('neural network')

Unnamed: 0,article,distance
380,encog,0.182516
309,lernmatrix,0.22447
333,physical neural network,0.227901
353,spiking neural network,0.233184
265,computational neurogenetic modeling,0.245104
