In [1]:
!conda install --yes --quiet pymongo



# All requested packages already installed.
# packages in environment at /opt/conda:
#
pymongo                   3.4.0                    py36_0    defaults


In [5]:
import pymongo
from sklearn.externals import joblib

In [6]:
client = pymongo.MongoClient('54.214.155.118', 27016)

In [7]:
client.database_names()

['admin', 'local', 'my_database', 'test', 'wiki']

## My stuff

In [9]:
wiki = client['wiki']

In [55]:
client.database_names()

['admin', 'local', 'my_database', 'test', 'wiki']

In [61]:
wiki['ML_new']
wiki['BS_new']

Collection(Database(MongoClient(host=['54.214.155.118:27016'], document_class=dict, tz_aware=False, connect=True), 'wiki'), 'BS_new')

In [64]:
wiki.collection_names()

['BS', 'ML']

In [10]:
ML_ref = wiki['ML_new']
BS_ref = wiki['BS_new']

In [12]:
BS_docs = list(BS_ref.find())
len(BS_docs)

3017

In [13]:
ML_docs = list(ML_ref.find())
len(ML_docs)

1108

In [67]:
bs_docs = joblib.load('bs_pages_info.pkl')
len(bs_docs)

3017

In [68]:
ml_docs = joblib.load('ml_pages_info.pkl')
len(ml_docs)

1108

In [69]:
BS_ref.insert_many(bs_docs) # DON'T RUN AGAIN!

<pymongo.results.InsertManyResult at 0x7fd9a8a22318>

In [70]:
ML_ref.insert_many(ml_docs) # DON'T RUN AGAIN!

<pymongo.results.InsertManyResult at 0x7fd9a9490ea0>

In [71]:
len(BS_docs)

0

In [72]:
len(ML_docs)

0

In [14]:
test_df = pd.DataFrame(ML_docs[1:10], index = [x['pageid'] for x in ML_docs[1:10]]).drop(['_id', 'pageid'], axis = 1)

In [15]:
test_df.shape

(9, 2)

In [16]:
ml_df = pd.DataFrame(ML_docs, index = [x['pageid'] for x in ML_docs]).drop(['_id', 'pageid'], axis=1)

In [17]:
ml_df['label'] = 'ML'

In [18]:
ml_df.shape

(1108, 3)

In [19]:
ml_df.head()

Unnamed: 0,text,title,label
28168154,Principal stratification is a statistical tech...,Principal stratification,ML
31877832,This article is about the binary tree variant....,Ball tree,ML
2139778,"Ordination or gradient analysis, in multivaria...",Ordination (statistics),ML
3119343,"In computational learning theory, sample exclu...",Sample exclusion dimension,ML
17110513,"In statistics, additive smoothing, also called...",Additive smoothing,ML


In [20]:
bs_df = pd.DataFrame(BS_docs, index = [x['pageid'] for x in BS_docs]).drop(['_id', 'pageid'], axis=1)

In [21]:
bs_df['label'] = 'BS'

In [22]:
bs_df.head()

Unnamed: 0,text,title,label
22847264,"Application retirement, also called applicatio...",Application retirement,BS
317400,This article relies too much on references to ...,WebObjects,BS
6708405,This article does not cite any sources. Please...,Zoo Tycoon 2: Marine Mania,BS
22479089,"HubSpot, Inc.TypePublicTraded&#160;asNYSE:&#16...",HubSpot,BS
2305988,This article needs additional citations for ve...,FitNesse,BS


In [23]:
all_df = pd.concat([ml_df, bs_df])

In [24]:
all_df.sample(5)

Unnamed: 0,text,title,label
53021161,"In computer science, SimHash is a technique fo...",SimHash,ML
21164918,"This article is an orphan, as no other article...",Touch Typist Typing Tutor,BS
45086251,This article has multiple issues. Please help ...,Savane (software),BS
41102584,Moka5TypePrivateIndustryEnterprise softwareFou...,Moka5,BS
45265646,This article has multiple issues. Please help ...,OpenHospital,BS


In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [26]:
import re
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    return text

In [27]:
all_df['clean_text'] = all_df['text'].apply(cleaner)

In [28]:
all_df.head()

Unnamed: 0,text,title,label,clean_text
28168154,Principal stratification is a statistical tech...,Principal stratification,ML,principal stratification is a statistical tech...
31877832,This article is about the binary tree variant....,Ball tree,ML,this article is about the binary tree variant ...
2139778,"Ordination or gradient analysis, in multivaria...",Ordination (statistics),ML,ordination or gradient analysis in multivariat...
3119343,"In computational learning theory, sample exclu...",Sample exclusion dimension,ML,in computational learning theory sample exclus...
17110513,"In statistics, additive smoothing, also called...",Additive smoothing,ML,in statistics additive smoothing also called l...


In [81]:
vect = TfidfVectorizer(min_df=2, stop_words='english')
vect.fit_transform(all_df['clean_text'])

<4125x57212 sparse matrix of type '<class 'numpy.float64'>'
	with 1367246 stored elements in Compressed Sparse Row format>

In [82]:
all_text = pd.DataFrame(X.toarray(), index = all_df.index, columns=vect.get_feature_names())

In [31]:
vector_text_df = all_text
vector_text_df.head()

Unnamed: 0,aa,aaa,aaai,aaas,aab,aac,aachen,aaf,aai,aal,...,zworykintransmissionmediacoaxial,zx,zxtimes,zxy,zybex,zynga,zynx,zytkow,zzap,zzldots
28168154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31877832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2139778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3119343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17110513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
from sklearn.decomposition import TruncatedSVD

In [76]:
SVD = TruncatedSVD(n_components=500)
component_names = ["component_"+str(i+1) for i in range(500)]

In [77]:
svd_matrix = SVD.fit_transform(vector_text_df)

In [79]:
SVD

TruncatedSVD(algorithm='randomized', n_components=500, n_iter=5,
       random_state=None, tol=0.0)

In [90]:
LSA = pd.DataFrame(svd_matrix, index = vector_text_df.index, columns = component_names)

LSA_df = LSA

LSA_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_491,component_492,component_493,component_494,component_495,component_496,component_497,component_498,component_499,component_500
28168154,0.098349,0.016547,0.001384,0.043803,0.047447,0.009481,0.050307,0.012112,-0.022939,0.028155,...,-0.011418,0.035704,0.02,-0.010939,-0.019325,0.02578,0.005823,0.01061,-0.026668,-0.017207
31877832,0.148508,0.152856,-0.035552,0.043294,0.077943,0.014639,-0.02065,0.023356,-0.063215,0.060528,...,0.013252,-0.003817,-0.02206,-0.00626,0.006105,-0.010418,-0.021794,0.019479,-0.00064,0.02364
2139778,0.127671,0.031231,-0.029331,0.034437,0.075613,-0.001592,0.00852,0.008145,-0.047915,0.037267,...,0.005993,0.036297,0.017374,-0.017606,-0.04113,-0.021373,-0.008448,-0.03904,0.035061,0.003333
3119343,0.181125,0.09099,0.042595,0.104461,0.097508,-0.039418,0.070308,-0.018137,0.019616,-0.011421,...,-0.010036,-0.006405,-0.006184,0.00716,-0.017233,0.004717,0.010312,0.017181,-0.019619,-0.001486
17110513,0.1366,0.147564,-0.038037,0.034879,0.077761,0.01429,-0.019275,0.018444,-0.036488,0.05189,...,-0.010516,-0.019475,-0.007933,-0.012813,-0.002057,0.023827,-0.025305,-0.012667,0.015898,0.005096


In [69]:
test_query = "vectors and matrices"

In [83]:
test_query_tfdif = vect.transform([test_query])

In [84]:
test_query_tfdif.shape

(1, 57212)

In [86]:
query500 = SVD.transform(test_query_tfdif.todense().reshape(1,-1))

In [91]:
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
LSA_df['cosine_sim'] = cosine_similarity(LSA_df, query500)

In [1]:
LSA_df[['cosine_sim']].sort_values('cosine_sim', ascending=False).head(10)

NameError: name 'LSA_df' is not defined