In [2]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json
import nltk
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS
stop = set(stopwords.words('english'))

In [3]:
client = pymongo.MongoClient('localhost', 27017)

wiki_db = client.wikipedia

wiki_col = wiki_db.my_collection

client.database_names(), wiki_db.collection_names()

(['local', 'myWiki', 'wikipedia'], ['my_collection'])

In [4]:
wiki_col.count()

5785

In [5]:
cursor = wiki_col.find()

wiki_df = pd.DataFrame(list(cursor))

In [6]:
wiki_df.drop_duplicates(subset=['page_id'], inplace=True)

## Create models to predict categories

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()
wiki_df['cat_numerical'] = le.fit_transform(wiki_df['main_cat'])

wiki_df['cat_numerical'].value_counts()

0    3124
1    1131
Name: cat_numerical, dtype: int64

In [9]:
wiki_df.head()

Unnamed: 0,_id,article,content,main_cat,page_id,sub_cat,cat_numerical
0,5ae72dda023fe31d68a69cf4,Business software,software make business business sell softwar...,Business software,1037763,Business software,0
1,5ae72ddb023fe31d68a69cf5,AccuSystems,multiple issue orphan date february notabili...,Business software,41270069,Business software,0
2,5ae72ddb023fe31d68a69cf6,Active policy management,active policy management business orient ent...,Business software,5211212,Business software,0
3,5ae72ddb023fe31d68a69cf7,Alexandria (library software),use alexandria alexandria browser base softw...,Business software,28502793,Business software,0
4,5ae72ddb023fe31d68a69cf8,Alteryx,infobox company name alteryx inc logo altery...,Business software,44133735,Business software,0


## Create GS Pipeline with LogisticRegression to predict category

In [13]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib
from sklearn.naive_bayes import BernoulliNB

In [14]:
X = wiki_df['content']
y = wiki_df['cat_numerical']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

nlp_pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('svd', TruncatedSVD()),
    ('clf', LogisticRegression())
])

params = {
    'vec__ngram_range':[(1,2)],
    'vec__min_df':[10,30],
    'svd__n_components':[100,300,500],
    'clf__C': np.logspace(-2,4,7)
}

nlp_gs = GridSearchCV(nlp_pipe, 
                      params, 
                      cv=StratifiedShuffleSplit(5, random_state=42))

nlp_gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size='default',
            train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__ngram_range': [(1, 2)], 'vec__min_df': [10, 30], 'svd__n_components': [100, 300, 500], 'clf__C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   1.00000e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
lr_gs = nlp_gs.best_estimator_

In [16]:
joblib.dump(lr_gs, 'lr_gs.pkl')

['lr_gs.pkl']

In [17]:
nlp_gs.score(X_test, y_test)

0.98778195488721809

In [18]:
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, classification_report

In [19]:
def score_eval(model, X_test, y_test):
    preds = model.predict(X_test)
    return {
        'roc_auc': roc_auc_score(y_test, preds),
        'accuracy': accuracy_score(y_test, preds),
        'confmat': confusion_matrix(y_test, preds),
        'clf_rep': classification_report(y_test, preds)
    }

In [20]:
results = score_eval(nlp_gs, X_test, y_test)

print('-'*60)
for key, value in results.items():
    print(key)
    print(value)
    print('-'*60)

------------------------------------------------------------
roc_auc
0.982618671891
------------------------------------------------------------
accuracy
0.987781954887
------------------------------------------------------------
confmat
[[777   5]
 [  8 274]]
------------------------------------------------------------
clf_rep
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       782
          1       0.98      0.97      0.98       282

avg / total       0.99      0.99      0.99      1064

------------------------------------------------------------


In [21]:
def predict_category(source_test, model):
    source_test = [source_test]
   
    predicted = model.predict(source_test)
    predicted_probas = model.predict_proba(source_test)

    return predicted, predicted_probas


In [24]:
# 0 refers to Business Software and 1 refers to Maching Learning 
predict_category('Each row of the matrix represents the instances in a predicted class', lr_gs)

(array([1]), array([[ 0.04643886,  0.95356114]]))

## Create GS Pipeline with BernoulliNB to predict category

In [25]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [26]:
X = wiki_df['content']
y = wiki_df['cat_numerical']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

nlp_pipe = Pipeline([
    ('vec', TfidfVectorizer()), 
    ('clf', BernoulliNB())
])

params = {
    'vec__min_df':[10,30],
}

nlp_bayes_gs = GridSearchCV(nlp_pipe, 
                      params, 
                      cv=StratifiedShuffleSplit(5, random_state=42))

nlp_bayes_gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size='default',
            train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ... vocabulary=None)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__min_df': [10, 30]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [27]:
bayes_gs = nlp_bayes_gs.best_estimator_

In [28]:
nlp_bayes_gs

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size='default',
            train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ... vocabulary=None)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__min_df': [10, 30]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [29]:
nlp_bayes_gs.score(X_test,y_test)

0.93890977443609025

In [30]:
bayes_results = score_eval(nlp_bayes_gs, X_test, y_test)

print('-'*60)
for key, value in bayes_results.items():
    print(key)
    print(value)
    print('-'*60)

------------------------------------------------------------
roc_auc
0.914227022909
------------------------------------------------------------
accuracy
0.938909774436
------------------------------------------------------------
confmat
[[756  26]
 [ 39 243]]
------------------------------------------------------------
clf_rep
             precision    recall  f1-score   support

          0       0.95      0.97      0.96       782
          1       0.90      0.86      0.88       282

avg / total       0.94      0.94      0.94      1064

------------------------------------------------------------


In [31]:
joblib.dump(bayes_gs, 'nlp_bayes_gs.pkl')

['nlp_bayes_gs.pkl']

In [32]:
predict_category(' It focused on distributed deep learning by partitioning the model\
and data onto nodes in a cluster and parallelize the training', bayes_gs)

(array([0]), array([[  1.00000000e+00,   5.74788946e-13]]))