In [3]:
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
connection = pg2.connect(host='postgres',
                         user='postgres',
                         database='postgres')
cursor = connection.cursor(cursor_factory=RealDictCursor)

In [5]:
# 
cursor.execute("""SELECT * FROM pages p JOIN page_category c ON p.page_id=c.page_id""")

In [6]:
page_cat_df = pd.DataFrame(cursor.fetchall())

In [7]:
page_cat_df.shape

(2658, 5)

In [8]:
duplicate_mask = page_cat_df.duplicated(subset='page_id', keep=False)

page_cat_df.loc[duplicate_mask]

Unnamed: 0,category_id,page_category_id,page_id,page_text,page_title
160,2,2946,41732818,qloo pronounced clue is a company that uses ar...,qloo
161,1,1408,41732818,qloo pronounced clue is a company that uses ar...,qloo
620,2,2900,462546,fico originally fair isaac and company is a da...,fico
621,1,1840,462546,fico originally fair isaac and company is a da...,fico
636,2,2515,12185719,kxen was an american software company which ex...,kxen inc
637,1,1853,12185719,kxen was an american software company which ex...,kxen inc
670,2,2962,36089423,solveit software pty ltd is a provider of adva...,solveit software
671,1,1879,36089423,solveit software pty ltd is a provider of adva...,solveit software


In [9]:
page_cat_df = page_cat_df[~duplicate_mask]

In [10]:
page_cat_df.shape

(2650, 5)

In [11]:
to_vectorize = page_cat_df['page_text']
y = page_cat_df['category_id']

In [12]:
tfidf = TfidfVectorizer(min_df = 7, stop_words='english')
X = tfidf.fit_transform(to_vectorize)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7)

In [14]:
lr = LogisticRegression()

In [15]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
lr.score(X_test, y_test)

0.97361809045226133

In [17]:
lr.score(X_train, y_train)

0.98327939590075508

In [18]:
from sklearn.metrics import confusion_matrix, precision_score, classification_report

In [19]:
preds = lr.predict(X_test)
print (confusion_matrix(y_test, preds))

[[288  21]
 [  0 487]]


In [20]:
print (classification_report(y_test, preds))

             precision    recall  f1-score   support

          1       1.00      0.93      0.96       309
          2       0.96      1.00      0.98       487

avg / total       0.97      0.97      0.97       796



In [21]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier())
])

params = {
    'clf__n_estimators': [10, 50, 100],
    'clf__max_depth': [None, 3, 50, 100]
}

rfgs = GridSearchCV(pipe, params, cv = 5, n_jobs=-1)

rfgs.fit(X_train.todense(), y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__max_depth': [None, 3, 50, 100], 'clf__n_estimators': [10, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [22]:
preds = rfgs.predict(X_test.todense())
print (confusion_matrix(y_test, preds))
print (classification_report(y_test, preds))

[[290  19]
 [  5 482]]
             precision    recall  f1-score   support

          1       0.98      0.94      0.96       309
          2       0.96      0.99      0.98       487

avg / total       0.97      0.97      0.97       796



In [23]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

In [24]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [25]:
gbc.score(X_test.toarray(), y_test)

0.95854271356783916

In [26]:
preds = gbc.predict(X_test.todense())
print (confusion_matrix(y_test, preds))
print (classification_report(y_test, preds))

[[283  26]
 [  7 480]]
             precision    recall  f1-score   support

          1       0.98      0.92      0.94       309
          2       0.95      0.99      0.97       487

avg / total       0.96      0.96      0.96       796



In [27]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', GradientBoostingClassifier())
])

params = {
    'clf__n_estimators': [80, 100, 150],
    'clf__max_depth': [1, 2, 3]
}

gb_grid = GridSearchCV(pipe, params, cv =5, n_jobs=-1, verbose=2)

In [28]:
gb_grid.fit(X_train.todense(), y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] clf__max_depth=1, clf__n_estimators=80 ..........................
[CV] clf__max_depth=1, clf__n_estimators=80 ..........................
[CV] clf__max_depth=1, clf__n_estimators=80 ..........................
[CV] clf__max_depth=1, clf__n_estimators=80 ..........................
[CV] ........... clf__max_depth=1, clf__n_estimators=80, total=  53.1s
[CV] clf__max_depth=1, clf__n_estimators=80 ..........................
[CV] ........... clf__max_depth=1, clf__n_estimators=80, total=  54.0s
[CV] ........... clf__max_depth=1, clf__n_estimators=80, total=  56.2s
[CV] clf__max_depth=1, clf__n_estimators=100 .........................
[CV] clf__max_depth=1, clf__n_estimators=100 .........................
[CV] ........... clf__max_depth=1, clf__n_estimators=80, total=  58.5s
[CV] clf__max_depth=1, clf__n_estimators=100 .........................
[CV] ........... clf__max_depth=1, clf__n_estimators=80, total=  53.3s
[CV] clf__max_dep

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 27.3min


[CV] clf__max_depth=3, clf__n_estimators=100 .........................
[CV] ........... clf__max_depth=3, clf__n_estimators=80, total= 4.6min
[CV] clf__max_depth=3, clf__n_estimators=100 .........................
[CV] ........... clf__max_depth=3, clf__n_estimators=80, total= 4.6min
[CV] clf__max_depth=3, clf__n_estimators=100 .........................
[CV] .......... clf__max_depth=3, clf__n_estimators=100, total= 5.7min
[CV] clf__max_depth=3, clf__n_estimators=100 .........................
[CV] .......... clf__max_depth=3, clf__n_estimators=100, total= 5.2min
[CV] clf__max_depth=3, clf__n_estimators=150 .........................
[CV] .......... clf__max_depth=3, clf__n_estimators=100, total= 5.7min
[CV] clf__max_depth=3, clf__n_estimators=150 .........................
[CV] .......... clf__max_depth=3, clf__n_estimators=100, total= 6.7min
[CV] clf__max_depth=3, clf__n_estimators=150 .........................
[CV] .......... clf__max_depth=3, clf__n_estimators=100, total= 5.5min
[CV] c

[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 45.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1...=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__max_depth': [1, 2, 3], 'clf__n_estimators': [80, 100, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [29]:
gb_grid.score(X_test.todense(), y_test)

0.95854271356783916

In [31]:
preds = gb_grid.predict(X_test.todense())
print (confusion_matrix(y_test, preds))
print (classification_report(y_test, preds))

[[283  26]
 [  7 480]]
             precision    recall  f1-score   support

          1       0.98      0.92      0.94       309
          2       0.95      0.99      0.97       487

avg / total       0.96      0.96      0.96       796



In [35]:
# pd.DataFrame(gb_grid.cv_results_).sort_values('rank_test_score').head(5)

In [69]:
def predict_category(search_query):
    search_query = pd.Series(search_query)
    query_vec = tfidf.transform(search_query)
    prediction = lr.predict(query_vec)
    probas = lr.predict_proba(query_vec)
    if prediction == 1:
        prediction = 'machine learning'
    else:
        prediction = 'business software'
    print ('''this query most likely falls under: {}'''.format(prediction))
    return pd.DataFrame(probas, columns = ['machine learning', 'business software'])

In [72]:
marx = """The Communists are distinguished from the other working-class 
parties by this only: 1. In the national struggles of the proletarians of 
the different countries, they point out and bring to the front the common 
interests of the entire proletariat, independently of all nationality. 2.
In the various stages of development which the struggle of the working class 
against the bourgeoisie has to pass through, they always and everywhere represent 
the interests of the movement as a whole.
The Communists, therefore, are on the one hand, practically, 
the most advanced and resolute section of the working-class parties of every country,
that section which pushes forward all others; on the other hand, 
theoretically, they have over the great mass of the proletariat the 
advantage of clearly understanding the line of march, the conditions, 
and the ultimate general results of the proletarian movement."""

In [73]:
predict_category(marx)

this query most likely falls under: business software


Unnamed: 0,machine learning,business software
0,0.457771,0.542229


In [74]:
test = 'the quick brown fox jumped over the lazy dogs'

In [75]:
predict_category(test)

this query most likely falls under: business software


Unnamed: 0,machine learning,business software
0,0.458686,0.541314


In [76]:
neural_network = 'neural networks'
predict_category(neural_network)

this query most likely falls under: machine learning


Unnamed: 0,machine learning,business software
0,0.938165,0.061835


In [103]:
import requests
import re 

def cleaner(message):
    message = re.sub('\.+', ' ', message)
    message = re.sub('[^a-z0-9 ]','', message.lower())
    message = re.sub('\d+','NUMBER ',message)
    message = re.sub('\s+',' ',message)
    return message

def predict_by_url(url): 
    url = url.split("/")
    page = url[-1]
    wiki_query = \
    'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&explaintext&titles={}'.format(page)
    response = requests.get(wiki_query)
    info = response.json()
    pageid = list(info['query']['pages'].keys())[0]
    text = info['query']['pages'][pageid]['extract']
    text = cleaner(text)
    text = pd.Series(text)
    vectorized = tfidf.transform(text)
    prediction = lr.predict(vectorized)
    probas = lr.predict_proba(vectorized)
    if prediction == 1:
        prediction = 'machine learning'
    else:
        prediction = 'business software'
    print ('''this url most likely falls under: {}'''.format(prediction))
    return pd.DataFrame(probas, columns = ['machine learning', 'business software'])

In [105]:
url = "https://en.wikipedia.org/wiki/Machine_learning"
predict_by_url(url)

this url most likely falls under: machine learning


Unnamed: 0,machine learning,business software
0,0.998942,0.001058
