In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import string
import pickle

import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt


import matplotlib.cm as cm
import sklearn
from sklearn.decomposition import TruncatedSVD 
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.linear_model import LinearRegression, LogisticRegression , SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize    
from nltk.stem.porter import PorterStemmer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score

Autosaving every 15 seconds


In [2]:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
 
from build_db import Category, Base, WebPage
engine = create_engine('sqlite:///features.db', encoding='utf8', convert_unicode=True)

Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()


In [3]:
statement = session.query(WebPage).statement
dataframe = pd.read_sql(statement.compile(engine), session.query(WebPage).session.bind)

In [4]:
dataframe.shape

(4480, 8)

In [5]:
y = dataframe['cat_id']
X = pd.DataFrame(dataframe['text'])

In [6]:
# transform label to numerical values
le = LabelEncoder()
le.fit(y)
y_num = pd.Series(le.transform(y))

In [7]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [8]:
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X['text']), y_num, test_size=0.15, random_state=42)

In [10]:
# pipeline using singular value decomposition
clf = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, strip_accents='unicode', stop_words='english')), 
                ('tfidf', TfidfTransformer(sublinear_tf=True)),  
                ('svd', TruncatedSVD()),
                ('clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=20, random_state=12)),])

In [35]:
clf.fit(X_train['text'], y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
     ...     penalty='l2', power_t=0.5, random_state=12, shuffle=True, verbose=0,
       warm_start=False))])

In [36]:
predicted = clf.predict(X_test['text'])

In [48]:
n_components = [300, 400, 500, 700, 1000]

In [None]:
estimator = GridSearchCV(clf, dict(svd__n_components=n_components), verbose=True)
estimator.fit(X['text'], y_num)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 30.0min finished


In [None]:
predicted = estimator.predict(X_test['text'])

In [46]:
estimator.best_score_

0.54821428571428577

In [43]:
confusion_matrix(y_test, predicted)

array([[ 99,  64,  18,   0,   2,   4],
       [  7, 142,  11,   0,   4,   7],
       [  8,  20, 124,   0,   0,   0],
       [  1,   2,   0,   0,   0,   0],
       [  7,  14,   7,   0,  35,   2],
       [ 16,  43,  13,   0,   1,  21]])

In [44]:
print classification_report(y_test, predicted)


             precision    recall  f1-score   support

          0       0.72      0.53      0.61       187
          1       0.50      0.83      0.62       171
          2       0.72      0.82      0.76       152
          3       0.00      0.00      0.00         3
          4       0.83      0.54      0.65        65
          5       0.62      0.22      0.33        94

avg / total       0.66      0.63      0.61       672



In [11]:
n_components = [600,700,800,900]

In [12]:
estimator = GridSearchCV(clf, dict(svd__n_components=n_components), verbose=100)
estimator.fit(X['text'], y_num)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] svd__n_components=600 ...........................................
[CV] .................. svd__n_components=600, score=0.537433 - 1.1min
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  1.1min
[CV] svd__n_components=600 ...........................................
[CV] .................. svd__n_components=600, score=0.605228 - 1.2min
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:  2.3min
[CV] svd__n_components=600 ...........................................
[CV] .................. svd__n_components=600, score=0.503351 - 1.1min
[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:  3.4min
[CV] svd__n_components=700 ...........................................
[CV] .................. svd__n_components=700, score=0.536765 - 1.4min
[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:  4.9min
[CV] svd__n_components=700 ...........................................
[CV] .................. svd__n_components=700, score=0.60

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
     ...     penalty='l2', power_t=0.5, random_state=12, shuffle=True, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'svd__n_components': [600, 700, 800, 900]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=100)

In [14]:
estimator.best_params_

{'svd__n_components': 900}

In [17]:
predicted = estimator.predict(X_test['text'])

In [19]:
confusion_matrix(y_test, predicted)

array([[102,  66,  15,   0,   2,   2],
       [  6, 150,  11,   0,   2,   2],
       [  5,  19, 127,   0,   0,   1],
       [  1,   1,   0,   1,   0,   0],
       [  6,  15,   6,   0,  36,   2],
       [ 13,  38,  11,   0,   2,  30]])

In [18]:
print classification_report(y_test, predicted)


             precision    recall  f1-score   support

          0       0.77      0.55      0.64       187
          1       0.52      0.88      0.65       171
          2       0.75      0.84      0.79       152
          3       1.00      0.33      0.50         3
          4       0.86      0.55      0.67        65
          5       0.81      0.32      0.46        94

avg / total       0.72      0.66      0.65       672



In [73]:
for i in range(6):
    print le.inverse_transform(i)
    print len(y_num[y_num==i])

Art
1123
Business
1257
Education
967
Lifestyle
19
Sports
447
Tech
667


In [49]:
# pipeline using singular value decomposition
clf = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, strip_accents='unicode', stop_words='english')), 
                ('tfidf', TfidfTransformer(sublinear_tf=True)),  
                ('svd', TruncatedSVD(n_components=900)),
                ('clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=20, random_state=12)),])

In [12]:
clf.fit(X_train['text'], y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
     ...     penalty='l2', power_t=0.5, random_state=12, shuffle=True, verbose=0,
       warm_start=False))])

In [13]:
predicted = clf.predict(X_test['text'])
print classification_report(y_test, predicted)


             precision    recall  f1-score   support

          0       0.64      0.55      0.59       187
          1       0.47      0.80      0.60       171
          2       0.76      0.75      0.75       152
          3       0.00      0.00      0.00         3
          4       0.78      0.48      0.59        65
          5       0.66      0.22      0.33        94

avg / total       0.64      0.60      0.59       672



  'precision', 'predicted', average, warn_for)


In [9]:
from sklearn.externals import joblib

In [19]:
joblib.dump(clf,"serialized/mymodel.pkl")

['serialized/mymodel.pkl',
 'serialized/mymodel.pkl_01.npy',
 'serialized/mymodel.pkl_02.npy',
 'serialized/mymodel.pkl_03.npy',
 'serialized/mymodel.pkl_04.npy',
 'serialized/mymodel.pkl_05.npy',
 'serialized/mymodel.pkl_06.npy',
 'serialized/mymodel.pkl_07.npy',
 'serialized/mymodel.pkl_08.npy',
 'serialized/mymodel.pkl_09.npy']

In [10]:
estimator = joblib.load("serialized/mymodel.pkl")

In [11]:
predicted = estimator.predict(X_test['text'])
print classification_report(y_test, predicted)

             precision    recall  f1-score   support

          0       0.64      0.55      0.59       187
          1       0.47      0.80      0.60       171
          2       0.76      0.75      0.75       152
          3       0.00      0.00      0.00         3
          4       0.78      0.48      0.59        65
          5       0.66      0.22      0.33        94

avg / total       0.64      0.60      0.59       672



  'precision', 'predicted', average, warn_for)


In [29]:
le.inverse_transform(estimator.predict(['Computers ipod apple iphone']))

array([u'Tech'], dtype=object)

In [73]:
pd.Series(['algorithms math', 'aasd', 'aaaa'])

0    algorithms math
1               aasd
2               aaaa
dtype: object

In [26]:
for i in range(6):
    print le.inverse_transform(i)
    print len(y_num[y_num==i])

Art
1123
Business
1257
Education
967
Lifestyle
19
Sports
447
Tech
667
