In [40]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import string

import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
import matplotlib.cm as cm
import sklearn
from sklearn.lda import LDA
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.linear_model import LinearRegression, LogisticRegression , SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize    
from nltk.stem.porter import PorterStemmer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 15 seconds


In [26]:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
 
from build_db import Category, Base, WebPage
engine = create_engine('sqlite:///features.db', encoding='utf8', convert_unicode=True)

Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()


In [3]:
statement = session.query(WebPage).statement
dataframe = pd.read_sql(statement.compile(engine), session.query(WebPage).session.bind)

In [4]:
dataframe.shape

(4480, 8)

In [5]:
y = dataframe['cat_id']
X = pd.DataFrame(dataframe['text'])

In [6]:
# transform label to numerical values
le = LabelEncoder()
le.fit(y)
y_num = pd.Series(le.transform(y))

In [74]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [28]:
vect = CountVectorizer(tokenizer=tokenize, stop_words='english') 

In [33]:
def classify2(X, y_num, model):
    # pipeline con todo el proceso: vectorización, ajuste de pesos y modelo
    text_clf = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, strip_accents='unicode', stop_words='english')), 
                         ('tfidf', TfidfTransformer(sublinear_tf=True)), 
                         ('clf',model),])
    scores =  cross_val_score(text_clf, X['text'], y_num, cv=2)
    print("scores: %s  mean: %f  std: %f" % (str(scores), np.mean(scores), np.std(scores)))

In [34]:
model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=10, random_state=12)
classify2(X, y_num, model)

scores: [ 0.5849309   0.55654895]  mean: 0.570740  std: 0.014191


In [47]:
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X['text']), y_num, test_size=0.15, random_state=42)

In [75]:
text_clf = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, strip_accents='unicode', stop_words='english')), 
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),  
                     ('clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=20, random_state=12)),])

In [76]:
text_clf.fit(X_train['text'], y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
     ...     penalty='l2', power_t=0.5, random_state=12, shuffle=True, verbose=0,
       warm_start=False))])

In [77]:
predicted = text_clf.predict(X_test['text'])

In [78]:
confusion_matrix(y_test, predicted)

array([[149,  20,  13,   0,   2,   3],
       [ 32, 120,  11,   0,   4,   4],
       [ 27,   7, 115,   0,   2,   1],
       [  2,   1,   0,   0,   0,   0],
       [ 14,  11,   5,   0,  31,   4],
       [ 33,  29,   9,   0,   4,  19]])

In [79]:
print classification_report(y_test, predicted)


             precision    recall  f1-score   support

          0       0.58      0.80      0.67       187
          1       0.64      0.70      0.67       171
          2       0.75      0.76      0.75       152
          3       0.00      0.00      0.00         3
          4       0.72      0.48      0.57        65
          5       0.61      0.20      0.30        94

avg / total       0.65      0.65      0.63       672



In [73]:
for i in range(6):
    print le.inverse_transform(i)
    print len(y_num[y_num==i])

Art
1123
Business
1257
Education
967
Lifestyle
19
Sports
447
Tech
667
