In [32]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
import matplotlib.cm as cm
import sklearn
from sklearn.lda import LDA
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.linear_model import LinearRegression, LogisticRegression , SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize    
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 15 seconds


In [2]:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from build_db import Category, Base, WebPage

# Engine for the features.db SQLite file (path is relative to the working
# directory), with text handled as UTF-8 / unicode.
engine = create_engine(
    'sqlite:///features.db',
    encoding='utf8',
    convert_unicode=True,
)

# Session factory bound to that engine; the declarative metadata is bound to
# the same engine, and a single session is opened for the queries below.
DBSession = sessionmaker(bind=engine)
Base.metadata.bind = engine
session = DBSession()


In [3]:
# Build the SELECT that fetches every WebPage row, and print the SQL as
# compiled for this engine to check which columns will be loaded.
statement = session.query(WebPage).statement
print(statement.compile(engine))

SELECT web_pages.id, web_pages.title, web_pages.url, web_pages.text, web_pages.num_divs, web_pages.num_titles, web_pages.num_refs, web_pages.cat_id 
FROM web_pages


In [4]:
# Execute the compiled SELECT and load the result into a DataFrame.
# The connection used is the engine the session is bound to.
dataframe = pd.read_sql(
    statement.compile(engine),
    session.bind,
)

In [5]:
# Sanity check: (rows, columns) of the loaded table.
dataframe.shape

(4480, 8)

In [6]:
# Preview the first rows to inspect the columns and their contents.
dataframe.head()

Unnamed: 0,id,title,url,text,num_divs,num_titles,num_refs,cat_id
0,159,2Cs Communications,http://www.2cs.com/,,0,0,0,Art
1,160,2idesign Ltd,http://www.2idesign.co.uk/,About the Agency 2idesign is an established Gr...,50,1,84,Art
2,161,7 Acts of Love,http://www.7actsoflove.org/,7 acts of loveSay Cheese is artist Oreet Asher...,10,1,3,Art
3,162,A2 Arts: South East London Contemporary Artists,http://www.a2arts.co.uk/,Information A2 Arts is managed and supported b...,7,1,24,Art
4,163,"ADAM : Art, Design, Architecture and Media Inf...",http://www.adam.ac.uk/,,0,0,0,Art


In [7]:
# Target: the category of each page (the 'cat_id' column).
y = dataframe['cat_id']
# Features: the page text, kept as a one-column DataFrame named 'text'.
# NOTE(review): the preview of the table shows rows whose text is empty;
# confirm whether those pages should be kept for training.
X = dataframe['text'].to_frame()

In [8]:
# Fit a LabelEncoder on the category column so labels can be mapped to
# integer codes (and back with inverse_transform).
le = LabelEncoder()
le.fit(y)

LabelEncoder()

In [9]:
# Integer-encoded labels, wrapped in a Series (default 0..n-1 index).
y_num = pd.Series(le.transform(y))

In [10]:
# Linear classifier trained with SGD: hinge loss, L2 penalty, 5 iterations,
# fixed seed for reproducibility.
model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)

In [41]:
# Pipeline covering the whole process: vectorization (unigrams + bigrams),
# tf-idf weighting and the model.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
])

In [12]:
# 4-fold cross-validation of the whole pipeline on the page text
scores = cross_val_score(text_clf, X['text'], y_num, cv=4)
print(
    "scores: %s  mean: %f  std: %f"
    % (str(scores), scores.mean(), scores.std())
)

scores: [ 0.56951872  0.61016949  0.65744871  0.46774194]  mean: 0.576220  std: 0.069934


In [25]:
# Random forest with 80 trees. The seed is fixed so that repeated runs give
# the same forest, as with the other estimators in this notebook (without it
# every run produces different scores).
model = RandomForestClassifier(n_estimators=80, random_state=42)
# Pipeline covering the whole process: vectorization, tf-idf weighting and
# the model.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
])

In [43]:
def classify(X, y_num, model, cv=2):
    """Cross-validate a text-classification pipeline built around ``model``.

    The pipeline counts unigrams and bigrams, applies tf-idf weighting and
    then fits ``model``. The per-fold scores and their mean / standard
    deviation are printed.

    Parameters
    ----------
    X : object indexable by ``'text'``
        Container whose ``'text'`` entry holds the documents.
    y_num : array-like
        Labels, passed to ``cross_val_score`` unchanged.
    model : estimator
        Used as the final ``'clf'`` step of the pipeline.
    cv : int, optional
        Cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 2, the value that used to be hard-coded.

    Returns
    -------
    The scores returned by ``cross_val_score`` (one per fold), so callers can
    reuse them instead of reading the printed line.
    """
    # Pipeline covering the whole process: vectorization, tf-idf weighting
    # and the model.
    text_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])
    scores = cross_val_score(text_clf, X['text'], y_num, cv=cv)
    print("scores: %s  mean: %f  std: %f" % (str(scores), np.mean(scores), np.std(scores)))
    return scores
    

In [46]:
# Hinge-loss SGD classifier with 200 iterations, scored with classify().
model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=200, random_state=12)
classify(X, y_num, model)

scores: [ 0.57111012  0.540903  ]  mean: 0.556007  std: 0.015104


In [51]:
def classify2(X, y_num, model, cv=2):
    """Cross-validate a text pipeline with accent stripping and sublinear tf.

    The pipeline counts unigrams and bigrams with
    ``strip_accents='unicode'``, applies tf-idf weighting with
    ``sublinear_tf=True`` and then fits ``model``. The per-fold scores and
    their mean / standard deviation are printed.

    Parameters
    ----------
    X : object indexable by ``'text'``
        Container whose ``'text'`` entry holds the documents.
    y_num : array-like
        Labels, passed to ``cross_val_score`` unchanged.
    model : estimator
        Used as the final ``'clf'`` step of the pipeline.
    cv : int, optional
        Cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 2, the value that used to be hard-coded.

    Returns
    -------
    The scores returned by ``cross_val_score`` (one per fold), so callers can
    reuse them instead of reading the printed line.
    """
    # Pipeline covering the whole process: vectorization, tf-idf weighting
    # and the model.
    text_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2), strip_accents='unicode')),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('clf', model),
    ])
    scores = cross_val_score(text_clf, X['text'], y_num, cv=cv)
    print("scores: %s  mean: %f  std: %f" % (str(scores), np.mean(scores), np.std(scores)))
    return scores
    

In [54]:
# Hinge-loss SGD classifier with 20 iterations, scored with classify2().
model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=20, random_state=12)
classify2(X, y_num, model)

scores: [ 0.57556844  0.54045597]  mean: 0.558012  std: 0.017556


In [55]:
# Alternative feature set: only the page title, as a one-column DataFrame.
X_title = dataframe['title'].to_frame()
X_title.head()

Unnamed: 0,title
0,2Cs Communications
1,2idesign Ltd
2,7 Acts of Love
3,A2 Arts: South East London Contemporary Artists
4,"ADAM : Art, Design, Architecture and Media Inf..."


In [69]:
# Pipeline with accent stripping, sublinear tf scaling and a hinge-loss SGD
# classifier (20 iterations, fixed seed).
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), strip_accents='unicode')),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=20,
        random_state=12,
    )),
])

In [62]:
# 5-fold cross-validation of the pipeline using only the page titles
scores = cross_val_score(text_clf, X_title['title'], y_num, cv=5)
print(
    "scores: %s  mean: %f  std: %f"
    % (str(scores), scores.mean(), scores.std())
)

scores: [ 0.5083426   0.53170189  0.59106145  0.53243848  0.39305711]  mean: 0.511320  std: 0.065138


In [63]:
# Preview of the title-only features.
X_title.head()

Unnamed: 0,title
0,2Cs Communications
1,2idesign Ltd
2,7 Acts of Love
3,A2 Arts: South East London Contemporary Artists
4,"ADAM : Art, Design, Architecture and Media Inf..."


In [75]:
# Hold out 33% of the pages as a test set; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(X['text']),
    y_num,
    test_size=0.33,
    random_state=42,
)

In [77]:
# Fit the current text_clf pipeline on the training texts.
text_clf.fit(X_train['text'], y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        st...     penalty='l2', power_t=0.5, random_state=12, shuffle=True, verbose=0,
       warm_start=False))])

In [79]:
# Predict the held-out texts and report the fraction of exact matches
# (accuracy on the test set).
predicted = text_clf.predict(X_test['text'])
np.mean(predicted == y_test) 

0.62474645030425968

In [80]:
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted ones, in encoded-label order.
confusion_matrix(y_test, predicted)

array([[282,  40,  36,   1,   6,  11],
       [ 80, 307,  30,   0,   3,   7],
       [ 47,  21, 245,   0,   1,   5],
       [  3,   3,   0,   0,   0,   0],
       [ 32,  37,  14,   0,  49,  10],
       [ 71,  70,  27,   0,   0,  41]])

In [81]:
# Print each category name followed by the number of pages with that label.
# Iterating over le.classes_ avoids hard-coding the number of categories, and
# print() calls (as used elsewhere in this notebook) also run on Python 3,
# where the bare print statement is a syntax error.
for i, label in enumerate(le.classes_):
    print(label)
    print(len(y_num[y_num == i]))

Art
1123
Business
1257
Education
967
Lifestyle
19
Sports
447
Tech
667
