In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
import matplotlib.cm as cm
import sklearn
from sklearn.lda import LDA
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR, SVC
from sklearn.linear_model import LinearRegression, LogisticRegression , SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize    
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score

Autosaving every 15 seconds


In [3]:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
 
from build_db import Category, Base, WebPage
# Create the SQLAlchemy engine for the local SQLite file `features.db`.
engine = create_engine('sqlite:///features.db', encoding='utf8', convert_unicode=True)

# Attach the engine to the metadata of `Base` (imported from build_db), then
# build a session factory bound to the same engine and open one session.
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()


In [4]:
# Take the SELECT statement behind the WebPage query and print the SQL
# produced when it is compiled against the engine.
statement = session.query(WebPage).statement
print(statement.compile(engine))

SELECT web_pages.id, web_pages.title, web_pages.url, web_pages.text, web_pages.num_divs, web_pages.num_titles, web_pages.num_refs, web_pages.cat_id 
FROM web_pages


In [5]:
# Run the compiled WebPage SELECT and load the result into a DataFrame.
# The connection is the session's own bind.
dataframe = pd.read_sql(sql=statement.compile(engine), con=session.bind)

In [6]:
# Show (rows, columns) of the loaded table.
dataframe.shape

(6662, 8)

In [112]:
# Preview the first rows of the loaded table.
dataframe.head()

Unnamed: 0,id,title,url,text,num_divs,num_titles,num_refs,cat_id
0,1,68 Dean Street,http://www.sixty8.com/,,0,1,4,Art
1,2,Abandoned Communities,http://www.abandonedcommunities.co.uk/,&#13; &#13; &#13; &#13; &#13; &#13; Since the ...,38,1,16,Art
2,3,Alexander Thomson Society,http://www.greekthomson.com/,&#13; &#13; &#13; &#13; &#13; Questions? We’re...,59,1,105,Art
3,4,"Arab British Centre, The",http://www.arabbritishcentre.org.uk/,,0,0,0,Art
4,5,Architectural Association School of Architecture,http://www.aaschool.ac.uk/,&#13; &#13; &#13; &#13; &#13; &#13; &#13; &#13...,111,1,116,Art


In [7]:
# Target: the category column. Features: the page text, kept as a
# one-column DataFrame so it can be addressed as X['text'].
y = dataframe['cat_id']
X = dataframe['text'].to_frame()

In [8]:
# Fit a label encoder on the category labels so they can be turned into
# integer codes (and mapped back again).
le = LabelEncoder()
le.fit(y)

LabelEncoder()

In [9]:
# Integer-encoded category labels, wrapped in a new Series (default index).
y_num = pd.Series(le.transform(y))

In [10]:
# Linear classifier trained with SGD: hinge loss, L2 penalty, fixed seed.
model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)

In [11]:
# Pipeline covering the whole process: vectorization, tf-idf re-weighting
# and the classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
])

In [12]:
# Cross-validate the pipeline on the page text (4 folds) and report the
# per-fold scores together with their mean and standard deviation.
scores = cross_val_score(text_clf, X['text'], y_num, cv=4)
print("scores: %s  mean: %f  std: %f" % (str(scores), scores.mean(), scores.std()))

scores: [ 0.6124775   0.66326531  0.65486194  0.55742634]  mean: 0.622008  std: 0.041962


In [17]:
# Alternative feature set: only the page title, kept as a one-column frame.
X_title = dataframe['title'].to_frame()
X_title.head()

Unnamed: 0,title
0,68 Dean Street
1,Abandoned Communities
2,Alexander Thomson Society
3,"Arab British Centre, The"
4,Architectural Association School of Architecture


In [13]:
# NOTE(review): re-creates an SGDClassifier with the same arguments as the
# earlier `model`; the pipeline built in the next cell uses MultinomialNB and
# does not reference this object.
model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)

In [14]:
# Same vectorization + tf-idf front end, now feeding a multinomial naive
# Bayes classifier (class priors learned from the data).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=True)),
])

In [18]:
# Cross-validate the pipeline on the titles only (5 folds) and report the
# per-fold scores together with their mean and standard deviation.
scores = cross_val_score(text_clf, X_title['title'], y_num, cv=5)
print("scores: %s  mean: %f  std: %f" % (str(scores), scores.mean(), scores.std()))

scores: [ 0.57709581  0.62565641  0.63315829  0.60601504  0.53609023]  mean: 0.595603  std: 0.035514


In [19]:
# Preview of the title feature frame (same call as in an earlier cell).
X_title.head()

Unnamed: 0,title
0,68 Dean Street
1,Abandoned Communities
2,Alexander Thomson Society
3,"Arab British Centre, The"
4,Architectural Association School of Architecture


In [20]:
# Hold out 33% of the titles for testing, with a fixed seed. No stratify
# argument is passed, so rare classes may be thinly represented in a split.
X_train, X_test, y_train, y_test = train_test_split(X_title, y_num, test_size=0.33, random_state=42)

In [21]:
# Fit the current text_clf pipeline on the training titles.
text_clf.fit(X_train['title'], y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [22]:
# Predict categories for the held-out titles; the final expression is the
# fraction of test samples whose prediction matches the true label.
predicted = text_clf.predict(X_test['title'])
(predicted == y_test).mean()

0.62164620281946337

In [23]:
# Confusion matrix of the true test labels (first argument) against the
# predictions (second argument). Per scikit-learn's documented convention,
# rows are true classes and columns are predicted classes.
confusion_matrix(y_test, predicted)

array([[1154,    8,    2,    0,    0,    0],
       [ 295,  105,    9,    0,    0,    0],
       [ 209,    4,  108,    0,    0,    0],
       [   8,    0,    0,    0,    0,    0],
       [  89,    0,    1,    0,    0,    0],
       [ 201,    4,    2,    0,    0,    0]])

In [24]:
# Print every category name followed by its number of samples.
# Iterating over le.classes_ avoids hard-coding the number of classes and
# avoids handing a bare scalar to inverse_transform; position i in classes_
# is the integer code that the encoder assigns to that label.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
for i, label in enumerate(le.classes_):
    print(label)
    print((y_num == i).sum())

Art
3483
Business
1263
Education
971
Lifestyle
21
Sports
248
Tech
676
