In [71]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
import matplotlib.cm as cm
import sklearn
from sklearn.lda import LDA
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR, SVC
from sklearn.linear_model import LinearRegression, LogisticRegression , SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize    

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 15 seconds


In [2]:

# Open a SQLAlchemy session on the local SQLite file features.db.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
 
# Project-local declarations from build_db (not visible in this notebook).
from build_db import Category, Base, WebPage
# NOTE(review): `encoding` and `convert_unicode` are legacy create_engine
# options that recent SQLAlchemy releases deprecate/remove -- confirm against
# the installed version before upgrading.
engine = create_engine('sqlite:///features.db', encoding='utf8', convert_unicode=True)

# Bind the shared metadata to the engine, then create the single session
# object that the following cells query through.
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()


In [7]:
# Build the SELECT statement behind a query for all WebPage rows and print the
# SQL it compiles to for this engine.
statement = session.query(WebPage).statement
print(statement.compile(engine))

SELECT web_pages.id, web_pages.title, web_pages.url, web_pages.text, web_pages.num_divs, web_pages.num_titles, web_pages.num_refs, web_pages.cat_id 
FROM web_pages


In [9]:
# Load the result of the WebPage SELECT into a DataFrame.
# pandas accepts the SQLAlchemy selectable directly, and session.bind is the
# engine the session factory was bound to, so neither pre-compiling the
# statement nor going back through session.query(...) is needed.
dataframe = pd.read_sql(statement, session.bind)

In [13]:
# Sanity check: (rows, columns) of the loaded frame.
dataframe.shape

(6445, 8)

In [19]:
# Preview the first rows to inspect the available columns.
dataframe.head()

Unnamed: 0,id,title,url,text,num_divs,num_titles,num_refs,cat_id
0,1,68 Dean Street,http://www.sixty8.com/,,0,1,4,Art
1,2,Abandoned Communities,http://www.abandonedcommunities.co.uk/,&#13; &#13; &#13; &#13; &#13; &#13; Since the ...,38,1,16,Art
2,3,Alexander Thomson Society,http://www.greekthomson.com/,&#13; &#13; &#13; &#13; &#13; Questions? We’re...,59,1,105,Art
3,4,"Arab British Centre, The",http://www.arabbritishcentre.org.uk/,,0,0,0,Art
4,5,Architectural Association School of Architecture,http://www.aaschool.ac.uk/,&#13; &#13; &#13; &#13; &#13; &#13; &#13; &#13...,111,1,116,Art


In [64]:
# Target: the cat_id column (category labels such as 'Art' in the preview).
y = dataframe['cat_id']
# Features: only the page text, kept as a one-column DataFrame.
X = pd.DataFrame(dataframe['text'])

In [None]:
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes every token.

    Calling an instance with a document string returns a list holding the
    result of ``WordNetLemmatizer.lemmatize`` for each token produced by
    ``word_tokenize``, in order.

    NOTE(review): this cell has no execution count and the class is not
    passed to the CountVectorizer created later in the notebook.
    """

    def __init__(self):
        # A single lemmatizer instance is reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [65]:
# Fit a label encoder on the category labels in y.
le = LabelEncoder()
le.fit(y)

LabelEncoder()

In [66]:
# Encode the labels with the fitted encoder and wrap the result in a Series.
y_num = pd.Series(le.transform(y))

In [67]:
# Count vectorizer with default settings.
# NOTE(review): no custom tokenizer is supplied, so LemmaTokenizer is unused.
count_vect = CountVectorizer()

In [68]:
# Fit the vectorizer on the full text column and build the count matrix.
X_train_counts = count_vect.fit_transform(X['text'])

In [28]:
# Dimensions of the count matrix produced by the vectorizer.
X_train_counts.shape

(6445, 98136)

In [31]:
# Spot check: look up the entry stored for the token 'sports' in the fitted vocabulary.
count_vect.vocabulary_.get(u'sports')

75942

In [35]:
# Fit a tf-idf transformer on the count matrix and apply it in one step,
# then display the shape of the transformed matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(6445, 98136)

In [52]:
# Fit a multinomial naive Bayes classifier on the complete tf-idf matrix and
# the encoded labels.
# NOTE(review): clf is not used by any later cell visible in this notebook.
clf = MultinomialNB().fit(X_train_tfidf, y_num)

In [53]:
# Two ad-hoc documents pushed through the already-fitted vectorizer and tf-idf
# transformer (transform only, nothing is refitted).
# NOTE(review): X_new_tfidf is not passed to a classifier in the visible cells.
docs_new = ['Programming, algorithm', 'Soccer player']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [62]:
# 2-fold cross-validation of a seeded SGD classifier (hinge loss, L2 penalty)
# on the tf-idf features.
# NOTE(review): the count vectorizer and tf-idf transformer were fitted on the
# whole corpus before the folds are split, so held-out documents contribute to
# the vocabulary and IDF weights.
scores = cross_val_score(
    SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=15, random_state=42),
    X_train_tfidf,
    y_num,
    cv=2
)
print("scores: %s  mean: %f  std: %f" % (str(scores), scores.mean(), scores.std()))

scores: [ 0.66294574  0.6257764 ]  mean: 0.644361  std: 0.018585


In [63]:
# 2-fold cross-validation of a random forest with otherwise default settings
# on the tf-idf features.
# The forest is seeded (same seed as the SGD run) so that the reported scores
# are reproducible from one run to the next; an unseeded forest gives
# different scores on every execution.
scores = cross_val_score(RandomForestClassifier(random_state=42), X_train_tfidf, y_num, cv=2)
print("scores: %s  mean: %f  std: %f" % (str(scores), np.mean(scores), np.std(scores)))

scores: [ 0.64682171  0.60217391]  mean: 0.624498  std: 0.022324
