In [1]:
import os

# Step up to the parent directory so the `lib` package imported in the next
# cell resolves. The guard keeps the cell idempotent: an unconditional chdir
# would climb one more level every time the cell is re-run.
# NOTE(review): assumes `lib/` sits in the notebook's parent directory -- confirm.
if not os.path.isdir('lib'):
    os.chdir('../')

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import lib.database_module as db
from lib.general_model import make_data_dict

### Mine: Select document vectors for all pages from database

In [3]:
# Fetch the stored page vectors through the project's database helper.
# The following cell unpacks every returned row as a (page_id, vector) pair.
page_vectors = db.select_all_page_vectors()

Connected to server joshuacook.me.


In [4]:
# Index the vectors by page id; if a page id repeats, the last vector wins.
vec_dict = {page_id: vec for page_id, vec in page_vectors}

In [5]:
# orient='index' turns each dict key (page id) into a row label and spreads
# the vector components across the columns.
vec_df = pd.DataFrame.from_dict(vec_dict, orient='index')
vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
17645570,0.039274,0.055023,-0.014005,-0.01652,0.022691,-0.018724,0.002756,0.005556,-0.005737,-0.00903,...,-0.010056,-0.014171,-0.013905,0.003224,0.005246,0.0048,0.032152,0.011491,0.022782,0.003275
37019651,0.101253,0.161288,-0.224323,0.112926,-0.060645,0.066798,0.023622,0.008715,-0.017338,0.008191,...,-0.001198,-0.016804,-0.015773,-0.00929,-0.000526,0.004315,-0.003473,0.0033,-0.009625,-0.005633
9109512,0.039232,0.039217,0.008973,-0.047728,-0.015612,0.017161,0.000355,-0.002134,-0.002816,-0.015128,...,0.031067,0.01148,-0.02153,-0.004341,0.00419,0.044702,0.011483,-0.013307,-0.031249,-0.063539
45260809,0.18693,-0.060678,-0.004299,0.01209,0.001639,-0.007242,0.005464,-0.005203,0.001449,0.013998,...,0.019338,-0.001007,0.019923,0.030012,0.018544,-0.058307,-0.000886,-0.040026,-0.012019,0.010476
6565890,0.067058,0.092235,-0.103929,0.040591,-0.002851,0.008336,-0.013674,-5.2e-05,0.021902,-0.000659,...,0.00689,0.005918,-0.003518,-0.013995,-0.002671,0.020586,0.001678,-0.004499,0.003813,-0.016608


In [6]:
# Preview only: reset_index() returns a new frame and the result is not
# assigned, so vec_df itself still carries the page ids as its index.
vec_df.reset_index()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,17645570,0.039274,0.055023,-0.014005,-0.016520,0.022691,-0.018724,0.002756,0.005556,-0.005737,...,-0.010056,-0.014171,-0.013905,0.003224,0.005246,0.004800,0.032152,0.011491,0.022782,0.003275
1,37019651,0.101253,0.161288,-0.224323,0.112926,-0.060645,0.066798,0.023622,0.008715,-0.017338,...,-0.001198,-0.016804,-0.015773,-0.009290,-0.000526,0.004315,-0.003473,0.003300,-0.009625,-0.005633
2,9109512,0.039232,0.039217,0.008973,-0.047728,-0.015612,0.017161,0.000355,-0.002134,-0.002816,...,0.031067,0.011480,-0.021530,-0.004341,0.004190,0.044702,0.011483,-0.013307,-0.031249,-0.063539
3,45260809,0.186930,-0.060678,-0.004299,0.012090,0.001639,-0.007242,0.005464,-0.005203,0.001449,...,0.019338,-0.001007,0.019923,0.030012,0.018544,-0.058307,-0.000886,-0.040026,-0.012019,0.010476
4,6565890,0.067058,0.092235,-0.103929,0.040591,-0.002851,0.008336,-0.013674,-0.000052,0.021902,...,0.006890,0.005918,-0.003518,-0.013995,-0.002671,0.020586,0.001678,-0.004499,0.003813,-0.016608
5,473278,0.047669,0.035399,-0.029411,0.008497,0.003388,-0.004835,0.002253,0.002067,0.002402,...,0.007909,0.003771,0.004351,-0.021422,-0.003424,0.005479,0.000647,0.008996,-0.014971,0.019701
6,48201744,0.021301,0.027559,-0.003917,-0.010877,0.013055,0.000934,0.000128,0.010317,0.001444,...,-0.055040,0.000110,0.057817,0.005156,0.058477,0.008807,0.043527,-0.011693,0.064738,0.051296
7,344083,0.274923,-0.104057,0.019691,0.014049,-0.003216,0.007568,0.006852,-0.001262,-0.004474,...,-0.010391,0.013268,-0.002170,-0.017240,0.012587,-0.005252,-0.004479,0.023658,-0.020673,-0.000385
8,3126617,0.052763,0.046094,0.026724,-0.122854,-0.102418,0.035661,0.027753,-0.014089,0.057140,...,-0.019954,0.018640,0.001549,0.010038,-0.009172,0.013997,0.011112,0.008547,-0.005977,0.013895
9,571480,0.073720,0.098731,-0.097566,0.035727,0.009638,-0.014480,0.016176,-0.001481,0.020922,...,-0.002490,-0.003406,0.001088,-0.002010,-0.003759,-0.003641,-0.002096,-0.006657,0.000193,0.003791


### Mine: Select category ids corresponding to pages from database

In [7]:
# Pull the page -> category assignments as raw rows.
# NOTE(review): SELECT * depends on the table's column order, because the next
# cell names the columns positionally as ('page_id', 'category_id') -- confirm
# the schema or list the columns explicitly.
page_cats = db.execute_sql_statement('''SELECT * FROM page_cate''')

OK
Connected to server joshuacook.me.


In [8]:
# Wrap the raw query rows in a DataFrame with named columns.
# The name page_cats is rebound here from the raw result to the DataFrame.
page_cats = pd.DataFrame(page_cats, columns=['page_id','category_id'])

In [23]:
# Attach a category id to every page vector by matching the page id, which
# reset_index() exposes as the 'index' column.
# NOTE(review): a page listed under several categories would show up once per
# category, i.e. identical features with different labels -- confirm intended.
joined_df = vec_df.reset_index().merge(
    page_cats,
    left_on='index',
    right_on='page_id',
)
joined_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,492,493,494,495,496,497,498,499,page_id,category_id
0,17645570,0.039274,0.055023,-0.014005,-0.01652,0.022691,-0.018724,0.002756,0.005556,-0.005737,...,-0.013905,0.003224,0.005246,0.0048,0.032152,0.011491,0.022782,0.003275,17645570,956054
1,37019651,0.101253,0.161288,-0.224323,0.112926,-0.060645,0.066798,0.023622,0.008715,-0.017338,...,-0.015773,-0.00929,-0.000526,0.004315,-0.003473,0.0033,-0.009625,-0.005633,37019651,18726608
2,9109512,0.039232,0.039217,0.008973,-0.047728,-0.015612,0.017161,0.000355,-0.002134,-0.002816,...,-0.02153,-0.004341,0.00419,0.044702,0.011483,-0.013307,-0.031249,-0.063539,9109512,696445
3,45260809,0.18693,-0.060678,-0.004299,0.01209,0.001639,-0.007242,0.005464,-0.005203,0.001449,...,0.019923,0.030012,0.018544,-0.058307,-0.000886,-0.040026,-0.012019,0.010476,45260809,695196
4,6565890,0.067058,0.092235,-0.103929,0.040591,-0.002851,0.008336,-0.013674,-5.2e-05,0.021902,...,-0.003518,-0.013995,-0.002671,0.020586,0.001678,-0.004499,0.003813,-0.016608,6565890,18726608


### Refine: Create a data dictionary with training and testing sets

In [10]:
# Target: the category id of each joined row.
y = joined_df['category_id']

In [11]:
# Features: drop the label and both page-id columns so that only the vector
# component columns remain.
X = joined_df.drop(['page_id', 'category_id', 'index'], axis=1)

In [12]:
# A fixed seed keeps the train/test partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Bundle the full data and both partitions under one lookup for the model cells.
data_dict = dict(
    X=X,
    y=y,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

### Model: Fit and score multi-class classification models

In [13]:
# Baseline model: Gaussian naive Bayes fitted on the training split stored in
# data_dict.
gnb = GaussianNB()
gnb.fit(data_dict['X_train'], data_dict['y_train'])

GaussianNB(priors=None)

In [14]:
# Accuracy of the naive Bayes baseline on the training split.
train_score = gnb.score(data_dict['X_train'], data_dict['y_train'])
# Echo the value so the baseline result is visible, as it is for the other models.
train_score

In [15]:
# Accuracy of the naive Bayes baseline on the held-out test split.
test_score = gnb.score(data_dict['X_test'], data_dict['y_test'])
# Echo the value so the baseline result is visible, as it is for the other models.
test_score

In [16]:
# Fit the scaler on the training split only, then apply that same transform to
# the test split so no test-set statistics influence the scaling.
# Note: X_train / X_test are rebound to the scaled arrays here, while data_dict
# still holds the unscaled frames that the naive Bayes model was fitted on.
stand_scale = StandardScaler()
X_train = stand_scale.fit_transform(X_train)
X_test = stand_scale.transform(X_test)

In [17]:
# Decision tree on the current X_train / X_test; seeded like the train/test
# split so the reported scores can be reproduced on a re-run.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(dec_tree.score(X_train, y_train))
print(dec_tree.score(X_test, y_test))

0.961989938513
0.77554438861


In [18]:
# Random forest on the current X_train / X_test; seeded like the train/test
# split so the reported scores (and the model pickled later) are reproducible.
rand_for = RandomForestClassifier(random_state=42)
rand_for.fit(X_train, y_train)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(rand_for.score(X_train, y_train))
print(rand_for.score(X_test, y_test))

0.961430967021
0.797319932998


In [19]:
# k-nearest-neighbours classifier with default settings on the current
# X_train / X_test.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))

0.832308552264
0.695142378559


### Present: Pickle the trained model for later use

In [20]:
# scikit-learn no longer ships its vendored joblib (sklearn.externals.joblib);
# prefer the standalone package and fall back only for old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [21]:
# Persist the fitted random forest to RFC.pkl (a path relative to the current
# working directory).
joblib.dump(rand_for, "RFC.pkl")

# Read the file straight back to check that the model round-trips.
# joblib files are pickle-based: only load files from a trusted source.
from_pkl_cls = joblib.load("RFC.pkl")

In [22]:
# Sanity check: the reloaded object should be the same estimator class that was dumped.
type(from_pkl_cls)

sklearn.ensemble.forest.RandomForestClassifier

In [25]:
# Sanity check: scoring the reloaded model on the training split should give
# the same value that rand_for printed for that split.
from_pkl_cls.score(X_train, y_train)

0.96143096702068198