<div style="text-align: right"><strong>Capstone #3:</strong> <span style="color:darkred">Supervised Learning</span> </div>

<a id="top"></a>

#### </span>__Part 1: Data Exploration__ https://github.com/kimrharper/thinkful/blob/master/unit3/unit3-capstone-exploration.ipynb </span><br><br><span>__Part 2: Analysis__ https://github.com/kimrharper/thinkful/blob/master/unit3/unit3-capstone-analysis.ipynb </span><br><br><span>__Part 3: Models__ https://github.com/kimrharper/thinkful/blob/master/unit3/unit3-capstone-models.ipynb </span>

----

# <span style="color:darkred">L1 Prediction from ELL Writing Samples</span>

### <span style="color:darkred">Part 3: </span><span style="color:darkblue">Models</span>

__Author:__ Ryan Harper 

----

<a href='#ov'>Overview</a><br>
<a href='#exp'>Experiment</a><br>
<a href='#sec1'>1. Models:</a><br>
><a href='#seca'>A. LR - Ordinary Least Squares</a><br>
<a href='#secb'>B. LR - Logistic Regression</a> <a href='#secb1'> (Lasso)</a> <a href='#secb2'> (Ridge)</a><br>
<a href='#secc'>C. NN - K Nearest Neighbors</a><br>
<a href='#secd'>D. NN - Naive Bayes</a><br>
<a href='#sece'>E. NN - Decision Tree</a><br>
<a href='#secf'>F. Ensemble - Random Forest</a><br>

<a href='#sec2'>2. Model Comparison</a><br>

<a id="sec1"></a>

# <span style="color:darkblue">1. Models:</span>  <a href='#top'>(top)</a>

In [1]:
# iPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image

import time

# Data processing
import pandas as pd
import plotly as plo
import seaborn as sns
from scipy import stats
from collections import Counter
import numpy as np
import itertools

# NLP
from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib

# Stats
from sklearn.metrics import classification_report, roc_curve,roc_auc_score,accuracy_score
from sklearn import metrics

# Preparing Models
from sklearn.model_selection import train_test_split

# Decomposition
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# Models
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB

# Ensemble
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

#Visualization
from IPython.display import Image
import pydotplus
import graphviz

# import altair as alt

__Import Features + Target__

In [2]:
loadf = False
if loadf:
    features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)
    del features['Unnamed: 0']
else:
    %store -r blog
    %store -r keepfeatures
    features = blog

lang = list(features.language.unique())

In [3]:
del features['id']
del features['content']
del features['pos']
del features['pos2']
del features['pos3']
del features['tokens']

In [4]:
features.language.value_counts()

Japanese               9536
Traditional Chinese    3407
Name: language, dtype: int64

__Set X,Y__ (UNUSED)

__Select Features:__ Features with biggest differences between L1s

In [5]:
selectfeatures = True
if selectfeatures:
    nonbinary = ['letters_per','wc','sc','sent_pol','sent_subj','cap_let','punc_count','freq_score','full_freq_score']
    best_features = keepfeatures + nonbinary
else:
    best_features = features.columns[~features.columns.str.contains('language')]

__Truncated SVD:__ Determine best features

In [6]:
svdbool = False
if svdbool:
    svd = TruncatedSVD(n_components=20, n_iter=7, random_state=42)
    svd.fit(features[best_features])
    # print(svd.explained_variance_ratio_)
    # print(svd.explained_variance_ratio_.sum())  
    # print(svd.singular_values_)
    # svd.get_params
    # best_features = [X.columns[i] for i in svd.components_[0].argsort()[::-1]]

__Split Data to Train/Test__: Even Distribution Sampling

In [7]:
# evenly_distributed_test = [60 japanese,60 english, 60 chinese, 60 korean]
langlist = []
rs=21
for l in lang:
    print(l)
    langlist.append(features[features['language'] == l].sample(n=200, random_state=rs))

testset = langlist

Traditional Chinese
Japanese


In [8]:
test_data = pd.concat(testset)
y_test = test_data['language'].values.reshape(-1, 1).ravel()
X_test = test_data[best_features]

In [9]:
%%time
train_data = features[~features.isin(test_data)].dropna()
y_train = train_data['language'].values.reshape(-1, 1).ravel()
X_train = train_data[best_features]
# X_train = svd.transform(train_data[train_data.columns[~train_data.columns.str.contains('language')]])

CPU times: user 20.9 s, sys: 8.14 s, total: 29 s
Wall time: 28.6 s


<a id="seca"></a>

__Create Function for Comparing Models__

In [10]:
cols = ['name','time','total','precision','recall','f1']

model_set = pd.DataFrame(columns=cols)
models_stored = []
pattern = "%.2f"

In [11]:
def run_model(model,name):
    global model_set
    m = model
    m.fit(X_train, y_train)
    start = time.time()

    total_score = m.score(X_test,y_test)
    pscore = [pattern % i for i in list(metrics.precision_score(y_test, m.predict(X_test),labels=lang,average=None))]
    rscore = [pattern % i for i in list(metrics.recall_score(y_test, m.predict(X_test),labels=lang,average=None))]
    fscore = [pattern % i for i in list(metrics.f1_score(y_test, m.predict(X_test),labels=lang,average=None))]
    end = time.time()
    t= pattern % (end - start)

    r = dict(zip(cols,[name,t,total_score,pscore,rscore,fscore]))
    print('Check for Overfitting: {}\n'.format(m.score(X_train,y_train)))
    print('Test Score is: {}\n'.format(total_score))
    print(classification_report(y_test, m.predict(X_test)))
    
    model_set = model_set.append(r,ignore_index=True)
    return r,m

<a id="seca"></a>

### <span style="color:darkred">A. LR - Logistic Regression</span>  <a href='#top'>(top)</a>

> Target is binary so logistic regression will operate on probabilities

In [12]:
%%time
lreg_data,lreg = run_model(linear_model.LogisticRegression(),'Logistic Regression')

Check for Overfitting: 0.7455951526747987

Test Score is: 0.51

                     precision    recall  f1-score   support

           Japanese       0.51      1.00      0.67       200
Traditional Chinese       1.00      0.02      0.04       200

        avg / total       0.75      0.51      0.36       400

CPU times: user 324 ms, sys: 41.4 ms, total: 365 ms
Wall time: 348 ms


<a id="sece"></a>

### <span style="color:darkred">E. K Nearest Neighbors</span>  <a href='#top'>(top)</a>

> Can handle discrete values for target <br>Quantitative values are limited (not continuous) and might be problematic for nearest neighbors

In [13]:
%%time
neighbors_data,neighbors = run_model(KNeighborsClassifier(n_neighbors=8),'K Nearest Neighbor')

Check for Overfitting: 0.753647452762497

Test Score is: 0.5025

                     precision    recall  f1-score   support

           Japanese       0.50      0.97      0.66       200
Traditional Chinese       0.55      0.03      0.06       200

        avg / total       0.52      0.50      0.36       400

CPU times: user 2.26 s, sys: 51.1 ms, total: 2.32 s
Wall time: 2.22 s


<a id="secf"></a>

### <span style="color:darkred">F. Naive Bayes - Bernoulli</span>  <a href='#top'>(top)</a>

In [14]:
%%time
bnb_data,bnb = run_model(BernoulliNB(),'Naive Bayes - Bernoulli')

Check for Overfitting: 0.7950251136091844

Test Score is: 0.71

                     precision    recall  f1-score   support

           Japanese       0.66      0.87      0.75       200
Traditional Chinese       0.81      0.55      0.65       200

        avg / total       0.73      0.71      0.70       400

CPU times: user 235 ms, sys: 63.5 ms, total: 299 ms
Wall time: 300 ms


In [15]:
importance = dict(list(zip(X_train.columns,bnb.coef_[0])))
bnb_sorted = sorted(importance, key=importance.get, reverse=True)
for r in bnb_sorted[0:5]:
    print(r, importance[r])
print('')
%store bnb_sorted

letters_per -0.00031167212345195594
wc -0.00031167212345195594
sc -0.00031167212345195594
freq_score -0.00031167212345195594
full_freq_score -0.00031167212345195594

Stored 'bnb_sorted' (list)


<a id="secg"></a>

### <span style="color:darkred">G. Decision Tree</span>  <a href='#top'>(top)</a>

In [16]:
%%time
dt_data,dt = run_model(tree.DecisionTreeClassifier(criterion='entropy',max_depth=4),'Decision Tree')

Check for Overfitting: 0.7766084668739536

Test Score is: 0.565

                     precision    recall  f1-score   support

           Japanese       0.54      0.99      0.69       200
Traditional Chinese       0.93      0.14      0.24       200

        avg / total       0.73      0.56      0.47       400

CPU times: user 199 ms, sys: 16.7 ms, total: 216 ms
Wall time: 215 ms


In [17]:
# Render tree.
dot_data = tree.export_graphviz(
    dt, 
    out_file=None,
    feature_names=X_train.columns,
    label= 'root',
    proportion=False,
    rounded=True,
    class_names=lang,
    filled=True
)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

graph.write_png('decision_tree.png')

True

_Good visualization of important features and presentation of entropy weighting_

<a id="sech"></a>

### <span style="color:darkred">H. Random Forest</span>  <a href='#top'>(top)</a>

> Runs decision tree multiple times for best output <br>Longest processing time

In [18]:
%%time
rf_data,rf = run_model(ensemble.RandomForestClassifier(n_estimators=150,
                                                       criterion='entropy',
                                                       max_features=len(X_train.columns),
                                                       max_depth=6),'Random Forest')

Check for Overfitting: 0.815833532647692

Test Score is: 0.645

                     precision    recall  f1-score   support

           Japanese       0.59      0.98      0.74       200
Traditional Chinese       0.95      0.30      0.46       200

        avg / total       0.77      0.65      0.60       400

CPU times: user 24.4 s, sys: 85.8 ms, total: 24.5 s
Wall time: 24.6 s


In [19]:
rf.feature_importances_
importance = dict(list(zip(X_train.columns,rf.feature_importances_)))
rf_sorted = sorted(importance, key=importance.get, reverse=True)
for r in rf_sorted[0:5]:
    if importance[r] >0:
        print(r, importance[r])
print('')
%store rf_sorted

let2_ja 0.25818371468532586
sc 0.1766203065751496
wc 0.11116070145076383
cap_let 0.05450068918071202
adv_just 0.05124158545354391

Stored 'rf_sorted' (list)


<a id="sec2"></a>

# <span style="color:darkblue">2. Model Comparison</span>  <a href='#top'>(top)</a>

In [20]:
model_set.columns = ['name','time','total','prec: | JA | CH |','rec: | JA | CH |','f1: | JA | CH |']
print('Before TruncatedSVD')
model_set

Before TruncatedSVD


Unnamed: 0,name,time,total,prec: | JA | CH |,rec: | JA | CH |,f1: | JA | CH |
0,Logistic Regression,0.01,0.51,"[1.00, 0.51]","[0.02, 1.00]","[0.04, 0.67]"
1,K Nearest Neighbor,0.21,0.5025,"[0.55, 0.50]","[0.03, 0.97]","[0.06, 0.66]"
2,Naive Bayes - Bernoulli,0.01,0.71,"[0.81, 0.66]","[0.55, 0.87]","[0.65, 0.75]"
3,Decision Tree,0.01,0.565,"[0.93, 0.54]","[0.14, 0.99]","[0.24, 0.69]"
4,Random Forest,0.05,0.645,"[0.95, 0.59]","[0.30, 0.98]","[0.46, 0.74]"


In [21]:
model_save = model_set

-----