<div style="text-align: right"><strong>Capstone #3:</strong> <span style="color:darkred">Supervised Learning</span> </div>

<a id="top"></a>

#### <span style="color:darkred">__Part 1: Data Exploration__ https://github.com/kimrharper/thinkful/blob/master/unit3/unit3-capstone-exploration.ipynb </span><br><br><span style="color:darkred">__Part 2: Models__ https://github.com/kimrharper/thinkful/blob/master/unit3/unit3-capstone-models.ipynb </span>

----

# <span style="color:darkred">Part 2: </span><span style="color:darkblue">L1 Prediction from ELL Writing Samples</span>

__Author:__ Ryan Harper 

----

<a href='#ov'>Overview</a><br>
<a href='#exp'>Experiment</a><br>
<a href='#sec1'>1. Models:</a><br>
><a href='#seca'>A. LR - Ordinary Least Squares</a><br>
<a href='#secb'>B. LR - Logistic Regression</a> <a href='#secb1'> (Lasso)</a> <a href='#secb2'> (Ridge)</a><br>
<a href='#secc'>C. NN - K Nearest Neighbors</a><br>
<a href='#secd'>D. NN - Naive Bayes</a><br>
<a href='#sece'>E. NN - Decision Tree</a><br>
<a href='#secf'>F. Ensemble - Random Forest</a><br>

<a href='#sec2'>2. Model Comparison</a><br>

<a id="ov"></a>

<a id="sec1"></a>

# <span style="color:darkblue">1. Models:</span>  <a href='#top'>(top)</a>

In [62]:
# iPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image

import time

# Data processing
import pandas as pd
import plotly as plo
import seaborn as sns
from scipy import stats
from collections import Counter
import numpy as np
import itertools

# NLP
from nltk.corpus import stopwords as sw
from nltk.util import ngrams
from nltk.corpus import brown
import nltk
import re
from nltk.tokenize import RegexpTokenizer
import difflib

# Stats
from sklearn.metrics import classification_report, roc_curve,roc_auc_score,accuracy_score
from sklearn import metrics

# Preparing Models
from sklearn.model_selection import train_test_split

# Decomposition
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# Models
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB

# Ensemble
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

#Visualization
from IPython.display import Image
import pydotplus
import graphviz

# import altair as alt

__Import Features + Target__

In [183]:
features = pd.read_csv('blogfeatures.csv').sample(frac=1.0)

In [184]:
del features['Unnamed: 0']
del features['id']
del features['content']
del features['pos']
del features['pos2']
del features['pos3']
del features['tokens']

In [185]:
features.language.value_counts()

Japanese               9536
Traditional Chinese    3407
Name: language, dtype: int64

__Declare X,y Variables__

###  <span style="color:darkblue">C. Statistical Significance <a href='#top'>(top)</a>

In [186]:
from scipy.stats import median_test,mannwhitneyu,f_oneway
def mw_test(a,b):
    stat,p = mannwhitneyu(a,b, use_continuity=True, alternative=None)
    return stat,p

def moods_median_test(vals):
    stat, p, med, tbl = median_test(*vals)
    return stat,p

def f1way_test(a,b,c,d):
    f,b = f_oneway(a,b,c,d)
    return f,b

import warnings
warnings.filterwarnings('ignore')

__B. Mood’s Median test (2+ Non-Normally Distributed Independent Samples)__

In [187]:
lang = list(features.language.unique())
print(lang)

['Traditional Chinese', 'Japanese']


In [188]:
moodslist = {}

for c in features.columns: 
    if c not in exclude: 
        g = [(features[c][features.language == l]) for l in lang]
        stat,p = moods_median_test(g)
        vals = 'stat={}, p={}'.format(stat,p)
        if p < .05:
            moodslist[c] = p
        else:
            pass

__SVD Truncate: To find most important features__

In [189]:
svd = TruncatedSVD(n_components=20, n_iter=7, random_state=42)
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=7,
       random_state=42, tol=0.0)

In [123]:
best_features = list(moodslist.keys())

In [122]:
best_features = ['pos3_JJ-FW-FW', 'sc', 'pos3_DT-JJS-VBZ', 'sent_subj', 'punc_count', 'pos3_NN-POS-WRB', 'wc', 'pos3_JJR-JJ-VBZ', 'prep_like', 'pos3_PRP$-VBG-CC', 'pos3_JJ-PRP-DT', 'pos3_NNS-WRB-JJ', 'pos3_PDT-DT-NN', 'pos3_VBD-CD-WRB', 'pos3_PRP-FW-PRP', 'pos3_NNP-VBD-JJ', 'pos3_RB-PRP$-MD', 'pos3_VBZ-JJR-JJ', 'pos3_JJ-TO-PRP', 'letters_per', 'prep_across', 'prp_he', 'pos3_JJ-CD-JJ', 'pos3_VB-VBG-NNS', 'pos3_VBZ-VBG-JJR', 'pos3_VBZ-PRP$-IN', 'sent_pol', 'pos3_RB-VBG-VB', 'pos3_NN-VBD-MD', 'full_freq_score', 'cap_let', 'freq_score', 'pos3_VB-RB-MD', 'pos3_PRP-VBP-VBD', 'pos3_NNP-NNS-NNP', 'pos3_NN-UH-VB', 'pos3_PRP$-NN-POS', 'pos3_PRP-IN-JJS', 'pos3_PRP$-RP-IN', 'prep_while', 'pos3_TO-CC-WRB', 'pos3_NN-CC-VBD', 'pos3_VBD-CD-JJ', 'pos3_RB-PRP$-CD', 'pos3_VB-PRP-WRB', 'pos3_PRP-RBR-PRP', 'pos3_VB-NNP-MD', 'pos3_VB-JJ-SYM', 'pos3_VBZ-VB-DT', 'pos3_NNP-NN-RP', 'pos3_CD-NN-RB', 'pos3_RP-NN-RB', 'pos3_VBG-NNS-RP', 'pos3_WDT-PRP-VBD', 'pos3_RP-RB-TO', 'pos3_WRB-PRP-NNS', 'pos3_JJS-JJ-VBP', 'pos3_VBZ-JJR-DT', 'pos3_RB-NNS-VBD', 'pos3_CD-JJR-CD', 'pos3_NNPS-VBN-IN', 'pos3_VBZ-CC-VBG', 'pos3_WRB-CC-VB', 'pos3_NNS-RB-RB', 'pos3_JJ-POS-MD', 'pos3_RBR-CC-VB', 'pos3_CC-JJ-VBD', 'pos3_MD-IN-DT', 'prep_between', 'pos3_CC-CC-PRP', 'pos3_VBP-NN-VBN', 'pos3_PRP-CD-JJ', 'pos3_RP-NNS-WP', 'pos3_RB-MD-RBR', 'pos3_JJ-WRB-PRP$', 'pos3_PRP-WP-PRP', 'pos3_JJ-RB-VBZ', 'pos3_NNS-DT-VBP', 'pos3_VB-VB-CC', 'pos3_VBN-VBD-TO', 'pos3_RP-RP-NN', 'pos3_RB-WP-NN', 'pos3_VBN-EX-VBZ', 'pos3_VB-TO-RBR', 'pos3_JJS-DT-VBZ', 'pos3_RB-IN-PRP$', 'pos3_CC-WDT-MD', 'pos3_RB-JJ-PDT', 'pos3_VBN-PRP-RP', 'pos3_VBZ-NNP-DT', 'pos3_CC-VBN-RB', 'pos3_VBP-NNS-NNS', 'pos3_VBN-PRP$-JJS', 'pos3_JJ-CD-CC', 'pos3_CC-VB-JJR', 'pos3_VBP-POS-DT', 'pos3_SYM-CD-NNP', 'cc_et', 'pos3_RP-RB-PRP', 'pos3_JJ-TO-PRP$']

In [190]:
best_features = features.columns[~features.columns.str.contains('language')]

__Reduce Features (Unused): 14,665 to n_components __

__Split Data to Train/Test__

_Even Distribution Sampling_

In [191]:
# evenly_distributed_test = [60 japanese,60 english, 60 chinese, 60 korean]
langlist = []
rs=53
for l in lang:
    print(l)
    langlist.append(features[features['language'] == l].sample(n=300, random_state=rs))

testset = langlist

Traditional Chinese
Japanese


In [204]:
test_data = pd.concat(testset)
y_test = test_data['language'].values.reshape(-1, 1).ravel()
X_test = test_data[best_features]

In [205]:
%%time
train_data = features[~features.isin(test_data)].dropna()
y_train = train_data['language'].values.reshape(-1, 1).ravel()
X_train = train_data[best_features]
# X_train = svd.transform(train_data[train_data.columns[~train_data.columns.str.contains('language')]])

CPU times: user 21.4 s, sys: 10.3 s, total: 31.7 s
Wall time: 32.5 s


<a id="seca"></a>

__Create Function for Comparing Models__

In [206]:
cols = ['name','time','total','precision','recall','f1']

model_set = pd.DataFrame(columns=cols)
models_stored = []
pattern = "%.2f"

In [207]:
def run_model(model,name):
    global model_set
    m = model
    m.fit(X_train, y_train)
    start = time.time()

    total_score = m.score(X_test,y_test)
    pscore = [pattern % i for i in list(metrics.precision_score(y_test, m.predict(X_test),labels=lang,average=None))]
    rscore = [pattern % i for i in list(metrics.recall_score(y_test, m.predict(X_test),labels=lang,average=None))]
    fscore = [pattern % i for i in list(metrics.f1_score(y_test, m.predict(X_test),labels=lang,average=None))]
    end = time.time()
    t= pattern % (end - start)

    r = dict(zip(cols,[name,t,total_score,pscore,rscore,fscore]))
    print('Check for Overfitting: {}\n'.format(m.score(X_train,y_train)))
    print('Test Score is: {}\n'.format(total_score))
    print(classification_report(y_test, m.predict(X_test)))
    
    model_set = model_set.append(r,ignore_index=True)
    return r,m

<a id="seca"></a>

### <span style="color:darkred">A. LR - Logistic Regression</span>  <a href='#top'>(top)</a>

> Target is binary so logistic regression will operate on probabilities

In [208]:
%%time
lreg_data,lreg = run_model(linear_model.LogisticRegression(),'Logistic Regression')

Check for Overfitting: 0.8164951794539415

Test Score is: 0.655

                     precision    recall  f1-score   support

           Japanese       0.59      0.98      0.74       300
Traditional Chinese       0.93      0.33      0.49       300

        avg / total       0.76      0.66      0.62       600

CPU times: user 6.41 s, sys: 1.23 s, total: 7.65 s
Wall time: 5.97 s


<a id="secb1"></a>

<a id="sece"></a>

### <span style="color:darkred">E. K Nearest Neighbors</span>  <a href='#top'>(top)</a>

> Can handle discrete values for target <br>Quantitative values are limited (not continuous) and might be problematic for nearest neighbors

In [209]:
%%time
neighbors_data,neighbors = run_model(KNeighborsClassifier(n_neighbors=2),'K Nearest Neighbor')

Check for Overfitting: 0.7531394312565827

Test Score is: 0.49666666666666665

                     precision    recall  f1-score   support

           Japanese       0.50      0.98      0.66       300
Traditional Chinese       0.42      0.02      0.03       300

        avg / total       0.46      0.50      0.35       600

CPU times: user 1min 7s, sys: 3.75 s, total: 1min 11s
Wall time: 1min 16s


<a id="secf"></a>

### <span style="color:darkred">F. Naive Bayes - Bernoulli</span>  <a href='#top'>(top)</a>

In [213]:
%%time
bnb_data,bnb = run_model(BernoulliNB(),'Naive Bayes - Bernoulli')

Check for Overfitting: 0.7416349347808474

Test Score is: 0.6433333333333333

                     precision    recall  f1-score   support

           Japanese       0.61      0.82      0.70       300
Traditional Chinese       0.72      0.47      0.57       300

        avg / total       0.66      0.64      0.63       600

CPU times: user 13.2 s, sys: 4.61 s, total: 17.8 s
Wall time: 19 s


<a id="secg"></a>

### <span style="color:darkred">G. Decision Tree</span>  <a href='#top'>(top)</a>

In [211]:
%%time
dt_data,dt = run_model(tree.DecisionTreeClassifier(criterion='entropy',max_depth=7),'Decision Tree')

Check for Overfitting: 0.8186826541359475

Test Score is: 0.645

                     precision    recall  f1-score   support

           Japanese       0.59      0.95      0.73       300
Traditional Chinese       0.87      0.34      0.49       300

        avg / total       0.73      0.65      0.61       600

CPU times: user 5.28 s, sys: 774 ms, total: 6.05 s
Wall time: 6.26 s


_Good visualization of important features and presentation of entropy weighting_

<a id="sech"></a>

### <span style="color:darkred">H. Random Forest</span>  <a href='#top'>(top)</a>

> Runs decision tree multiple times for best output <br>Longest processing time

In [212]:
%%time
rf_data,rf = run_model(ensemble.RandomForestClassifier(n_estimators=150,
                                                       criterion='entropy',
#                                                        max_features=len(X_train.columns),
                                                       max_depth=8),'Random Forest')

Check for Overfitting: 0.7609171190148262

Test Score is: 0.5133333333333333

                     precision    recall  f1-score   support

           Japanese       0.51      1.00      0.67       300
Traditional Chinese       1.00      0.03      0.05       300

        avg / total       0.75      0.51      0.36       600

CPU times: user 7.23 s, sys: 724 ms, total: 7.95 s
Wall time: 8.27 s


In [None]:
rf.feature_importances_
importance = dict(list(zip(X.columns,rf.feature_importances_)))
importance_sorted = sorted(importance, key=importance.get, reverse=True)
# for r in importance_sorted:
#     if importance[r] >0:
#         print(r, importance[r])
print(importance_sorted[0:100])

<a id="sec2"></a>

# <span style="color:darkblue">2. Model Comparison</span>  <a href='#top'>(top)</a>

In [203]:
model_set.columns = ['name','time','total','prec: | JA | CH | KO | EN |','rec: | JA | CH | KO | EN |','f1: | JA | CH | KO | EN |']
model_set

Unnamed: 0,name,time,total,prec: | JA | CH | KO | EN |,rec: | JA | CH | KO | EN |,f1: | JA | CH | KO | EN |
0,Logistic Regression,0.02,0.556667,"[0.84, 0.53]","[0.14, 0.97]","[0.24, 0.69]"
1,K Nearest Neighbor,0.04,0.493333,"[0.30, 0.50]","[0.01, 0.98]","[0.02, 0.66]"
2,Naive Bayes - Bernoulli,0.01,0.545,"[0.72, 0.53]","[0.15, 0.94]","[0.24, 0.67]"
3,Decision Tree,0.01,0.533333,"[0.67, 0.52]","[0.13, 0.93]","[0.22, 0.67]"
4,Random Forest,0.07,0.53,"[1.00, 0.52]","[0.06, 1.00]","[0.11, 0.68]"


-----

Regularization