###### <span>Part 1: Exploration <br><a href="https://kimrharper.github.io/port3a.html"> https://kimrharper.github.io/port3a.html</a> </span><br><br><span>Part 2: Analysis <br><a href="https://kimrharper.github.io/port3b.html"> https://kimrharper.github.io/port3b.html</a> </span><br><br><span>Part 3: Models <br><a href="https://kimrharper.github.io/port3c.html"> https://kimrharper.github.io/port3c.html</a> </span>

----

# <span style="color:darkred">Neural Network Assessment of ELL Blog Writing Samples</span>

### <span style="color:darkred">Part 1: </span><span style="color:darkblue">Exploration</span>

__Author:__ Ryan Harper 

----

<a id="top"></a>

<a href='#ov'>Overview</a><br>
<a href='#exp'>Experiment</a><br>
<a href='#sec1'>1. Cleaning the Data</a><br>
<a href='#sec2'>2. Exploring the Data</a><br>

<a id="ov"></a>

<a id="sec1"></a>

__Goals__

# <span style="color:darkblue">1. Cleaning the Data</span>  <a href='#top'>(top)</a>

In [1]:
# NOTE(review): dead cell -- both lines are commented out, so nothing runs here.
# If this is ever re-enabled, add `import nltk` first: the second line calls
# `nltk.download`, but the first line only binds the name `brown`.
# from nltk.corpus import brown
# nltk.download('brown')

In [2]:
import pandas as pd
from __future__ import print_function

# iPython/Jupyter Notebook
import time
from pprint import pprint
import warnings
from IPython.display import Image

# Data processing
import scipy
import pandas as pd
import plotly as plo
import numpy as np
import seaborn as sns
from collections import Counter
from functools import reduce
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split



# Neural Network
import keras
from keras.optimizers import RMSprop
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils


plt.rcParams['figure.figsize'] = (7,7) # Make the figures a bit bigger


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


In [3]:
# Restore `reduced_blog_set` from IPython's %store database. It has to have been
# saved beforehand with `%store reduced_blog_set` (presumably by an earlier
# notebook in this series -- TODO confirm), otherwise later cells have no data.
%store -r reduced_blog_set

In [4]:
# Show a single row to check which columns the restored frame carries.
reduced_blog_set.head(1)

Unnamed: 0,id,content,language,letters_per,doc,len,tokens,lemma,pos,deps,ents,word_vector
16473,4858,"Tonight,I was driving my car with having a che...",Japanese,0.74026,"(Tonight, ,, I, was, driving, my, car, with, h...",83,"[Tonight, ,, I, was, driving, my, car, with, h...","[tonight, ,, -PRON-, be, drive, -PRON-, car, w...","[NOUN, PUNCT, PRON, VERB, VERB, ADJ, NOUN, ADP...","[npadvmod, punct, nsubj, aux, ROOT, poss, dobj...","[TIME, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[8, 10, 15, 9, 9, 14, 8, 13, 9, 17, 8, 13, 8,..."


In [5]:
# Turn the language label into a pandas categorical on the source frame so that
# each distinct language gets a stable integer code.
reduced_blog_set['language'] = pd.Categorical(reduced_blog_set['language'])

# Work on an independent two-column copy (features + label) and replace the
# label with its integer category code.
analysis = reduced_blog_set.loc[:, ['word_vector', 'language']].copy()
analysis['language'] = analysis['language'].cat.codes

__Create Train/Test X and Y__

In [6]:
# Hold out 25% of the rows for testing.
# random_state pins the shuffle so the split -- and every score computed from
# it -- is the same after Restart & Run All; stratify keeps the language
# proportions equal in both partitions.
train, test = train_test_split(
    analysis,
    test_size=0.25,
    random_state=42,
    stratify=analysis.language,
)

In [7]:
def _first_vector_matrix(frame):
    """Stack element 0 of every `word_vector` entry into one float32 array.

    Each `word_vector` cell is indexed with `[0]` and the results are stacked
    row-wise. Extracting element 0 directly avoids first building an array of
    *all* vectors of every row just to discard everything but the first, and
    it does not require the discarded vectors to share a common shape.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must have a `word_vector` column whose cells are indexable sequences.
        Assumes element 0 has the same length in every row -- TODO confirm.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per row of `frame`.
    """
    return np.array([vectors[0] for vectors in frame.word_vector], dtype='float32')


# Features: first vector of each document; labels: integer language codes.
X_train = _first_vector_matrix(train)
y_train = np.array(train.language.tolist())

X_test = _first_vector_matrix(test)
y_test = np.array(test.language.tolist())

In [8]:
# Convert integer class labels to binary class matrices (one-hot encoding).
num_classes = 2

# y_train / y_test are rebound to their encoded form, so a second run of this
# cell would otherwise encode the already one-hot matrices again. Only encode
# while the labels are still a flat vector of class codes.
if y_train.ndim == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes)
if y_test.ndim == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes)

In [9]:
# Fully connected classifier: 150 input features -> 1024 -> 512 -> num_classes,
# with 20% dropout after each hidden layer and a softmax over the classes.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(150,)),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

In [10]:
# Print each layer's output shape and parameter count, plus the totals.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              154624    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1026      
Total params: 680,450
Trainable params: 680,450
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Training hyperparameters; both are forwarded to model.fit as keyword arguments.
batch_size = 4
epochs = 5

In [12]:
# Configure training: categorical cross-entropy on the one-hot labels, RMSprop
# with its default settings, and accuracy reported next to the loss.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split. The held-out split is passed as validation data,
# so the per-epoch validation figures use the same rows that are scored below.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# evaluate() returns the loss followed by the compiled metrics.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Train on 7503 samples, validate on 2502 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.5980270534491748
Test accuracy: 0.7521982414545201


<a id="sec3"></a>

In [13]:
# Grab the trained parameters; get_weights() returns them as a list of NumPy arrays.
check = model.get_weights()

In [14]:
# Display the weight list.
# NOTE(review): this echoes every array in full; a summary such as
# `[w.shape for w in check]` would be far more compact.
check

[array([[-0.05813242, -0.06727889, -0.05586933, ..., -0.00870646,
         -0.04368983,  0.01819464],
        [-0.01378072,  0.00618158, -0.07300171, ..., -0.03437011,
         -0.01928937,  0.03958514],
        [ 0.03555014,  0.00278363, -0.03057074, ..., -0.05624048,
         -0.02501664, -0.05119778],
        ...,
        [ 0.0740506 ,  0.04199951, -0.01400962, ...,  0.0344005 ,
          0.05229729,  0.05675336],
        [ 0.02250485,  0.07092246, -0.0282635 , ...,  0.05806912,
         -0.04204299,  0.03883021],
        [-0.03376758, -0.00431921, -0.05373761, ...,  0.02275777,
          0.03775488,  0.00854272]], dtype=float32),
 array([-0.0131599 ,  0.01078281, -0.00944322, ..., -0.01819945,
        -0.01264776, -0.00976497], dtype=float32),
 array([[-4.0086430e-02,  5.3369977e-02,  7.3805220e-02, ...,
         -3.4821422e-03, -1.6290834e-04, -3.8830652e-03],
        [-3.0114235e-02, -5.1553588e-02, -3.6247768e-02, ...,
         -1.8239731e-02, -7.6458968e-02,  3.9252944e-02],
  