### Utterances por frase sin simbolos


In [3]:
import pandas as pd
from pathlib import Path

In [4]:
pitt_path = Path('../data/Pitt')
model_path = pitt_path.parent/'models'

In [5]:
pitt_df = pd.read_csv(model_path/'pitt-cookie-complete.csv')
pitt_df.head()

Unnamed: 0.1,Unnamed: 0,group,sentences,sentences_clean,text,clean
0,0,control,['the scene is <in the> [/] in the kitchen . '...,"['the scene is in the kitchen .', 'the mother ...",the scene is <in the> [/] in the kitchen . th...,the scene is in the kitchen . the mother is wi...
1,1,control,"['oh I see the sink is running over . ', 'I se...","['oh I see the sink is running over .', 'I see...",oh I see the sink is running over . I see the...,oh I see the sink is running over . I see the ...
2,2,control,['&um a boy and a girl are in the kitchen with...,['a boy and a girl are in the kitchen with the...,&um a boy and a girl are in the kitchen with t...,a boy and a girl are in the kitchen with their...
3,3,control,"['okay . [+ exc] ', 'it was summertime and mot...","['okay .', 'it was summertime and mother and t...",okay . [+ exc] it was summertime and mother a...,okay . it was summertime and mother and the ch...
4,4,control,['&=clears:throat wait (un)til I put my glasse...,"['wait until I put my glasses on .', ""oh ‡ the...",&=clears:throat wait (un)til I put my glasses ...,wait until I put my glasses on . oh ‡ there's ...


In [6]:
len(pitt_df)

552

In [7]:
from sklearn.model_selection import train_test_split

TEXT_COL = 'clean'    # 'text' for annotated
test_split = 0.2

df = pitt_df.copy()

X = df[TEXT_COL]
y = df['group']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split, random_state=42)

## Bag of Words / Linear Classifier

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(441, 1492)

In [15]:
text_clf = Pipeline([('counts', CountVectorizer()),
                     ('classifier', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train) 



Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        str...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [16]:
# Form a prediction set
predictions = text_clf.predict(X_test)
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[46  8]
 [18 39]]


In [17]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

     control       0.72      0.85      0.78        54
    dementia       0.83      0.68      0.75        57

   micro avg       0.77      0.77      0.77       111
   macro avg       0.77      0.77      0.76       111
weighted avg       0.78      0.77      0.76       111



In [18]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.7657657657657657


### Grid Search

In [22]:
pipeline = Pipeline(steps=[('vect', CountVectorizer()),
                           ('cls', LinearSVC()),
                    ])
 
#Aqui definimos el espacio de parámetros a explorar
parameters = {'vect__max_df': [1.9],
              'vect__max_features': [1000],
              'vect__min_df': [50],
              'vect__ngram_range': [(1, 1)],
              'cls__C': (0.001, 0.0015, 0.01, 1, 10, 100),
              'cls__loss': ['squared_hinge'],
              'cls__max_iter': [1000],
            }

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           scoring='roc_auc',
                           verbose=3,
                           n_jobs = -1,
                           cv=20                          
                          )

grid_search.fit(X_train, y_train)

Fitting 20 folds for each of 6 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    1.7s finished


GridSearchCV(cv=20, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': [1.9], 'vect__max_features': [1000], 'vect__min_df': [50], 'vect__ngram_range': [(1, 1)], 'cls__C': (0.001, 0.0015, 0.01, 1, 10, 100), 'cls__loss': ['squared_hinge'], 'cls__max_iter': [1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [23]:
grid_search.best_params_

{'cls__C': 0.0015,
 'cls__loss': 'squared_hinge',
 'cls__max_iter': 1000,
 'vect__max_df': 1.9,
 'vect__max_features': 1000,
 'vect__min_df': 50,
 'vect__ngram_range': (1, 1)}

In [24]:
best_grid_search = grid_search.best_estimator_
best_grid_search

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.9, max_features=1000, min_df=50,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [25]:
# Form a prediction set
predictions = best_grid_search.predict(X_test)
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[42 12]
 [11 46]]


In [26]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

     control       0.79      0.78      0.79        54
    dementia       0.79      0.81      0.80        57

   micro avg       0.79      0.79      0.79       111
   macro avg       0.79      0.79      0.79       111
weighted avg       0.79      0.79      0.79       111



In [27]:
print(metrics.accuracy_score(y_test,predictions))

0.7927927927927928


### Persist the model

In [28]:
X_test[:5]

547    well the little kid's falling off his stool . ...
81     the little girl's pointing to her mouth . she ...
140    climbing . dishwashing . pointing . stealing c...
79     mhm . there's a boy and a girl and the boy is ...
272    okay a child falling off a stool in the attemp...
Name: clean, dtype: object

In [30]:
from sklearn.externals import joblib

In [31]:
# Use protocol version 2 because I need to run coremltools in Python 2.7
joblib.dump(best_grid_search, '../data/models/scikit-svc-bow.pkl', protocol=2)

['../data/models/scikit-svc-bow.pkl']

In [74]:
best_grid_search.predict(X_test[:10])

array(['control', 'control', 'control', 'control', 'dementia', 'control',
       'dementia', 'dementia', 'control', 'dementia'], dtype=object)

In [77]:
X_test.iloc[4]

"okay a child falling off a stool in the attempt to reach the cookie jar which it looks like he's knocked the lid off . and maybe he's gonna drop the cookie . and his girlfriend is standing there beckoning him or reaching for the cookie that he's sposta give her . and the mother is not paying any attention to the kids . she's looking out the window and drying the dishes . at the same time she has let the sink run over and the water is cascading down on the floor and onto her feet . the curtains are waving and the window is open . and the yard looks like it's manicured . and other than the stool and the cabinets and the dishes that's all I can see ."

In [36]:
best_grid_search.predict(["theres a little boys takin"])

array(['dementia'], dtype=object)

In [66]:
y_test[:5]

547    dementia
81      control
140     control
79      control
272    dementia
Name: group, dtype: object

In [38]:
model = best_grid_search

In [48]:
model.steps[0]

('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.9, max_features=1000, min_df=50,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))

In [49]:
_, vectorizer = model.steps[0]

In [52]:
vectorizer.vocabulary_

{'all': 2,
 'the': 95,
 'action': 1,
 'okay': 67,
 'it': 51,
 'boy': 10,
 'and': 3,
 'girl': 34,
 'well': 109,
 'they': 98,
 'falling': 28,
 'down': 24,
 'in': 48,
 'here': 44,
 'this': 100,
 'water': 108,
 'be': 8,
 'going': 35,
 'there': 97,
 'but': 11,
 'on': 68,
 'getting': 33,
 'something': 89,
 'to': 101,
 'is': 50,
 'what': 110,
 'that': 94,
 'looks': 60,
 'some': 88,
 'like': 57,
 'yeah': 113,
 'reaching': 79,
 'cookie': 13,
 'jar': 52,
 'stool': 92,
 'over': 76,
 'her': 43,
 'hand': 39,
 'up': 104,
 'for': 30,
 'cookies': 14,
 'mother': 61,
 'overflowing': 77,
 'from': 31,
 'sink': 85,
 'she': 84,
 'drying': 25,
 'dishes': 20,
 'looking': 59,
 'out': 73,
 'window': 111,
 'floor': 29,
 'curtains': 17,
 'are': 5,
 'you': 114,
 'can': 12,
 'see': 83,
 'outside': 75,
 'dish': 19,
 'about': 0,
 'fall': 27,
 'off': 65,
 'of': 64,
 'washing': 107,
 'spilling': 90,
 'kitchen': 54,
 'little': 58,
 'at': 7,
 'taking': 93,
 'out_of': 74,
 'think': 99,
 'he': 42,
 'standing': 91,
 'handin

### Export the classifier itself and the vocabulary

In [54]:
_, classifier = model.steps[1]
classifier

LinearSVC(C=0.0015, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [56]:
# Use protocol version 2 because I need to run coremltools in Python 2.7
joblib.dump(classifier, '../data/models/scikit-svc-bow-classifier.pkl', protocol=2)

['../data/models/scikit-svc-bow-classifier.pkl']

In [60]:
# Convert vocab frequencies from int64 to number so they can be json-serialized
vocab = vectorizer.vocabulary_
vocab = {k: int(v) for k, v in vocab.items()}

In [61]:
import json
with open('../data/models/scikit-svc-bow-vocab.txt', 'w+') as f:
    f.write(json.dumps(vocab))

In [62]:
len(vocab)

115

The classifier was converted to CoreML externally - conversion doesn't work in Python3.

### Some app results differ

In [82]:
analyzer = vectorizer.build_analyzer()

In [118]:
vector_scikit = vectorizer.transform([X_test.iloc[4]]).todense()
vector_scikit

matrix([[ 0,  0,  1, 11,  0,  1,  0,  1,  0,  0,  0,  0,  1,  3,  0,  0,
          0,  1,  0,  0,  2,  0,  0,  0,  1,  1,  0,  0,  1,  1,  1,  0,
          0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  3,  2,  0,  1,  1,  0,
          1,  0,  4,  2,  1,  0,  0,  0,  1,  2,  0,  1,  2,  1,  0,  1,
          0,  2,  0,  1,  1,  0,  1,  1,  1,  1,  0,  0,  1,  0,  0,  1,
          0,  0,  0,  1,  2,  1,  0,  0,  0,  0,  0,  1,  2,  0,  2, 19,
          0,  1,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  1,  0,  0,  2,
          0,  0,  0]])

In [121]:
vector_from_app = [[0,0,1,11,0,1,0,1,0,0,0,0,1,3,0,0,0,1,0,0,2,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,1,0,2,0,1,1,0,1,0,4,1,1,0,0,0,1,2,0,1,2,1,0,1,0,2,0,1,1,0,1,1,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,0,0,1,2,0,1,19,0,1,0,0,0,2,0,0,0,0,0,0,1,0,0,2,0,0,0]]

From app:
```
[0,0,1,11,0,1,0,1,0,0,0,0,1,3,0,0,0,1,0,0,2,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,1,0,2,0,1,1,0,1,0,4,1,1,0,0,0,1,2,0,1,2,1,0,1,0,2,0,1,1,0,1,1,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,0,0,1,2,0,1,19,0,1,0,0,0,2,0,0,0,0,0,0,1,0,0,2,0,0,0]
```

In [122]:
classifier.predict(vector_from_app)

array(['control'], dtype=object)

In [98]:
model.predict([X_test.iloc[4]])

array(['dementia'], dtype=object)

In [123]:
classifier.predict(vector_scikit)

array(['dementia'], dtype=object)

The vector are slightly different. In-app tokenization is wrong for some reason.