In [1]:
import pandas as pd
import spacy
import glob
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.externals import joblib
import sys

In [5]:
df = pd.read_csv('./cookie_tagged.csv', sep = ';')
df.head(0)

Unnamed: 0,label,text,pos_text,pos_,tag_text,tag_,dep_text,dep_,pos_text_complete,pos_complete,new_text,text_for_POS


In [6]:
X = df['pos_text_complete'] 
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
pipeline = Pipeline(steps=[('vect', CountVectorizer()), 
                           ('cls', SVC()), 
                    ])

parameters = {          
              'cls__C':  (0.001, 0.01, 1, 10), 
              'cls__kernel':  ['linear', 'poly', 'rbf'],
              'cls__degree' : (2,3,4),
              'cls__gamma' : [0.0001,0.001,0.01,0.1,1] 
              }

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           scoring='roc_auc',
                           verbose=3,
                           n_jobs = -1,
                           cv=10                        
                          )
                    

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:  3.0min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'cls__C': (0.001, 0.01, 1, 10), 'cls__kernel': ['linear', 'poly', 'rbf'], 'cls__degree': (2, 3, 4), 'cls__gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [8]:
grid_search.best_params_

{'cls__C': 0.01,
 'cls__degree': 2,
 'cls__gamma': 0.0001,
 'cls__kernel': 'linear'}

In [9]:
best_svm = grid_search.best_estimator_

In [10]:
# Form a prediction set
predictions = best_svm.predict(X_test)

# Report the confusion matrix
confusion_matrix_model =(metrics.confusion_matrix(y_test,predictions)) 
target = ['Control', 'Dementia']

confusion_matrix_df = pd.DataFrame(confusion_matrix_model, index=target, columns=target)

confusion_matrix_df.columns.name = "Predicted"
confusion_matrix_df.index.name = "Real"

confusion_matrix_df

Predicted,Control,Dementia
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,35,6
Dementia,5,65


In [11]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86        41
           1       0.92      0.93      0.92        70

   micro avg       0.90      0.90      0.90       111
   macro avg       0.90      0.89      0.89       111
weighted avg       0.90      0.90      0.90       111



In [12]:
joblib.dump(best_svm, 'best_svm.sav' )

['best_svm.sav']

## Use plain text for App model


In [13]:
X = df['text_for_POS'] 
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
pipeline = Pipeline(steps=[('vect', CountVectorizer()), 
                           ('cls', SVC())])
                    
 

parameters = {
              'cls__C': [0.01],              
              'cls__kernel': ['linear'],          
              'cls__probability' : [True],
              'cls__class_weight' : ['balanced']
             }



grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=parameters,
                           scoring='roc_auc',
                           verbose=3,
                           n_jobs = -1,
                           cv=10                       
                          )
                         


grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.7s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.8s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'cls__C': [0.01], 'cls__kernel': ['linear'], 'cls__probability': [True], 'cls__class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [18]:
best_svm_app = grid_search.best_estimator_

In [19]:
# Form a prediction set
predictions = best_svm_app.predict(X_test)

# Report the confusion matrix
confusion_matrix_model =(metrics.confusion_matrix(y_test,predictions)) 
target = ['Control', 'Dementia']

confusion_matrix_df = pd.DataFrame(confusion_matrix_model, index=target, columns=target)

confusion_matrix_df.columns.name = "Predicted"
confusion_matrix_df.index.name = "Real"

confusion_matrix_df

Predicted,Control,Dementia
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,34,7
Dementia,5,65
