In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics

### Read data - Window

In [2]:
df1 = pd.read_csv('p7_master_window.csv')
df2 = pd.read_csv('p9_master_window.csv')
df3 = df1.append(df2,sort=True,ignore_index=True)
df4 = pd.read_csv('p5_master_window.csv')
#df3.head()

##### P7,9- Train
##### P5 - Test

In [3]:
X_train = df3.drop(['engagement'],axis = 1)
y_train = df3['engagement']
X_test = df4.drop(['engagement'],axis = 1)
y_test = df4['engagement']

In [4]:
print y_train.value_counts()
print y_test.value_counts()

0.0    24341
1.0    22912
Name: engagement, dtype: int64
1.0    11339
0.0     6400
Name: engagement, dtype: int64


#### Transforming the data to CRFsuite format

In [5]:
trainx = []
train_temp = []
train_sess_index = []
train_index_curr = X_train['session_num'][0]
for item in X_train.iterrows():
    if(item[1]['session_num'] != train_index_curr):
        trainx.append(train_temp)
        train_temp = []
        train_sess_index.append(item[0]) 
        train_index_curr = item[1]['session_num']
    temp = []
    for val in item[1].items():
        s = val[0] + '=' + str(val[1])
        temp.append(s)
    train_temp.append(temp)
trainx.append(train_temp)

testx = []
test_temp = []
test_sess_index = []
test_index_curr = X_test['session_num'][0]
for item in X_test.iterrows():
    if(item[1]['session_num'] != test_index_curr):
        testx.append(test_temp)
        test_temp = []
        test_sess_index.append(item[0]) 
        test_index_curr = item[1]['session_num']
    temp = []
    for val in item[1].items():
        s = val[0] + '=' + str(val[1])
        temp.append(s)
    test_temp.append(temp)
testx.append(test_temp)
    
trainy = []
trctr = 0
trtemp = []
for item in y_train:
    if(trctr in train_sess_index):
        trainy.append(trtemp)
        trtemp = []
    s = str(int(item))
    trtemp.append(s)
    trctr+=1
trainy.append(trtemp)

testy = []
tectr = 0
tetemp = []
for item in y_test:
    if(tectr in test_sess_index):
        testy.append(tetemp)
        tetemp = []
    s = str(int(item))
    tetemp.append(s)
    tectr+=1
testy.append(tetemp)    

### CRF

In [6]:
#from __future__ import print_function
from itertools import chain
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelBinarizer
import pycrfsuite

In [7]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(trainx, trainy):
    trainer.append(xseq, yseq)

CPU times: user 5.63 s, sys: 82.5 ms, total: 5.71 s
Wall time: 5.71 s


In [8]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 300,  # stop earlier
    #'feature.possible_transitions': True
})

In [9]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [10]:
trainer.train('p579_s.crfsuite')

In [11]:
#!ls -lh ./p5_test_s1.crfsuite

In [12]:
trainer.logparser.last_iteration

{'active_features': 3896,
 'error_norm': 366.733044,
 'feature_norm': 24.327539,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 2171.268692,
 'num': 300,
 'scores': {},
 'time': 0.272}

In [13]:
tagger = pycrfsuite.Tagger()
tagger.open('p579_s.crfsuite')

<contextlib.closing at 0x7f1c64f61210>

### Classification Accuracy

In [14]:
pred = []
for item in testx:
        pred.append((tagger.tag(item)))

In [15]:
pred_ravel = []
for i in pred:
    for j in i:
        pred_ravel.append(int(j))

y_test_true = []
for item in testy:
    for j in item:
        y_test_true.append(int(j))
#set(y_test_true)

In [16]:
print(metrics.classification_report(y_test_true,pred_ravel))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      6400
           1       0.87      0.89      0.88     11339

   micro avg       0.84      0.84      0.84     17739
   macro avg       0.83      0.83      0.83     17739
weighted avg       0.84      0.84      0.84     17739



In [17]:
print(metrics.accuracy_score(y_test_true,pred_ravel))

0.844128755848695


In [18]:
print(metrics.confusion_matrix(y_test_true,pred_ravel))

[[ 4924  1476]
 [ 1289 10050]]


In [19]:
print(metrics.roc_auc_score(y_test_true,pred_ravel))

0.827848272554899
