In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import linregress
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB

from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, concatenate, Conv2D
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import model_from_json

Using TensorFlow backend.


In [2]:
def load_5models_from_disk():
    """Rebuild the five saved networks from disk and return them compiled.

    For each index ``i`` in 0..4 the architecture is read from
    ``./model<i>.json`` and the weights from ``temp<i>.hdf5``. Each model is
    compiled with the rmsprop optimizer, categorical cross-entropy loss and
    an accuracy metric.

    Returns
    -------
    list
        The five compiled models, ordered by index.
    """
    models = []
    for i in range(5):
        # Context manager closes the handle even if read() raises.
        with open("./model" + str(i) + ".json", 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = model_from_json(loaded_model_json)
        # load weights into new model
        loaded_model.load_weights("temp" + str(i) + ".hdf5")
        print("Loaded model from disk")

        # compile the restored model with the same settings used for training
        loaded_model.compile(optimizer='rmsprop',
                             loss='categorical_crossentropy',
                             metrics=['accuracy'])
        models.append(loaded_model)
    return models

In [3]:
def shrink_array(array,size):
    """Reduce ``array`` along axis 0 to ``size`` entries by windowed averaging.

    The step between window starts is ``len(array) / (size + 1)``; window
    ``k`` covers ``floor(k * step)`` up to ``ceil((k + 2) * step)``, so
    neighbouring windows overlap. Each window is averaged over axis 0.

    Parameters
    ----------
    array : numpy.ndarray
        Input whose first axis is shrunk.
    size : int
        Number of averaged entries to produce.

    Returns
    -------
    numpy.ndarray
        Array with ``size`` entries along the first axis.
    """
    step = float(len(array)) / float(size + 1)
    windows = (
        array[math.floor(k * step):math.ceil((k + 2.0) * step)]
        for k in range(size)
    )
    return np.array([np.mean(window, axis=0) for window in windows])

In [4]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column as index, index parsed as dates).
train_data = pd.read_csv("../../data/database/baseline_train_data.csv", index_col=0, parse_dates=True)
test_data = pd.read_csv("../../data/database/baseline_test_data.csv", index_col=0, parse_dates=True)
train_data.columns

Index(['TIMESTAMP', 'SENSORTYPE', 'VALUES1', 'VALUES2', 'VALUES3', 'VALUES4',
       'VALUES5 ', 'TagName', 'tester_id', 'unix_timestamp'],
      dtype='object')

In [5]:
# Row counts of the training and test frames, one per line.
print(len(train_data), len(test_data), sep="\n")

2992269
758768


In [6]:
# Keep only the rows whose SENSORTYPE equals 1 in both frames.
train_data = train_data.loc[train_data["SENSORTYPE"] == 1]
test_data = test_data.loc[test_data["SENSORTYPE"] == 1]

In [7]:
# Columns pulled out of each (TagName, tester_id) group as model features
# by get_feature_label / get_feature_label_f.
feature_cols = ['VALUES1', 'VALUES2', 'VALUES3']

In [8]:
def get_feature_label(data, N):
    """Build one fixed-length feature block and one label per recording.

    Rows are grouped by ``(TagName, tester_id)``. For every group the
    ``feature_cols`` values are reduced to ``N`` entries with
    ``shrink_array``, and the group's ``TagName`` is used as its label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``TagName``, ``tester_id`` and the feature columns.
    N : int
        Number of entries each group is shrunk to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the stacked feature blocks and the matching labels.
    """
    grouped = data.groupby(['TagName', 'tester_id'])
    group_keys = list(grouped.groups.keys())
    features = [
        shrink_array(grouped.get_group(key)[feature_cols].values, N)
        for key in group_keys
    ]
    # First element of each key is the TagName.
    labels = [key[0] for key in group_keys]
    return np.array(features), np.array(labels)

def get_feature_label_f(data, N):
    """Like ``get_feature_label`` but also returns a per-group ``d_change`` value.

    Rows are grouped by ``(TagName, tester_id)``. Each group's
    ``feature_cols`` values are shrunk to ``N`` entries with
    ``shrink_array`` and labelled with its ``TagName``. The extra value is
    the first ``d_change`` entry of the row(s) in the global ``f_df`` whose
    ``TagName`` and ``tester_id`` match the group.

    NOTE(review): ``f_df`` is a module-level global that is not defined in
    the visible part of this notebook; it must exist before this is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``TagName``, ``tester_id`` and the feature columns.
    N : int
        Number of entries each group is shrunk to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, f)``: feature blocks, labels and ``d_change`` values.
    """
    grouped = data.groupby(['TagName', 'tester_id'])
    features, labels, extras = [], [], []
    for tag, tester in grouped.groups.keys():
        samples = grouped.get_group((tag, tester))[feature_cols].values
        features.append(shrink_array(samples, N))
        labels.append(tag)
        matching = f_df[(f_df.TagName == tag) & (f_df.tester_id == tester)]
        extras.append(matching.d_change.values[0])
    return np.array(features), np.array(labels), np.array(extras)

In [9]:
def SVC_training(X_train, y_train,x_test, y_test, f_test):
    """Fit SVCs over three kernels and three C values; report the best per kernel.

    For each kernel in ``poly``, ``rbf``, ``linear`` an SVC is trained with
    C in 0.5, 1.0, 1.5. The run with the highest test accuracy is kept
    (first one wins ties) and its classification report and confusion
    matrix are printed. When the global ``ensemble_06`` is True, every
    'Tag0'/'Tag6' prediction of that run is replaced by the prediction of
    the global ``rf_clf`` on the matching ``f_test`` entry, and the report
    is printed again.

    Parameters
    ----------
    X_train, x_test : numpy.ndarray
        Feature arrays. If ``X_train`` has more than two dimensions, both
        are flattened to ``(n_samples, -1)``.
    y_train, y_test : array-like
        Class labels.
    f_test : sequence
        Per-test-sample input for ``rf_clf``; only read when
        ``ensemble_06`` is True.

    Returns
    -------
    list of float
        Best pre-ensemble accuracy for each kernel, in kernel order.
    """
    if len(X_train.shape) > 2:
        X_train = X_train.reshape(X_train.shape[0], -1)
        x_test = x_test.reshape(x_test.shape[0], -1)
    overal_max = []
    for kernel in ['poly', 'rbf', 'linear']:
        best_score = 0
        best_c = 0
        # None (not []) so the first run is always recorded, even at 0.0
        # accuracy; otherwise the reports below would receive an empty list.
        best_pred = None
        for c_value in (0.5, 1.0, 1.5):
            clf = SVC(kernel=kernel, C=c_value, degree=3, verbose=False)
            clf.fit(X_train, y_train)
            pred = clf.predict(x_test)
            score = accuracy_score(y_test, pred)
            if best_pred is None or score > best_score:
                best_score = score
                best_c = c_value
                best_pred = pred
        print(kernel)
        print("max score: " + str(best_score) + " C = " + str(best_c))

        print(classification_report(y_test, best_pred))
        print(confusion_matrix(y_test, best_pred))
        if ensemble_06 is True:
            for idx in range(len(best_pred)):
                if (best_pred[idx] == 'Tag0') or (best_pred[idx] == "Tag6"):
                    # predict() expects a 2-D (n_samples, n_features) input.
                    sample = np.asarray(f_test[idx]).reshape(1, -1)
                    best_pred[idx] = rf_clf.predict(sample)[0]
            print("-----ensembled---------")
            print(classification_report(y_test, best_pred))
            print(confusion_matrix(y_test, best_pred))
        overal_max.append(best_score)
    return overal_max
            
        
def RF_training(X_train, y_train,x_test, y_test, f_test, n_estimators=30, random_state=None):
    """Train a random forest and report its performance on the test set.

    Prints the classification report and confusion matrix. When the global
    ``ensemble_06`` is True, every 'Tag0'/'Tag6' prediction is replaced by
    the prediction of the global ``rf_clf`` on the matching ``f_test``
    entry, and the report is printed again.

    Parameters
    ----------
    X_train, x_test : numpy.ndarray
        Feature arrays. If ``X_train`` has more than two dimensions, both
        are flattened to ``(n_samples, -1)``.
    y_train, y_test : array-like
        Class labels.
    f_test : sequence
        Per-test-sample input for ``rf_clf``; only read when
        ``ensemble_06`` is True.
    n_estimators : int, default 30
        Number of trees in the forest.
    random_state : int or None, default None
        Seed forwarded to the classifier; set it for reproducible runs.

    Returns
    -------
    float
        Test accuracy (after the ensemble override when it is enabled).
    """
    if len(X_train.shape) > 2:
        X_train = X_train.reshape(X_train.shape[0], -1)
        x_test = x_test.reshape(x_test.shape[0], -1)

    # Training and evaluation must run for inputs of any rank. They used to
    # be nested under the reshape branch, so 2-D inputs returned None.
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)
    res = clf.predict(x_test)

    print(classification_report(y_test, res))
    print(confusion_matrix(y_test, res))
    if ensemble_06 is True:
        for i in range(len(res)):
            if (res[i] == 'Tag0') or (res[i] == "Tag6"):
                # predict() expects a 2-D (n_samples, n_features) input.
                sample = np.asarray(f_test[i]).reshape(1, -1)
                res[i] = rf_clf.predict(sample)[0]
        print("-----ensembled---------")
        print(classification_report(y_test, res))
        print(confusion_matrix(y_test, res))
    return accuracy_score(y_test, res)

    
def ada_training(X_train, y_train,x_test, y_test, f_test, n_estimators=30, random_state=None):
    """Train an AdaBoost classifier and report its performance on the test set.

    Prints the classification report and confusion matrix. When the global
    ``ensemble_06`` is True, every 'Tag0'/'Tag6' prediction is replaced by
    the prediction of the global ``rf_clf`` on the matching ``f_test``
    entry, and the report is printed again.

    Parameters
    ----------
    X_train, x_test : numpy.ndarray
        Feature arrays. If ``X_train`` has more than two dimensions, both
        are flattened to ``(n_samples, -1)``.
    y_train, y_test : array-like
        Class labels.
    f_test : sequence
        Per-test-sample input for ``rf_clf``; only read when
        ``ensemble_06`` is True.
    n_estimators : int, default 30
        Number of boosting rounds.
    random_state : int or None, default None
        Seed forwarded to the classifier; set it for reproducible runs.

    Returns
    -------
    float
        Test accuracy (after the ensemble override when it is enabled).
    """
    if len(X_train.shape) > 2:
        X_train = X_train.reshape(X_train.shape[0], -1)
        x_test = x_test.reshape(x_test.shape[0], -1)

    # Training and evaluation must run for inputs of any rank. They used to
    # be nested under the reshape branch, so 2-D inputs returned None.
    clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)
    res = clf.predict(x_test)

    print(classification_report(y_test, res))
    print(confusion_matrix(y_test, res))
    if ensemble_06 is True:
        for i in range(len(res)):
            if (res[i] == 'Tag0') or (res[i] == "Tag6"):
                # predict() expects a 2-D (n_samples, n_features) input.
                sample = np.asarray(f_test[i]).reshape(1, -1)
                res[i] = rf_clf.predict(sample)[0]
        print("-----ensembled---------")
        print(classification_report(y_test, res))
        print(confusion_matrix(y_test, res))
    return accuracy_score(y_test, res)
    
def nb_training(X_train, y_train,x_test, y_test, f_test):
    """Train a Gaussian naive Bayes classifier and report test performance.

    Prints the classification report and confusion matrix. When the global
    ``ensemble_06`` is True, every 'Tag0'/'Tag6' prediction is replaced by
    the prediction of the global ``rf_clf`` on the matching ``f_test``
    entry, and the report is printed again.

    Parameters
    ----------
    X_train, x_test : numpy.ndarray
        Feature arrays. If ``X_train`` has more than two dimensions, both
        are flattened to ``(n_samples, -1)``.
    y_train, y_test : array-like
        Class labels.
    f_test : sequence
        Per-test-sample input for ``rf_clf``; only read when
        ``ensemble_06`` is True.

    Returns
    -------
    float
        Test accuracy (after the ensemble override when it is enabled).
    """
    if len(X_train.shape) > 2:
        X_train = X_train.reshape(X_train.shape[0], -1)
        x_test = x_test.reshape(x_test.shape[0], -1)

    # Training and evaluation must run for inputs of any rank. They used to
    # be nested under the reshape branch, so 2-D inputs returned None.
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    res = clf.predict(x_test)

    print(classification_report(y_test, res))
    print(confusion_matrix(y_test, res))
    if ensemble_06 is True:
        for i in range(len(res)):
            if (res[i] == 'Tag0') or (res[i] == "Tag6"):
                # predict() expects a 2-D (n_samples, n_features) input.
                sample = np.asarray(f_test[i]).reshape(1, -1)
                res[i] = rf_clf.predict(sample)[0]
        print("-----ensembled---------")
        print(classification_report(y_test, res))
        print(confusion_matrix(y_test, res))
    return accuracy_score(y_test, res)


def DL_training(X_train, y_train,x_test, y_test, f_test):
    """Train five dense networks and report their pooled test performance.

    Labels 'Tag0'..'Tag9' are mapped to 0..9 and one-hot encoded; inputs
    are flattened to ``(n_samples, -1)``. Five identical fully connected
    networks are trained for 40 epochs each. For every network the
    checkpoint with the best ``val_acc`` is written to ``./temp<i>.hdf5``
    and the architecture to ``./model<i>.json``. The five models are then
    reloaded with ``load_5models_from_disk`` and their test predictions
    are pooled into a single report.

    When the global ``ensemble_06`` is True, every pooled prediction of
    class 0 or 6 is replaced by 0 if the global ``rf_clf`` predicts 'Tag0'
    for the matching ``f_test`` entry, and by 6 otherwise.

    NOTE(review): the test set doubles as ``validation_data``, so the
    checkpoint is selected on the same data the final accuracy is computed
    on; the reported numbers are therefore optimistic.

    Parameters
    ----------
    X_train, x_test : numpy.ndarray
        Feature arrays; everything after the first axis is flattened.
    y_train, y_test : numpy.ndarray
        String labels of the form 'Tag<digit>'.
    f_test : numpy.ndarray
        Per-test-sample input for ``rf_clf``; only read when
        ``ensemble_06`` is True.

    Returns
    -------
    float
        Accuracy over the pooled predictions of the five models.
    """
    # Encode train and test labels together so both halves share one
    # one-hot width. np.concatenate copies, so the callers' arrays are
    # left untouched.
    y = np.concatenate([y_train, y_test])
    for tag_int in range(10):
        y[y == 'Tag' + str(tag_int)] = tag_int
    y_categorical = to_categorical(y)
    # Size the output layer from the encoded labels instead of a literal 10,
    # so it always matches the targets (as CONV1d_training already does).
    n_classes = y_categorical.shape[-1]

    y_train_cate = y_categorical[:len(y_train)]
    y_test_cate = y_categorical[len(y_train):]

    X_train = X_train.reshape(X_train.shape[0], -1)
    x_test = x_test.reshape(x_test.shape[0], -1)

    for i in range(5):
        # This returns a tensor
        inputs = Input(shape=(X_train.shape[1:]))

        # a layer instance is callable on a tensor, and returns a tensor
        layer1 = Dense(64, activation='relu')(inputs)
        layer2 = Dense(128, activation='relu')(layer1)
        layer3 = Dense(64, activation='relu')(layer2)
        layer4 = Dense(32, activation='relu')(layer3)
        predictions = Dense(n_classes, activation='softmax')(layer4)

        mcp = ModelCheckpoint("./temp" + str(i) + ".hdf5", monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer='rmsprop',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        model.fit(X_train, y_train_cate, batch_size=32, epochs=40, verbose=0,
                  validation_data=(x_test, y_test_cate), callbacks=[mcp])  # starts training

        model_json = model.to_json()
        with open("./model" + str(i) + ".json", "w") as json_file:
            json_file.write(model_json)

    models = load_5models_from_disk()
    true_labels = np.argmax(y_test_cate, 1).tolist()
    tests = []
    predicts = []
    f = []
    for model in models:
        res = model.predict(x_test)
        tests += true_labels
        if ensemble_06 is True:
            f += f_test.tolist()
        predicts += np.argmax(res, 1).tolist()

    print(classification_report(tests, predicts))
    print(confusion_matrix(tests, predicts))
    if ensemble_06 is True:
        for i in range(len(predicts)):
            if (predicts[i] == 0) or (predicts[i] == 6):
                # predict() expects a 2-D (n_samples, n_features) input.
                sample = np.asarray(f[i]).reshape(1, -1)
                if rf_clf.predict(sample)[0] == 'Tag0':
                    predicts[i] = 0
                else:
                    predicts[i] = 6
        print("-----ensembled---------")
        print(classification_report(tests, predicts))
        print(confusion_matrix(tests, predicts))
    return accuracy_score(tests, predicts)
    
def CONV1d_training(X_train, y_train,x_test, y_test, f_test):
    """Train five 1-D convolutional networks and report pooled test performance.

    Labels 'Tag0'..'Tag9' are mapped to 0..9 and one-hot encoded. Five
    identical networks (Conv1D, max pooling, two dense layers, softmax) are
    trained for 40 epochs each on the unflattened inputs. For every network
    the checkpoint with the best ``val_acc`` is written to
    ``./temp<i>.hdf5`` and the architecture to ``./model<i>.json``. The
    five models are then reloaded with ``load_5models_from_disk`` and their
    test predictions are pooled into a single report.

    When the global ``ensemble_06`` is True, every pooled prediction of
    class 0 or 6 is replaced by 0 if the global ``rf_clf`` predicts 'Tag0'
    for the matching ``f_test`` entry, and by 6 otherwise.

    NOTE(review): the test set doubles as ``validation_data``, so the
    checkpoint is selected on the same data the final accuracy is computed
    on; the reported numbers are therefore optimistic.

    Parameters
    ----------
    X_train, x_test : numpy.ndarray
        Feature arrays; ``X_train.shape[1:]`` is used as the input shape.
    y_train, y_test : numpy.ndarray
        String labels of the form 'Tag<digit>'.
    f_test : numpy.ndarray
        Per-test-sample input for ``rf_clf``; only read when
        ``ensemble_06`` is True.

    Returns
    -------
    float
        Accuracy over the pooled predictions of the five models.
    """
    # Encode train and test labels together so both halves share one
    # one-hot width. np.concatenate copies, so the callers' arrays are
    # left untouched.
    y = np.concatenate([y_train, y_test])
    for tag_int in range(10):
        y[y == 'Tag' + str(tag_int)] = tag_int
    y_categorical = to_categorical(y)

    y_train_cate = y_categorical[:len(y_train)]
    y_test_cate = y_categorical[len(y_train):]

    for i in range(5):
        input_val1 = Input(shape=X_train.shape[1:])

        con1 = Conv1D(filters=30, kernel_size=5)(input_val1)
        max_pooling_1d_1 = MaxPooling1D(pool_size=2, strides=None, padding='valid')(con1)
        flat_1 = Flatten()(max_pooling_1d_1)
        layer2 = Dense(128, activation='relu')(flat_1)
        layer4 = Dense(32, activation='relu')(layer2)
        predictions = Dense(y_categorical.shape[-1], activation='softmax')(layer4)

        model = Model(inputs=input_val1, outputs=predictions)
        mcp = ModelCheckpoint("./temp" + str(i) + ".hdf5", monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)

        model.compile(optimizer='rmsprop',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        model.fit(X_train, y_train_cate, batch_size=32, epochs=40, verbose=0,
                  validation_data=(x_test, y_test_cate), callbacks=[mcp])  # starts training

        model_json = model.to_json()
        with open("./model" + str(i) + ".json", "w") as json_file:
            json_file.write(model_json)

    models = load_5models_from_disk()
    true_labels = np.argmax(y_test_cate, 1).tolist()
    tests = []
    predicts = []
    f = []

    for model in models:
        res = model.predict(x_test)
        tests += true_labels
        predicts += np.argmax(res, 1).tolist()
        if ensemble_06 is True:
            f += f_test.tolist()

    print(classification_report(tests, predicts))
    print(confusion_matrix(tests, predicts))
    if ensemble_06 is True:
        for i in range(len(predicts)):
            if (predicts[i] == 0) or (predicts[i] == 6):
                # predict() expects a 2-D (n_samples, n_features) input.
                sample = np.asarray(f[i]).reshape(1, -1)
                if rf_clf.predict(sample)[0] == 'Tag0':
                    predicts[i] = 0
                else:
                    predicts[i] = 6
        print("-----ensembled---------")
        print(classification_report(tests, predicts))
        print(confusion_matrix(tests, predicts))
    return accuracy_score(tests, predicts)

In [10]:
# Global switch read by the *_training functions: when True, predictions of
# Tag0 / Tag6 are re-decided by the separate `rf_clf` model using `f_test`.
# NOTE(review): `rf_clf` is not defined in the visible part of this notebook.
ensemble_06 = False

## Train Baseline

In [11]:
# Number of averaged entries each recording is shrunk to.
N=14

X_train, y_train = get_feature_label(train_data, N)
x_test, y_test = get_feature_label(test_data, N)
# No auxiliary features are supplied for the baseline run.
f_test = []

# Run each baseline in turn: header line, returned score(s), separator.
for header, train_fn in (
    ("RF:", RF_training),
    ("SVC:", SVC_training),
    ("DL:", DL_training),
    ("CONV:", CONV1d_training),
):
    print(header)
    print(train_fn(X_train, y_train, x_test, y_test, f_test))
    print("="*30)

RF:
             precision    recall  f1-score   support

       Tag0       0.47      0.73      0.57        22
       Tag1       0.86      0.55      0.67        22
       Tag2       0.57      0.73      0.64        22
       Tag3       0.74      0.64      0.68        22
       Tag4       1.00      0.77      0.87        22
       Tag5       0.64      0.73      0.68        22
       Tag6       0.62      0.45      0.53        22
       Tag7       0.47      0.68      0.56        22
       Tag8       0.71      0.68      0.70        22
       Tag9       0.93      0.59      0.72        22

avg / total       0.70      0.65      0.66       220

[[16  0  2  0  0  1  1  0  2  0]
 [ 0 12  1  1  0  0  1  7  0  0]
 [ 0  0 16  1  0  1  0  3  1  0]
 [ 0  1  4 14  0  1  0  2  0  0]
 [ 4  0  0  0 17  0  0  0  0  1]
 [ 1  0  1  2  0 16  0  1  1  0]
 [ 7  0  0  0  0  3 10  0  2  0]
 [ 0  1  2  0  0  2  2 15  0  0]
 [ 5  0  1  0  0  0  1  0 15  0]
 [ 1  0  1  1  0  1  1  4  0 13]]
0.654545454545
SVC:
poly
m