In [1]:
'''
# overview:
7 classes of tea
8 features from 8 smell sensors
100- frames sampling from sensors

total number of samples: 59, with 7 class of 9,9,9,9,9,9,5 samples

'''
import random
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import pandas as pd

# teaType = ['Luan','Huangshan','Taiping','Maoshan','Dinggu','Queshe','Biluochun']


def loadData(tensor=False,onehot=False,normalized=False,separated=False):
    tea_data = []
    tea_label = []   
    for i in range(7):
        fname = str(i+1)+'.csv'
        train = pd.read_csv(open(fname,'r'))
        temp_data = []
        for j in range(9):
            label = i
            tempdata = train.iloc[0:95,9*j:9*j+8].values # remove nan frames and crop data to the same length
            if tempdata.shape[1] != 0:
                # data augmentation
                for k in range(16):
                    data = tempdata[k:k+80]
                    if normalized:
                        scaler = StandardScaler()
                        data = scaler.fit_transform(data)
                    if separated:
                        for x in data:
                            if tensor:
                                tea_data.append(x)
                            else:
                                tea_data.append(x.reshape(1,-1)[0])
                            tea_label.append(label)
                    else:
                        if tensor:
                            tea_data.append(data)
                        else:
                            tea_data.append(data.reshape(1,-1)[0])
                        tea_label.append(label)
    if onehot:
        enc = OneHotEncoder()
        tea_label = enc.fit_transform(np.array(tea_label).reshape(-1,1)).toarray()
    return train_test_split(tea_data,tea_label,test_size=0.2, random_state=0)

In [8]:
# svm

from sklearn.svm import SVC

trainX,testX,trainY,testY = loadData()

svmparams = {'kernel':('linear','rbf','poly'),'C':[0.1,1.0,2.0,5.0,10.0]}
svmclf = SVC()
gsearch1 = GridSearchCV(svmclf,svmparams,scoring='accuracy',cv=3)
gsearch1.fit(trainX,trainY)
print(gsearch1.best_params_, gsearch1.best_score_)

# normalized data lead to worse result

{'kernel': 'poly', 'C': 2.0} 0.8145695364238411


In [9]:
svmclfr = SVC(kernel='poly',C=2.0)
svmclfr.fit(trainX,trainY)
print(metrics.accuracy_score(testY,svmclfr.predict(testX)))

0.7936507936507936


In [11]:
# rf

from sklearn.ensemble import RandomForestClassifier

trainX,testX,trainY,testY = loadData()

forestparams1 = {'n_estimators':range(10,200,10)}
forestclf1 = RandomForestClassifier(random_state=10)
gsearch1 = GridSearchCV(forestclf1,forestparams1,scoring='accuracy',cv=3)
gsearch1.fit(trainX,trainY)
print(gsearch1.best_params_, gsearch1.best_score_)

{'n_estimators': 40} 0.7973509933774835


In [12]:
forestparams2 = {'max_depth':range(2,14,2), 'min_samples_split':range(2,10,1)}
forestclf2 = RandomForestClassifier(n_estimators=40,random_state=10, oob_score=True)
gsearch2 = GridSearchCV(forestclf2,forestparams2,scoring='accuracy',cv=3)
gsearch2.fit(trainX,trainY)
print(gsearch2.best_params_, gsearch2.best_score_)

{'max_depth': 6, 'min_samples_split': 8} 0.8119205298013245


In [13]:
forestclfr = RandomForestClassifier(n_estimators=50,random_state=10, min_samples_split=8, max_depth=6, oob_score=True)
forestclfr.fit(trainX,trainY)
print(metrics.accuracy_score(testY,forestclfr.predict(testX)))

0.783068783068783


In [23]:
# mlp

from sklearn.neural_network import MLPClassifier

trainX,testX,trainY,testY = loadData(normalized=True)

mlpparams1 = {'hidden_layer_sizes':((100,30),(80,80,20),(64,32,8),(50,20),(40,30,20,10)),'activation':('relu','logistic','tanh')}
mlpclf1 = MLPClassifier(solver='lbfgs', random_state=1)
gsearch1 = GridSearchCV(mlpclf1,mlpparams1,scoring='accuracy',cv=3)
gsearch1.fit(trainX,trainY)
print(gsearch1.best_params_, gsearch1.best_score_)
# underfitting

{'activation': 'logistic', 'hidden_layer_sizes': (100, 30)} 0.5655629139072847


In [21]:
mlpparams2 = {'alpha':(0.0001,0.001,0.01,0.1,1,10)}
mlpclf2 = MLPClassifier(hidden_layer_sizes=(100, 30),activation='logistic',solver='lbfgs', random_state=1)
gsearch2 = GridSearchCV(mlpclf2,mlpparams2,scoring='accuracy',cv=3)
gsearch2.fit(trainX,trainY)
print(gsearch2.best_params_, gsearch2.best_score_)

{'alpha': 0.0001} 0.5655629139072847


In [22]:
mlpclfr = MLPClassifier(hidden_layer_sizes=(100,20),solver='lbfgs',activation='logistic', alpha=0.0001,random_state=1)
mlpclfr.fit(trainX,trainY)
print(metrics.accuracy_score(testY,mlpclfr.predict(testX)))

0.5925925925925926


In [2]:
# RNN

import tensorflow as tf
from tensorflow.contrib import  rnn

lr = 0.0001
batch_size = 32
epoch = 300
n_inputs = 8
n_steps = 80
n_hidden_units = 16
n_classes = 7

x = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_classes])

weights = {
    'in': tf.get_variable('in', shape=[n_inputs, n_hidden_units], initializer=tf.contrib.layers.xavier_initializer()),
    'out': tf.get_variable('out', shape=[n_hidden_units, n_classes], initializer=tf.contrib.layers.xavier_initializer()),
}
biases = {
    'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ])),
    'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]))
}

def RNN(X, weights, biases):

    X = tf.reshape(X, [-1, n_inputs])
    X_in = tf.matmul(X, weights['in']) + biases['in']
    X_in = tf.reshape(X_in, [-1, n_steps, n_hidden_units])
    
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(n_hidden_units, forget_bias=1.0, state_is_tuple=True)
    init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)
    
    outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, X_in, initial_state=init_state, time_major=False)
    results = tf.matmul(final_state[1], weights['out']) + biases['out']
    
    return results

pred = RNN(x, weights, biases)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
train_op = tf.train.AdamOptimizer(lr).minimize(cost)
    
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [5]:
with tf.Session() as sess:
    train_x,test_x,train_y,test_y = loadData(onehot=True,tensor=True,normalized=True)
    batch_num = len(train_x)//batch_size
    sess.run(tf.global_variables_initializer())
    for step in range(epoch):
        for i in range(batch_num):
            batch_xs = train_x[i*batch_size:(i+1)*batch_size]
            batch_ys = train_y[i*batch_size:(i+1)*batch_size]
            sess.run([train_op], feed_dict={x: batch_xs,y: batch_ys,})
            if i == batch_num -1:
                print('train %d/%d: '%(step,epoch), sess.run([accuracy,cost], feed_dict={x: batch_xs,y: batch_ys,}))
    acc = 0
    batch_num = len(test_x)//batch_size
    for i in range(batch_num):
        batch_xs = test_x[i*batch_size:(i+1)*batch_size]
        batch_ys = test_y[i*batch_size:(i+1)*batch_size]
        acc = (acc + sess.run(accuracy, feed_dict={x:batch_xs,y:batch_ys}))/2
    print('test: ', acc)

train 0/300:  [0.125, 2.0357413]
train 1/300:  [0.15625, 2.0019906]
train 2/300:  [0.15625, 1.9691298]
train 3/300:  [0.15625, 1.9394873]
train 4/300:  [0.15625, 1.9148144]
train 5/300:  [0.1875, 1.8940129]
train 6/300:  [0.25, 1.8754876]
train 7/300:  [0.25, 1.8583013]
train 8/300:  [0.25, 1.8418925]
train 9/300:  [0.25, 1.8258897]
train 10/300:  [0.25, 1.810071]
train 11/300:  [0.28125, 1.794323]
train 12/300:  [0.3125, 1.778597]
train 13/300:  [0.3125, 1.7628777]
train 14/300:  [0.34375, 1.7471552]
train 15/300:  [0.3125, 1.7314084]
train 16/300:  [0.34375, 1.7156097]
train 17/300:  [0.34375, 1.6997384]
train 18/300:  [0.375, 1.68379]
train 19/300:  [0.375, 1.6677814]
train 20/300:  [0.375, 1.6517513]
train 21/300:  [0.40625, 1.6357563]
train 22/300:  [0.5, 1.6198637]
train 23/300:  [0.46875, 1.6041423]
train 24/300:  [0.53125, 1.5886499]
train 25/300:  [0.53125, 1.5734253]
train 26/300:  [0.59375, 1.5584843]
train 27/300:  [0.59375, 1.5438225]
train 28/300:  [0.59375, 1.5294195]
tr

train 234/300:  [0.75, 0.45843172]
train 235/300:  [0.75, 0.45746976]
train 236/300:  [0.75, 0.4565127]
train 237/300:  [0.75, 0.45556012]
train 238/300:  [0.75, 0.45461145]
train 239/300:  [0.75, 0.45366666]
train 240/300:  [0.75, 0.4527254]
train 241/300:  [0.75, 0.45178747]
train 242/300:  [0.75, 0.45085257]
train 243/300:  [0.75, 0.44992042]
train 244/300:  [0.75, 0.44899058]
train 245/300:  [0.75, 0.448063]
train 246/300:  [0.75, 0.44713748]
train 247/300:  [0.75, 0.44621402]
train 248/300:  [0.75, 0.44529292]
train 249/300:  [0.75, 0.44437438]
train 250/300:  [0.75, 0.44345802]
train 251/300:  [0.75, 0.4425453]
train 252/300:  [0.75, 0.44163412]
train 253/300:  [0.75, 0.44072688]
train 254/300:  [0.75, 0.4398219]
train 255/300:  [0.75, 0.4389173]
train 256/300:  [0.71875, 0.4380188]
train 257/300:  [0.71875, 0.43711418]
train 258/300:  [0.71875, 0.43620765]
train 259/300:  [0.71875, 0.43528464]
train 260/300:  [0.71875, 0.43438828]
train 261/300:  [0.71875, 0.43342698]
train 262/

In [2]:
# CNN

import tensorflow as tf
import tensorflow.contrib.slim as slim

batch_size = 16
epoch = 500

train_x,test_x,train_y,test_y = loadData(onehot=True,tensor=True,normalized=True)
batch_num = len(train_x)//batch_size

def conv1d(x,out_c,name='conv1d'):
    with tf.variable_scope(name):
        w = tf.get_variable('w',[5,x.get_shape()[-1],out_c],
                            initializer=tf.contrib.layers.xavier_initializer(),
                           regularizer = tf.contrib.layers.l2_regularizer(0.1))
        conv = tf.nn.conv1d(x,w,stride=4,padding='SAME')
        b = tf.get_variable('b',[out_c],initializer=tf.constant_initializer(0.0))
        #conv = tf.reshape(tf.nn.bias_add(conv,b),conv.get_shape())
    return tf.add(conv,b)

def pool1d(x):
    return tf.nn.pool(x,window_shape=[4],strides=[3],pooling_type='MAX',padding='SAME')

x = tf.placeholder(tf.float32, [None, 80, 8])
y = tf.placeholder(tf.float32, [None, 7])
keep_prob = tf.placeholder(tf.float32)

def CNN(x):
    # 80 x 8
    net = tf.nn.relu(tf.contrib.layers.batch_norm(conv1d(x,16,name='conv1')))
    # 20 x 16
    net = pool1d(net)
    # 7 x 16
    net = tf.nn.relu(tf.contrib.layers.batch_norm(conv1d(net,32,name='conv2')))
    # 2 x 32
    net = pool1d(net)
    # 1 x 32
    net = slim.flatten(net)
    net = slim.fully_connected(net,16)
    net = slim.dropout(net,keep_prob)
    net = slim.fully_connected(net,7)
    
    return net

pred = CNN(x)

#global_step = tf.Variable(0)
#lr = tf.train.exponential_decay(1e-4, global_step, batch_num, 0.96, staircase=True)

loss = slim.losses.softmax_cross_entropy(pred, y)
op = tf.train.AdamOptimizer(0.001).minimize(loss)

correct_prediction = tf.equal(tf.argmax(pred,1),tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

Instructions for updating:
Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

Instructions for updating:
Use tf.losses.compute_weighted_loss instead.
Instructions for updating:
Use tf.losses.add_loss instead.


In [3]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(epoch):
        for i in range(batch_num):
            batch_xs = train_x[i*batch_size:(i+1)*batch_size]
            batch_ys = train_y[i*batch_size:(i+1)*batch_size]
            sess.run(op, feed_dict={x: batch_xs,y: batch_ys,keep_prob:0.3})
            if i == batch_num -1:
                print('train %d/%d: '%(step,epoch), sess.run([accuracy,loss], feed_dict={x: batch_xs,y: batch_ys,keep_prob:1.0}))
    
    acc = 0
    batch_num = len(test_x)//batch_size
    for i in range(batch_num):
            batch_xs = test_x[i*batch_size:(i+1)*batch_size]
            batch_ys = test_y[i*batch_size:(i+1)*batch_size]
            acc = (acc + sess.run(accuracy, feed_dict={x:batch_xs,y:batch_ys,keep_prob:1.0}))/2
    print('test: ',acc)

train 0/500:  [0.125, 1.939223]
train 1/500:  [0.1875, 1.8997973]
train 2/500:  [0.1875, 1.8687615]
train 3/500:  [0.375, 1.8038554]
train 4/500:  [0.4375, 1.7858467]
train 5/500:  [0.5625, 1.7152777]
train 6/500:  [0.5625, 1.6712673]
train 7/500:  [0.5625, 1.637326]
train 8/500:  [0.5625, 1.5189373]
train 9/500:  [0.5625, 1.4436007]
train 10/500:  [0.5625, 1.4394403]
train 11/500:  [0.625, 1.3627715]
train 12/500:  [0.625, 1.2818942]
train 13/500:  [0.625, 1.2383484]
train 14/500:  [0.6875, 1.180142]
train 15/500:  [0.8125, 1.1107907]
train 16/500:  [0.8125, 1.0571201]
train 17/500:  [0.75, 1.0051222]
train 18/500:  [0.8125, 0.9657849]
train 19/500:  [0.75, 0.9526783]
train 20/500:  [0.6875, 0.92737615]
train 21/500:  [0.75, 0.896422]
train 22/500:  [0.75, 0.8548657]
train 23/500:  [0.75, 0.8114078]
train 24/500:  [0.75, 0.80449986]
train 25/500:  [0.8125, 0.76068234]
train 26/500:  [0.8125, 0.71415603]
train 27/500:  [0.8125, 0.6904806]
train 28/500:  [0.75, 0.7035519]
train 29/500: 

train 234/500:  [0.8125, 0.261355]
train 235/500:  [0.8125, 0.2603457]
train 236/500:  [0.8125, 0.26147807]
train 237/500:  [0.6875, 0.2730682]
train 238/500:  [0.6875, 0.2724404]
train 239/500:  [0.875, 0.26033753]
train 240/500:  [0.6875, 0.26709843]
train 241/500:  [0.6875, 0.2672644]
train 242/500:  [0.6875, 0.2718887]
train 243/500:  [0.6875, 0.2784115]
train 244/500:  [0.6875, 0.26668894]
train 245/500:  [0.6875, 0.2759148]
train 246/500:  [0.6875, 0.26443458]
train 247/500:  [0.6875, 0.274247]
train 248/500:  [0.6875, 0.27983356]
train 249/500:  [0.6875, 0.28095835]
train 250/500:  [0.6875, 0.27736297]
train 251/500:  [0.6875, 0.28025395]
train 252/500:  [0.6875, 0.27016342]
train 253/500:  [0.875, 0.26248163]
train 254/500:  [0.6875, 0.2678308]
train 255/500:  [0.6875, 0.27440205]
train 256/500:  [0.6875, 0.27473158]
train 257/500:  [0.75, 0.26809323]
train 258/500:  [0.6875, 0.27841902]
train 259/500:  [0.75, 0.2716583]
train 260/500:  [0.75, 0.2626897]
train 261/500:  [0.9375

train 459/500:  [0.6875, 0.2753654]
train 460/500:  [0.6875, 0.26638442]
train 461/500:  [0.6875, 0.27224737]
train 462/500:  [0.6875, 0.27342492]
train 463/500:  [0.6875, 0.26373065]
train 464/500:  [0.9375, 0.25627273]
train 465/500:  [0.75, 0.26092836]
train 466/500:  [0.6875, 0.2659598]
train 467/500:  [0.6875, 0.26901832]
train 468/500:  [0.6875, 0.2699873]
train 469/500:  [0.6875, 0.26697087]
train 470/500:  [0.6875, 0.27327275]
train 471/500:  [0.6875, 0.26382732]
train 472/500:  [0.6875, 0.2757343]
train 473/500:  [0.6875, 0.27055293]
train 474/500:  [0.6875, 0.2702996]
train 475/500:  [0.9375, 0.25451174]
train 476/500:  [0.6875, 0.26605788]
train 477/500:  [0.6875, 0.27343285]
train 478/500:  [0.6875, 0.26310217]
train 479/500:  [0.6875, 0.26542437]
train 480/500:  [0.6875, 0.2615328]
train 481/500:  [0.6875, 0.26810747]
train 482/500:  [0.6875, 0.27055216]
train 483/500:  [0.6875, 0.27959013]
train 484/500:  [0.6875, 0.27411544]
train 485/500:  [0.6875, 0.26483142]
train 486