### For requirements, please refer to the imports

In [73]:
import pandas as pd
import sklearn
from sklearn import mixture
from sklearn import preprocessing
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm
import time
import random
import pickle
import csv
%matplotlib inline
#here we define our "constants"
#input folders: training features, training labels, prediction templates and test features
train_data = r"/home/yuri/datacombats/data/train"
labels_data = r"/home/yuri/datacombats/data/train/labels"
predictions_data = r"/home/yuri/datacombats/data/test/prediction"
test_data = r"/home/yuri/datacombats/data/test"

#where the final prediction files will be stored. IMPORTANT!!! change the name when changing the model
#my convention is "prediction_{month}_{day}_{count of samples on which GMMs are trained}_{strategy for sklearn imputer}"
predictions_save_folder = r"/home/yuri/datacombats/data/test/prediction_10_30_all_median"
#where the trained models (GMMs and imputers) will be pickled. IMPORTANT!!! change the name when changing the model
models_save_folder = r"/home/yuri/datacombats/data/test/10_30_all_median_models"
#file the raw prediction results are pickled to. IMPORTANT!!! change the name when changing the model
preds_save_path = r"/home/yuri/datacombats/data/test/10_30_all_median_preds"

#create the output folders; exist_ok makes re-running the cell safe and
#makedirs also creates missing parent folders, which os.mkdir would fail on
os.makedirs(predictions_save_folder, exist_ok=True)
os.makedirs(models_save_folder, exist_ok=True)

In [74]:
def replace(s):
    """Mark rows that carry no data as missing.

    The first column of ``s`` is left untouched; the remaining columns are
    treated as feature values.  Every row whose feature values are ALL exactly
    zero gets those values overwritten with NaN.

    The previous implementation tested ``sum(row) == 0``, which also wiped
    rows whose non-zero values merely cancelled out (e.g. 1 and -1).

    Args:
        s: 2-D float array; modified in place.

    Returns:
        The same array object ``s``.
    """
    features = s[:, 1:]  # a view, so writes below land in ``s``
    # NaN compares unequal to 0, so rows already containing NaN are left alone
    empty_rows = np.all(features == 0, axis=1)
    features[empty_rows] = np.nan
    return s

def get_data_from_file(path, delimiter = ','):
    """Load a delimited numeric file, or return None when it does not exist.

    The first line of the file is skipped as a header and the parsed array is
    passed through ``replace`` before being returned.
    """
    if not os.path.exists(path):
        return None

    parsed = genfromtxt(path, delimiter=delimiter, skip_header=True)
    return replace(parsed)

def extend(first, second):
    """Stretch the shorter of two timestamped arrays to the length of the longer.

    Both arguments are sequences of rows whose first element is a timestamp.
    For every row of the longer array one row of the shorter array is picked
    by walking both in order and comparing timestamps, so the result has the
    same number of rows as the longer array.

    Args:
        first: 2-D array, column 0 holds the timestamp.
        second: 2-D array, column 0 holds the timestamp.

    Returns:
        A tuple ``(first, second)`` in the original argument order, where the
        shorter input has been replaced by its extended copy.  Inputs of equal
        length are returned unchanged.

    Raises:
        IndexError: if the shorter array is empty (there is nothing to copy).
    """
    a, b = None, None
    first_longer = False
    #this block makes function symmetrical
    if len(first) > len(second):
        a = first
        b = second
        first_longer = True
    elif len(first) < len(second):
        a = second
        b = first
    else:
        return (first, second)

    b_ext = []
    b_ind = 0
    last_ind = len(b) - 1
    for item in a:
        #if values in arrs are equal, take current value and inc counter to pick next value next time
        if item[0] == b[b_ind][0]:
            b_ext.append(b[b_ind])
            if b_ind != last_ind:
                b_ind += 1
        #if value in long arr is lower than the one we extend, just take the same value as in prev iter
        elif item[0] < b[b_ind][0]:
            #clamp at 0: before any row has been consumed there is no previous
            #row, and a plain b_ind - 1 would wrap around to the LAST row of b
            b_ext.append(b[max(b_ind - 1, 0)])
        #if bigger than update counter first and take new value
        else:
            if b_ind != last_ind:
                b_ind += 1
            b_ext.append(b[b_ind])

    return (a, np.array(b_ext)) if first_longer else (np.array(b_ext), a)

### First, we need to load the data 

In [16]:
from numpy import genfromtxt
#number of feature columns (timestamp column excluded) used for every data source
eyes_len = 6
kinect_len = 27
audio_len = 36
face_len = 100

#(sub-folder, feature count) pairs, listed in the column order of the final dataset
sources = [("eyes", eyes_len), ("kinect", kinect_len), ("audio", audio_len), ("face_nn", face_len)]

#find unique file names across all data sources
files = set()
for folder, _ in sources:
    files.update(os.listdir(os.path.join(train_data, folder)))
files = list(files)

#per-file datasets are collected here and stacked once at the end;
#stacking inside the loop would copy the whole dataset again for every file
per_file = []

for file in tqdm(files):
    #get all data sources available.
    #if for some source the file is missing, the entry is None. If part of a file is missing,
    #those rows are blanked out (refer to get_data_from_file func)
    loaded = {}
    for folder, _ in sources:
        loaded[folder] = get_data_from_file(os.path.join(train_data, folder, file))

    #get labels with timestamps, we will need this to align results
    labels_raw = genfromtxt(os.path.join(labels_data, file), delimiter=',', skip_header=True)[:, 0:7]
    #collapse the six emotion columns into a single class index stored in column 1
    labels_raw[:, 1] = np.dot(labels_raw[:, 1:7], range(0, 6))
    labels = labels_raw[:, 0:2]

    #candidates for the longest array; the order decides which one wins on equal length
    arr = [loaded[folder] for folder in ("eyes", "audio", "face_nn", "kinect") if loaded[folder] is not None]
    arr.append(labels)

    #find the array with the most rows to further use it as the reference for extending the others
    max_len_arr = max(arr, key=len)
    max_len = len(max_len_arr)

    aligned = []
    for folder, width in sources:
        data = loaded[folder]
        #replace data with None if the data source is missing (timestamps come from the reference array)
        if data is None:
            data = np.hstack((max_len_arr[:, 0].reshape(max_len, 1), np.full((max_len, width), None)))
        #extend data so every array has equal row count
        data, _ = extend(data, max_len_arr)
        aligned.append(data)

    #labels get the same treatment, otherwise hstack fails whenever the label
    #file is not the longest one (no-op when the lengths already match)
    labels, _ = extend(labels, max_len_arr)

    #timestamp column is kept only once, from the first source
    whole = np.hstack([aligned[0]] + [data[:, 1:] for data in aligned[1:]] + [labels[:, 1:]])

    #add to whole dataset object
    per_file.append(whole)

train = pd.DataFrame(np.vstack(per_file)) if per_file else pd.DataFrame(None)


  0%|          | 0/306 [00:00<?, ?it/s][A
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 306/306 [04:31<00:00,  1.13it/s]


### Now, we will clean our data to keep only fully filled rows in df_clean

In [17]:
#drop every row that contains at least one missing value
df_clean = train.dropna(how='any', axis=0)
#preview the first rows of the cleaned frame
df_clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,161,162,163,164,165,166,167,168,169,170
437,6.462,1,1,0.660377,0.730769,-0.54717,0.942308,0.31857,0.240584,0.221185,...,-0.00279599,0.0210605,-0.0270551,0.0323795,-0.0292688,-0.0133435,-0.0140566,0.0217844,-0.0223791,4
438,6.442,1,0,0.679245,0.730769,-0.54717,0.942308,0.31857,0.240584,0.221185,...,-0.0115064,0.0305696,-0.0112508,0.0440542,-0.0245367,-0.0116037,-0.0133617,0.0109437,-0.0194671,4
439,6.482,1,0,0.679245,0.75,-0.54717,0.923077,0.240463,0.319368,0.249078,...,-0.0072992,0.0290698,-0.0353791,0.0279187,-0.0274823,-0.0195662,-0.0149998,0.0190567,-0.0165976,4
440,6.462,1,1,0.660377,0.730769,-0.54717,0.942308,0.283405,0.290645,0.226207,...,-0.00279599,0.0210605,-0.0270551,0.0323795,-0.0292688,-0.0133435,-0.0140566,0.0217844,-0.0223791,4
441,6.502,1,1,0.679245,0.75,-0.54717,0.942308,0.283405,0.290645,0.226207,...,-0.0188422,0.0296961,-0.0358609,0.021786,-0.034036,-0.0147312,-0.0142918,0.0169755,-0.0184577,4


### Now, we will train BayesianGaussianMixture models for every class + imputer to fill empty values again for every class

In [82]:
#one mixture model and one imputer per emotion class, indexed by class id
gmms = []
imputers = []
for emotion in range(6):
    print("Started training ", emotion, " emotion")
    started_at = time.time()

    #rows of this class without the timestamp (column 0) and label (column 170)
    class_rows = df_clean.loc[df_clean[170] == emotion]
    features = class_rows.drop([0, 170], axis=1).values
    sample = len(features)#vary this param to learn only on some random part of data
    picked = random.sample(range(0, len(features)), sample)
    features = features[picked]

    model = mixture.BayesianGaussianMixture(random_state=42)
    model.fit(features)
    gmms.append(model)

    #NOTE(review): the save-folder names say "median" but the strategy here is "mean" - confirm which is intended
    filler = preprocessing.Imputer(strategy="mean")
    filler.fit(features)
    imputers.append(filler)

    print("Training took ", time.time() - started_at)

Started training  0  emotion
Training took  1.3394041061401367
Started training  1  emotion
Training took  1.1151819229125977
Started training  2  emotion
Training took  0.6857340335845947
Started training  3  emotion
Training took  2.044170618057251
Started training  4  emotion
Training took  1.728360891342163
Started training  5  emotion
Training took  0.5480673313140869


### Check models on data from df_clean.

In [83]:
from tqdm import tqdm

#draw 10000 random rows per class from the fully filled data
X = []
y_true = []
for emotion in range(6):
    subset = df_clean.loc[df_clean[170] == emotion].drop(0, axis=1)
    chosen = random.sample(range(0, len(subset)), 10000)
    for row in subset.values[chosen]:
        #last column is the label, everything before it is the feature vector
        X.append(row[:-1])
        y_true.append(row[-1])

#predicted class = index of the mixture model giving the highest score
y_pred = []
for item in tqdm(X):
    point = item.reshape(1, -1)
    class_scores = [model.score(point) for model in gmms]
    y_pred.append(max(range(len(class_scores)), key=lambda k: class_scores[k]))

100%|██████████| 6000/6000 [00:10<00:00, 586.90it/s]


In [84]:
#per-class precision, recall, F-score and support of y_pred against y_true
metrics.precision_recall_fscore_support(y_true, y_pred)

(array([ 0.83349562,  0.9437751 ,  0.94779116,  0.91650485,  0.88865979,
         0.9775739 ]),
 array([ 0.856,  0.94 ,  0.944,  0.944,  0.862,  0.959]),
 array([ 0.84459793,  0.94188377,  0.94589178,  0.93004926,  0.8751269 ,
         0.96819788]),
 array([1000, 1000, 1000, 1000, 1000, 1000]))

### Check the models together with the imputers on a part of the whole dataset

In [85]:
#draw 10000 random rows per class from the whole dataset (missing values included)
X = []
y_pred = []
y_true = []
for emotion in range(6):
    subset = train.loc[train[170] == emotion].drop(0, axis=1)
    chosen = random.sample(range(0, len(subset)), 10000)
    for row in subset.values[chosen]:
        #last column is the label, everything before it is the feature vector
        X.append(row[:-1])
        y_true.append(row[-1])

for item in tqdm(X):
    point = item.reshape(1, -1)
    #fill the row with every class imputer in turn
    filled_rows = [im.transform(point) for im in imputers]

    #best (class, score) pair for each filled variant of the row
    best_per_imputer = []
    for filled in filled_rows:
        scored = [(label, model.score(filled.reshape(1, -1))) for label, model in enumerate(gmms)]
        best_per_imputer.append(max(scored, key=lambda pair: pair[1]))

    #overall winner across all imputers; keep only its class
    winner = max(best_per_imputer, key=lambda pair: pair[1])
    y_pred.append(winner[0])

100%|██████████| 60000/60000 [10:23<00:00, 96.29it/s]


In [86]:
#overall accuracy; printed explicitly because a notebook cell only displays
#the value of its last expression, so this result used to be silently dropped
print(metrics.accuracy_score(y_true, y_pred))
#per-class precision, recall, F-score and support of y_pred against y_true
metrics.precision_recall_fscore_support(y_true, y_pred)

(array([ 0.60023481,  0.65299777,  0.79411364,  0.75646552,  0.62920824,
         0.61651479]),
 array([ 0.6135,  0.672 ,  0.5828,  0.8073,  0.6747,  0.663 ]),
 array([ 0.60679492,  0.66236262,  0.67224177,  0.7810565 ,  0.65116055,
         0.63891298]),
 array([10000, 10000, 10000, 10000, 10000, 10000]))

### Dump the model

In [45]:
#pickle the trained mixture models and imputers into the models folder, one file each
for dump_name, dump_obj in (("gmms", gmms), ("imputers", imputers)):
    with open(os.path.join(models_save_folder, dump_name), "wb") as f:
        pickle.dump(dump_obj, f)

### Now, make predictions on the test set. This is done in the same manner as for the training set.
Refactor needed - this code duplicates the training-data loading code

In [61]:
from numpy import genfromtxt
#number of feature columns (timestamp column excluded) used for every data source
eyes_len = 6
kinect_len = 27
audio_len = 36
face_len = 100

#(sub-folder, feature count) pairs, listed in the column order of the final dataset
sources = [("eyes", eyes_len), ("kinect", kinect_len), ("audio", audio_len), ("face_nn", face_len)]

#[file name, list of predicted classes] for every processed file
results = []

#find unique file names across all data sources
files = set()
for folder, _ in sources:
    files.update(os.listdir(os.path.join(test_data, folder)))
files = list(files)

for file in tqdm(files):
    #only files that have a prediction template are processed
    predictions_file = os.path.join(predictions_data, file)
    if not os.path.exists(predictions_file):
        continue

    #a source whose file is missing is None here
    loaded = {}
    for folder, _ in sources:
        loaded[folder] = get_data_from_file(os.path.join(test_data, folder, file))
    predictions = genfromtxt(predictions_file, delimiter=',', skip_header=True)[:, 0:7]

    #candidates for the longest array; the order decides which one wins on equal length
    arr = [loaded[folder] for folder in ("eyes", "audio", "face_nn", "kinect") if loaded[folder] is not None]
    arr.append(predictions)

    max_len_arr = max(arr, key=len)
    max_len = len(max_len_arr)

    aligned = []
    for folder, width in sources:
        data = loaded[folder]
        #a missing source becomes an all-None block with the reference timestamps
        if data is None:
            data = np.hstack((max_len_arr[:, 0].reshape(max_len, 1), np.full((max_len, width), None)))
        #extend data so every array has equal row count
        data, _ = extend(data, max_len_arr)
        aligned.append(data)

    #timestamp column is kept only once, from the first source
    whole = np.hstack([aligned[0]] + [data[:, 1:] for data in aligned[1:]])

    #NOTE(review): pred has one entry per row of the longest array, which can be
    #more rows than the prediction template has - confirm the row mapping when saving
    features = whole[:, 1:]
    if len(features) == 0:
        pred = []
    else:
        #score the whole file at once instead of row by row: one vector of
        #per-row log-likelihoods for every (imputer, gmm) pair, imputers outermost
        scores = []
        for im in imputers:
            filled = im.transform(features)
            for gmm in gmms:
                scores.append(gmm.score_samples(filled))
        #argmax picks the first best (imputer, gmm) pair exactly like the nested
        #max() calls did; the remainder recovers the gmm (= class) index
        pred = (np.argmax(np.array(scores), axis=0) % len(gmms)).tolist()
    results.append([file, pred])


  0%|          | 0/141 [00:00<?, ?it/s][A
Exception in thread Thread-13:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 141/141 [1:38:30<00:00, 41.92s/it]


In [62]:
#pickle the results list ([file, pred] pairs) to preds_save_path
with open(preds_save_path, "wb") as f:
    pickle.dump(results, f)    

### Save results to corresponding files

In [64]:
#write one prediction file per processed input, using the template file as the base
for item in results:
    file = item[0]
    preds = item[1]
    path=os.path.join(predictions_data, file)
    path_save=os.path.join(predictions_save_folder, file)
    #read the template; the with-block closes the handle right away
    with open(path, newline='') as src:
        lines = list(csv.reader(src))

    #first line is the header, every following line gets exactly one emotion set
    for i, line in enumerate(lines[1:]):
        line[1:7] = [0,0,0,0,0,0]
        line[preds[i] + 1] = str(1)
        line[0] = "{0:.2f}".format(float(line[0]))

    #the with-block guarantees the data is flushed to disk before anything
    #reads it back; an unclosed handle can leave the file truncated
    with open(path_save, 'w', newline='') as dst:
        csv.writer(dst).writerows(lines)

### Validate that predictions can pass submission

In [67]:
#sanity checks on the saved prediction files; every problem found is printed
for item in results:
    file = item[0]
    preds = item[1]
    path=os.path.join(predictions_save_folder, file)
    path_truth=os.path.join(predictions_data, file)
    with open(path, newline='') as f:
        lines = list(csv.reader(f))
    with open(path_truth, newline='') as f:
        lines_true = list(csv.reader(f))

    # all rows are predicted and not missed
    if len(lines) != len(lines_true):
        print(1, file, len(lines), len(lines_true))

    # all rows have timestamps + emotions columns
    for row in lines:
        if len(row) != 7:
            print(2, file)

    # rows has exactly 1 emotion set
    for row in lines[1:]:
        if sum([int(flag) for flag in row[1:7]]) != 1:
            print(3, file)

    # timestamps are not messed up
    # (named "timestamps", not "time", so the imported time module is not overwritten)
    timestamps = [row[0] for row in lines[1:]]
    # len - 1 so that the last pair of consecutive timestamps is checked as well
    for i in range(0, len(timestamps) - 1):
        if "{0:.2f}".format(float(timestamps[i+1]) - float(timestamps[i])) != "0.01":
            print(i, file)
            break
            
            
            

1 id4ac8ae09.csv 3065 3280
