# Configuration

NOTES: The warnings after the import are referred to the fact that Tensorflow 2.x versions are built to directly look for a GPU in the system. The warning can be forgot if you are not going to use the GPU. 

NOTE: Differently from experiments 1-4, here we have the 'mean' and 'std' of mfccs in **compute_mfccs** function

In [1]:
!source myenv/bin/activate

In [2]:
# samples in 5 seconds of audio, 16 KHz sample rate 
LENGTH_CHOSEN =  80000

In [3]:
import os
import librosa
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
sns.set_style('whitegrid')
import IPython.display as ipd
import librosa.display
import numpy as np
import pickle
import scipy
import ipywidgets
import math
from time import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold


from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Conv2D, AveragePooling1D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import SGD, Adam 
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers

# from livelossplot import PlotLossesKeras
tf.config.list_physical_devices('GPU')

[]

In [4]:
# import SVM
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC 

In [5]:
# set reproducibility 
seed = 7
np.random.seed(seed)

# Utils

In [6]:
def load_files(df):
    X = []
    for i in tqdm(df['path']): 
        X.append(librosa.load(i, res_type='kaiser_fast', sr=16000))
    return X

def extract_samples(X): 
    samples = []
    for ind,i in enumerate(X):
        samples.append(i[0])
    return samples 

def extract_labels(df): 
    labels = df['emotion_label'].copy()
    return labels 

def compute_lengths(samples): 
    lengths = [len(x) for x in samples]
    return lengths 

def check_outliers(lengths):
    # outliers
    lengths = np.array(lengths)
    print((lengths > 300000).sum())
    new_lengths = lengths[lengths < 300000]
    return new_lengths 

def compute_mean_length(lengths): 
    return lengths.mean()

def cut_and_pad(samples, labels, length_chosen = LENGTH_CHOSEN): 
    X_new = []
    y_new = []
    count = 0 
    for ind,i in enumerate(samples):
        if i.shape[0] < 300000:
            if i.shape[0] > length_chosen:
                new = i[:length_chosen]
                X_new.append(new)
            elif i.shape[0] < length_chosen:
                new = np.pad(i,math.ceil((length_chosen-i.shape[0])/2), mode='median')
                X_new.append(new)
            else:
                X_new.append(i)
            y_new.append(labels[count])
        count+=1
    
    return X_new, y_new

def compute_energy(samples): 
    energy_mean = []
    for i in tqdm(samples):
        energy = librosa.feature.rms(i)
        energy = energy.T 
        energy = np.array(energy)
        energy_mean.append(np.mean(energy, axis=0)) 
    return energy_mean

def compute_mfccs(samples, n_mfcc): 
    mfccs = []
    for i in tqdm(samples):
        mfcc = librosa.feature.mfcc(y=i, sr=16000, n_mfcc=n_mfcc)
        mfcc = mfcc.T
        mfcc = np.array(mfcc)
        #mfccs.append(mfcc[:, 1:]) # get rid of the first component 
        mfccs.append(np.mean(mfcc[:, 1:], axis = 0))
    mfccs = np.array(mfccs)
    return mfccs

def feature_extractor(df_train, df_val, df_test, n_mfcc): 
    load_train = load_files(df_train)
    samples_train = extract_samples(load_train)
    labels_train = extract_labels(df_train)
    samples_train, labels_train = cut_and_pad(samples_train, labels_train)
    samples_train = np.array(samples_train)
    labels_train = np.array(labels_train)
    mfccs_train = compute_mfccs(samples_train, n_mfcc = n_mfcc)
    # energy 
    energy_train = compute_energy(samples_train) 
    features_train = []
    for i in range(len(mfccs_train)): 
        if len(mfccs_train) == len(energy_train): 
            conc = np.concatenate((mfccs_train[i], energy_train[i]), axis = None)
            features_train.append(conc)

    
    
  
    load_val = load_files(df_val)
    samples_val = extract_samples(load_val)
    labels_val = extract_labels(df_val)
    samples_val, labels_val = cut_and_pad(samples_val, labels_val)
    samples_val = np.array(samples_val)
    labels_val = np.array(labels_val)
    mfccs_val = compute_mfccs(samples_val, n_mfcc = n_mfcc)
    # energy 
    energy_val = compute_energy(samples_val) 
    features_val = []
    for i in range(len(mfccs_val)): 
        if len(mfccs_val) == len(energy_val): 
            conc = np.concatenate((mfccs_val[i], energy_val[i]), axis = None)
            features_val.append(conc)
    
    
    
    
    load_test = load_files(df_test)
    samples_test = extract_samples(load_test)
    labels_test = extract_labels(df_test)
    samples_test, labels_test = cut_and_pad(samples_test, labels_test)
    samples_test = np.array(samples_test)
    labels_test = np.array(labels_test)
    mfccs_test = compute_mfccs(samples_test, n_mfcc = n_mfcc)
    # energy 
    energy_test = compute_energy(samples_test) 
    features_test=[]
    for i in range(len(mfccs_test)): 
        if len(mfccs_test) == len(energy_test): 
            conc = np.concatenate((mfccs_test[i], energy_test[i]), axis = None)
            features_test.append(conc)
    

    return np.array(features_train), labels_train,  np.array(features_val), labels_val, np.array(features_test), labels_test
    

def feature_extractor_tess(df_train,  df_test, n_mfcc): 
    # we do not have the validation set here 
    load_train = load_files(df_train)
    samples_train = extract_samples(load_train)
    labels_train = extract_labels(df_train)
    samples_train, labels_train = cut_and_pad(samples_train, labels_train)
    samples_train = np.array(samples_train)
    labels_train = np.array(labels_train)
    mfccs_train = compute_mfccs(samples_train, n_mfcc = n_mfcc)
    # energy 
    energy_train = compute_energy(samples_train) 
    features_train = []
    for i in range(len(mfccs_train)): 
        if len(mfccs_train) == len(energy_train): 
            conc = np.concatenate((mfccs_train[i], energy_train[i]), axis = None)
            features_train.append(conc)

    
    
    load_test = load_files(df_test)
    samples_test = extract_samples(load_test)
    labels_test = extract_labels(df_test)
    samples_test, labels_test = cut_and_pad(samples_test, labels_test)
    samples_test = np.array(samples_test)
    labels_test = np.array(labels_test)
    mfccs_test = compute_mfccs(samples_test, n_mfcc = n_mfcc)
    # energy 
    energy_test = compute_energy(samples_test) 
    features_test=[]
    for i in range(len(mfccs_test)): 
        if len(mfccs_test) == len(energy_test): 
            conc = np.concatenate((mfccs_test[i], energy_test[i]), axis = None)
            features_test.append(conc)
    
    
    return np.array(features_train), labels_train, np.array(features_test), labels_test
    
def encode_labels(labels_train, labels_val, labels_test): 
    
    emotion_enc = {'fear':1, 'disgust':1, 'neutral':0, 'calm':0,  'happy':0, 'sadness':1, 'surprise':0, 'angry':1}
    y_train = pd.Series(labels_train).replace(emotion_enc)
  
    y_test = pd.Series(labels_test).map(emotion_enc)
    y_val = pd.Series(labels_val).map(emotion_enc)
    return y_train, y_val, y_test 


def encode_labels_tess(labels_train, labels_test): 
    
    emotion_enc = {'fear':1, 'disgust':1, 'neutral':0, 'calm':0,  'happy':0, 'sadness':1, 'surprise':0, 'angry':1}
    y_train = pd.Series(labels_train).replace(emotion_enc)
  
    y_test = pd.Series(labels_test).map(emotion_enc)
    return y_train, y_test
    
def standard_scaling(X_train, X_val, X_test): 
  
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
    X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)
    X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)
    return X_train, X_val, X_test 
    
def standard_scaling_tess(X_train, X_test): 
  
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
    X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)
    return X_train, X_test   
    


# Compute dataframes for datasets and split in Train, Val, Test 

In [7]:
#main_path = '/media/helemanc/OS/Users/i2CAT/Desktop/Datasets SER/'
main_path = '/Users/helemanc/Documents/MasterAI/THESIS/Datasets SER'
TESS = os.path.join(main_path, "tess/TESS Toronto emotional speech set data/") 
RAV = os.path.join(main_path, "ravdess-emotional-speech-audio/audio_speech_actors_01-24")
SAVEE = os.path.join(main_path, "savee/ALL/")
CREMA = os.path.join(main_path, "creamd/AudioWAV/")

## RADVESS

In [8]:
lst = []
emotion = []
voc_channel = []
full_path = []
modality = []
intensity = []
actors = []
phrase =[]

for root, dirs, files in tqdm(os.walk(RAV)):
    for file in files:
        try:
            #Load librosa array, obtain mfcss, store the file and the mfcss information in a new array
            # X, sample_rate = librosa.load(os.path.join(root,file), res_type='kaiser_fast')
            # mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0) 
            # The instruction below converts the labels (from 1 to 8) to a series from 0 to 7
            # This is because our predictor needs to start from 0 otherwise it will try to predict also 0.
           
            modal = int(file[1:2])
            vchan = int(file[4:5])
            lab = int(file[7:8])
            ints = int(file[10:11])
            phr = int(file[13:14])
            act = int(file[18:20])
            # arr = mfccs, lab
            # lst.append(arr)
            
            modality.append(modal)
            voc_channel.append(vchan)
            emotion.append(lab) #only labels
            intensity.append(ints)
            phrase.append(phr)
            actors.append(act)
            
            full_path.append((root, file)) # only files
          # If the file is not valid, skip it
        except ValueError:
            continue

25it [00:00, 148.77it/s]


In [9]:
# 01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised
# merge neutral and calm
emotions_list = ['neutral', 'neutral', 'happy', 'sadness', 'angry', 'fear', 'disgust', 'surprise']
emotion_dict = {em[0]+1:em[1] for em in enumerate(emotions_list)}

df = pd.DataFrame([emotion, voc_channel, modality, intensity, actors, actors,phrase, full_path]).T
df.columns = ['emotion', 'voc_channel', 'modality', 'intensity', 'actors', 'gender', 'phrase', 'path']
df['emotion'] = df['emotion'].map(emotion_dict)
df['voc_channel'] = df['voc_channel'].map({1: 'speech', 2:'song'})
df['modality'] = df['modality'].map({1: 'full AV', 2:'video only', 3:'audio only'})
df['intensity'] = df['intensity'].map({1: 'normal', 2:'strong'})
df['actors'] = df['actors']
df['gender'] = df['actors'].apply(lambda x: 'female' if x%2 == 0 else 'male')
df['phrase'] = df['phrase'].map({1: 'Kids are talking by the door', 2:'Dogs are sitting by the door'})
df['path'] = df['path'].apply(lambda x: x[0] + '/' + x[1])

In [10]:
# remove files with noise to apply the same noise to all files for data augmentation 
df = df[~df.path.str.contains('noise')]

In [11]:
df.head()

Unnamed: 0,emotion,voc_channel,modality,intensity,actors,gender,phrase,path
0,angry,speech,audio only,normal,16,female,Dogs are sitting by the door,/Users/helemanc/Documents/MasterAI/THESIS/Data...
1,fear,speech,audio only,normal,16,female,Dogs are sitting by the door,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2,fear,speech,audio only,strong,16,female,Kids are talking by the door,/Users/helemanc/Documents/MasterAI/THESIS/Data...
3,angry,speech,audio only,strong,16,female,Kids are talking by the door,/Users/helemanc/Documents/MasterAI/THESIS/Data...
5,disgust,speech,audio only,normal,16,female,Kids are talking by the door,/Users/helemanc/Documents/MasterAI/THESIS/Data...


In [12]:
# only speech
RAV_df = df
RAV_df = RAV_df.loc[RAV_df.voc_channel == 'speech']

In [13]:
RAV_df.insert(0, "emotion_label", RAV_df.emotion, True)

In [14]:
RAV_df = RAV_df.drop(['emotion', 'voc_channel', 'modality', 'intensity', 'phrase'], 1)

In [15]:
RAV_df

Unnamed: 0,emotion_label,actors,gender,path
0,angry,16,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
1,fear,16,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2,fear,16,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
3,angry,16,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
5,disgust,16,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
...,...,...,...,...
2869,happy,8,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2871,happy,8,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2872,neutral,8,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2873,neutral,8,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...


In [16]:
RAV_train = []
RAV_val = []
RAV_test = []

In [17]:
for index, row in RAV_df.iterrows():
    if row['actors'] in range(1,21): 
        RAV_train.append(row) 
    elif row['actors'] in range(21,23): 
        RAV_val.append(row)
    elif row['actors'] in range(23,25): 
        RAV_test.append(row)
len(RAV_train), len(RAV_val), len(RAV_test)

(1200, 120, 120)

In [18]:
RAV_train = pd.DataFrame(RAV_train)
RAV_val = pd.DataFrame(RAV_val)
RAV_test = pd.DataFrame(RAV_test)

In [19]:
RAV_train = RAV_train.drop(['actors'], 1)
RAV_val = RAV_val.drop(['actors'], 1)
RAV_test = RAV_test.drop(['actors'], 1)

In [20]:
RAV_train.reset_index(drop=True, inplace = True) 
RAV_val.reset_index(drop=True, inplace = True) 
RAV_test.reset_index(drop=True, inplace = True ) 

## SAVEE

In [21]:
# Get the data location for SAVEE
dir_list = os.listdir(SAVEE)

# parse the filename to get the emotions
emotion=[]
path = []
actors = []
gender = []
for i in dir_list:
    actors.append(i[:2])
    if i[-8:-6]=='_a':
        emotion.append('angry')
        gender.append('male')
    elif i[-8:-6]=='_d':
        emotion.append('disgust')
        gender.append('male')
    elif i[-8:-6]=='_f':
        emotion.append('fear')
        gender.append('male')
    elif i[-8:-6]=='_h':
        emotion.append('happy')
        gender.append('male')
    elif i[-8:-6]=='_n':
        emotion.append('neutral')
        gender.append('male')
    elif i[-8:-6]=='sa':
        emotion.append('sadness')
        gender.append('male')
    elif i[-8:-6]=='su':
        emotion.append('surprise')
        gender.append('male') 
    else:
        emotion.append('Unknown') 
    path.append(SAVEE + i)
    
# Now check out the label count distribution 
SAVEE_df = pd.DataFrame(emotion, columns = ['emotion_label'])
                      
SAVEE_df = pd.concat([SAVEE_df,
                      pd.DataFrame(actors, columns = ['actors']),
                      pd.DataFrame(gender, columns = ['gender']), 
                      pd.DataFrame(path, columns = ['path'])], axis = 1)
SAVEE_df.emotion_label.value_counts()

neutral     120
surprise     60
disgust      60
sadness      60
angry        60
fear         60
happy        60
Name: emotion_label, dtype: int64

In [22]:
SAVEE_df.head()

Unnamed: 0,emotion_label,actors,gender,path
0,sadness,JK,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
1,sadness,JK,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2,neutral,DC,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
3,surprise,DC,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
4,neutral,DC,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...


In [23]:
SAVEE_train = []
SAVEE_val = []
SAVEE_test = []

In [24]:
#DC, JE, JK, KL
for index, row in SAVEE_df.iterrows(): 
    if row['actors'] == 'DC' or row ['actors'] == 'JE':
        SAVEE_train.append(row)
    elif row['actors'] == 'JK': 
        SAVEE_val.append(row)
    else: 
        SAVEE_test.append(row)
len(SAVEE_train), len(SAVEE_val), len(SAVEE_test)

(240, 120, 120)

In [25]:
SAVEE_train = pd.DataFrame(SAVEE_train)
SAVEE_val = pd.DataFrame(SAVEE_val)
SAVEE_test = pd.DataFrame(SAVEE_test)

In [26]:
SAVEE_train = SAVEE_train.drop(['actors'], 1)
SAVEE_val = SAVEE_val.drop(['actors'], 1)
SAVEE_test = SAVEE_test.drop(['actors'], 1)

In [27]:
SAVEE_train = SAVEE_train.reset_index(drop=True) 
SAVEE_val = SAVEE_val.reset_index(drop=True) 
SAVEE_test = SAVEE_test.reset_index(drop=True) 

## TESS

In [28]:
dir_list = os.listdir(TESS)
dir_list.sort()
dir_list

path = []
emotion = []
gender = []
actors = []

for i in dir_list:
    fname = os.listdir(TESS + i)
    for f in fname:
        if i == 'OAF_angry':
            emotion.append('angry')
            gender.append('female')
            actors.append('OAF')
        elif i == 'YAF_angry': 
            emotion.append('angry')
            gender.append('female')
            actors.append('YAF')
            
            
        elif i == 'OAF_disgust' :
            emotion.append('disgust')
            gender.append('female')
            actors.append('OAF')
        elif i == 'YAF_disgust': 
            emotion.append('disgust')
            gender.append('female')
            actors.append('YAF')
            
            
        elif i == 'OAF_Fear':
            emotion.append('fear')
            gender.append('female')
            actors.append('OAF')
        elif i == 'YAF_fear': 
            emotion.append('fear')
            gender.append('female')
            actors.append('YAF') 
            
            
        elif i == 'OAF_happy' :
            emotion.append('happy')
            gender.append('female')
            actors.append('OAF')
        elif i == 'YAF_happy': 
            emotion.append('angry')
            gender.append('female')
            actors.append('YAF')            
            
        elif i == 'OAF_neutral':
            emotion.append('neutral')
            gender.append('female')
            actors.append('OAF')   
        elif i == 'YAF_neutral': 
            emotion.append('neutral')
            gender.append('female')
            actors.append('YAF')      
            
                
        elif i == 'OAF_Pleasant_surprise':
            emotion.append('surprise')
            gender.append('female')
            actors.append('OAF')
        
        elif i == 'YAF_pleasant_surprised': 
            emotion.append('surprise')
            gender.append('female')
            actors.append('YAF')            
            
        elif i == 'OAF_Sad':
            emotion.append('sadness')
            gender.append('female')
            actors.append('OAF')
        elif i == 'YAF_sad': 
            emotion.append('sadness')
            gender.append('female')
            actors.append('YAF')            
        else:
            emotion.append('Unknown')
        path.append(TESS + i + "/" + f)

TESS_df = pd.DataFrame(emotion, columns = ['emotion_label'])
TESS_df = pd.concat([TESS_df, pd.DataFrame(gender, columns = ['gender']), 
                     pd.DataFrame(actors, columns= ['actors']),
                     pd.DataFrame(path, columns = ['path'])],axis=1)
TESS_df.emotion_label.value_counts()

angry       1200
sadness      800
neutral      800
disgust      800
surprise     800
fear         800
happy        400
Name: emotion_label, dtype: int64

In [29]:
TESS_df= TESS_df[~TESS_df.path.str.contains('noise')]

In [30]:
TESS_train = []
TESS_test = []

In [31]:
for index, row in TESS_df.iterrows(): 
    if row['actors'] == 'YAF': 
        TESS_train.append(row)
    else: 
        TESS_test.append(row)
len(TESS_train), len(TESS_test)

(1400, 1400)

In [32]:
TESS_train = pd.DataFrame(TESS_train)
TESS_test = pd.DataFrame(TESS_test)

In [33]:
TESS_train = TESS_train.reset_index(drop=True) 
TESS_test  = TESS_test.reset_index(drop=True) 

## CREMA-D

In [34]:
males = [1,
5,
11,
14,
15,
16,
17,
19,
22,
23,
26,
27,
31,
32,
33,
34,
35,
36,
38,
39,
41,
42,
44,
45,
48,
50,
51,
57,
59, 
62, 
64,
65, 
66,
67,
68,
69,
70,
71,
77, 
80, 
81, 
83, 
85, 
86, 
87,
88, 
90]

In [35]:
females = [ 2,
3,
4,
6,
7,
8,
9,
10,
12,
13,
18,
20,
21,
24,
25,
28,
29,
30,
37,
40,
43,
46,
47,
49,
52,
53,
54,
55,
56, 
58, 
60,
61,
63,
72, 
73, 
74, 
75, 
76, 
78, 
79, 
82, 
84, 
89, 
91]

In [36]:
crema_directory_list = os.listdir(CREMA)

file_emotion = []
file_path = []
actors = []
gender = []




for file in crema_directory_list:

    # storing file emotions
    part=file.split('_')
    
    # use only high intensity files
    if "HI" in part[3] :
        actor = part[0][2:]
        actors.append(actor)
        if int(actor) in males:
            gender.append('male')
        else: 
            gender.append('female')
    
        # storing file paths
        file_path.append(CREMA + file)
        if part[2] == 'SAD':
            file_emotion.append('sadness')
        elif part[2] == 'ANG':
            file_emotion.append('angry')
        elif part[2] == 'DIS':
            file_emotion.append('disgust')
        elif part[2] == 'FEA':
            file_emotion.append('fear')
        elif part[2] == 'HAP':
            file_emotion.append('happy')
        elif part[2] == 'NEU':
            file_emotion.append('neutral')
        else:
            file_emotion.append('Unknown')

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['emotion_label'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['path'])
actors_df = pd.DataFrame(actors, columns=['actors'])
gender_df = pd.DataFrame(gender, columns=['gender'])                      
Crema_df = pd.concat([emotion_df, actors_df, gender_df, path_df], axis=1)
Crema_df.head()

Unnamed: 0,emotion_label,actors,gender,path
0,sadness,28,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
1,angry,48,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2,disgust,27,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
3,disgust,32,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...
4,happy,80,male,/Users/helemanc/Documents/MasterAI/THESIS/Data...


In [37]:
Crema_df.shape

(455, 4)

In [38]:
actor_files = {}

for index, row in Crema_df.iterrows():
    actor = row['actors']
    if actor not in actor_files.keys(): 
        actor_files[actor] = 1
    else: 
        actor_files[actor]+=1

In [39]:
actor_files

{'28': 5,
 '48': 5,
 '27': 5,
 '32': 5,
 '80': 5,
 '59': 5,
 '39': 5,
 '43': 5,
 '56': 5,
 '31': 5,
 '24': 5,
 '51': 5,
 '44': 5,
 '77': 5,
 '62': 5,
 '88': 5,
 '55': 5,
 '40': 5,
 '13': 5,
 '06': 5,
 '35': 5,
 '20': 5,
 '87': 5,
 '09': 5,
 '78': 5,
 '65': 5,
 '70': 5,
 '23': 5,
 '36': 5,
 '01': 5,
 '14': 5,
 '47': 5,
 '52': 5,
 '58': 5,
 '37': 5,
 '22': 5,
 '71': 5,
 '64': 5,
 '50': 5,
 '45': 5,
 '57': 5,
 '42': 5,
 '29': 5,
 '53': 5,
 '46': 5,
 '33': 5,
 '26': 5,
 '34': 5,
 '21': 5,
 '15': 5,
 '08': 5,
 '86': 5,
 '79': 5,
 '63': 5,
 '76': 5,
 '25': 5,
 '30': 5,
 '07': 5,
 '12': 5,
 '41': 5,
 '54': 5,
 '89': 5,
 '81': 5,
 '49': 5,
 '38': 5,
 '05': 5,
 '10': 5,
 '61': 5,
 '74': 5,
 '69': 5,
 '18': 5,
 '83': 5,
 '17': 5,
 '02': 5,
 '73': 5,
 '66': 5,
 '84': 5,
 '91': 5,
 '90': 5,
 '85': 5,
 '03': 5,
 '16': 5,
 '67': 5,
 '72': 5,
 '68': 5,
 '82': 5,
 '19': 5,
 '11': 5,
 '04': 5,
 '75': 5,
 '60': 5}

In [40]:
count_males = 0 
count_females = 0 
male_list = []
for index, row in Crema_df.iterrows(): 
    gender = row['gender']
    actor = row['actors']
    if gender == 'male':
        count_males +=1
        if actor not in male_list: 
            male_list.append(actor)
    else: 
        count_females +=1

In [41]:
count_males, count_females

(235, 220)

Since there are more males than females we will remove randomly 3 male actors (since there are exactly 5 audio files per actor)

In [42]:
import random 
'''
random.seed(42)
males_to_remove = random.sample(male_list, 3)
males_to_remove
'''
males_to_remove = ['17', '80', '88']

In [43]:
new_df = []
for index, row in Crema_df.iterrows(): 
    if row['actors'] not in males_to_remove: 
        new_df.append(row)

In [44]:
CREMA_df = pd.DataFrame(new_df)

In [45]:
for index, row in CREMA_df.iterrows(): 
    if row['actors'] == '17': 
        print("Elements not removed")

In [46]:
count_males = 0 
count_females = 0 
male_list = []
female_list = []
for index, row in CREMA_df.iterrows(): 
    gender = row['gender']
    actor = row['actors']
    if gender == 'male':
        count_males +=1
        if actor not in male_list: 
            male_list.append(actor)
    else: 
        count_females +=1
        if actor not in female_list: 
            female_list.append(actor)

In [47]:
count_males, count_females

(220, 220)

In [48]:
len(female_list)

44

In [49]:
len(male_list)

44

In [50]:
CREMA_train = []
CREMA_val = []
CREMA_test = []

In [51]:
females_train = random.sample(female_list, 32)
males_train = random.sample(male_list, 32)

# remove the elements assigned to train 
for element in females_train:
    if element in female_list:
        female_list.remove(element)
        
for element in males_train:
    if element in male_list:
        male_list.remove(element)

         
females_val = random.sample(female_list, 6) 
males_val = random.sample(male_list, 6) 

# remove the elements assigned to val
for element in females_val:
    if element in female_list:
        female_list.remove(element)
        
for element in males_val:
    if element in male_list:
        male_list.remove(element)
        
females_test = random.sample(female_list, 6) 
males_test = random.sample(male_list, 6)        

In [52]:
females_train, males_train, females_val, males_val, females_test, males_test

(['47',
  '10',
  '78',
  '46',
  '06',
  '49',
  '75',
  '29',
  '61',
  '37',
  '18',
  '13',
  '72',
  '12',
  '03',
  '25',
  '54',
  '07',
  '76',
  '60',
  '63',
  '30',
  '82',
  '84',
  '09',
  '21',
  '24',
  '89',
  '20',
  '55',
  '74',
  '73'],
 ['90',
  '15',
  '51',
  '70',
  '16',
  '66',
  '31',
  '81',
  '69',
  '71',
  '83',
  '42',
  '39',
  '87',
  '77',
  '45',
  '62',
  '11',
  '41',
  '86',
  '65',
  '35',
  '67',
  '23',
  '27',
  '33',
  '14',
  '36',
  '38',
  '44',
  '19',
  '68'],
 ['53', '91', '43', '58', '56', '52'],
 ['34', '22', '26', '01', '85', '32'],
 ['40', '28', '04', '79', '02', '08'],
 ['48', '50', '59', '64', '57', '05'])

In [53]:
train = females_train + males_train 
val = females_val + males_val 
test = females_test + males_test

In [54]:
for index, row in CREMA_df.iterrows(): 
    gender = row['gender']
    actor = row['actors']
    if actor in train: 
        CREMA_train.append(row)
    elif actor in val: 
        CREMA_val.append(row)
    else:
        CREMA_test.append(row)

In [55]:
CREMA_train = pd.DataFrame(CREMA_train) 
CREMA_val = pd.DataFrame(CREMA_val) 
CREMA_test = pd.DataFrame(CREMA_test)

In [56]:
CREMA_train.shape, CREMA_val.shape, CREMA_test.shape

((320, 4), (60, 4), (60, 4))

In [57]:
CREMA_train = CREMA_train.reset_index(drop=True) 
CREMA_val = CREMA_val.reset_index(drop = True) 

# Random Search parameters

In [57]:
param_grid_clf = {'C': [0.1,1, 10, 100], 'kernel': ['rbf',  'linear']}
svc = SVC()

# Experiment 1.1 : RAVDESS

In [59]:
df_train = RAV_train
df_val = RAV_val
df_test = RAV_test

In [60]:
df_train.head()

Unnamed: 0,emotion_label,gender,path
0,angry,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
1,fear,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
2,fear,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
3,angry,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...
4,disgust,female,/Users/helemanc/Documents/MasterAI/THESIS/Data...


In [61]:
df_train['emotion_label'].value_counts()

neutral     240
surprise    160
sadness     160
angry       160
fear        160
happy       160
disgust     160
Name: emotion_label, dtype: int64

In [62]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [63]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13)

100%|██████████| 1200/1200 [00:05<00:00, 236.96it/s]
100%|██████████| 1200/1200 [00:09<00:00, 121.94it/s]
100%|██████████| 1200/1200 [00:00<00:00, 1322.42it/s]
100%|██████████| 120/120 [00:00<00:00, 416.51it/s]
100%|██████████| 120/120 [00:01<00:00, 113.93it/s]
100%|██████████| 120/120 [00:00<00:00, 1396.36it/s]
100%|██████████| 120/120 [00:00<00:00, 425.43it/s]
100%|██████████| 120/120 [00:01<00:00, 89.67it/s]
100%|██████████| 120/120 [00:00<00:00, 783.93it/s]


In [64]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [65]:
values, counts = np.unique(y_train, return_counts=True)

In [66]:
counts

array([560, 640])

In [62]:
np.size(y_val)

120

In [64]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [65]:
X_train.shape

(1200, 13)

In [66]:
y_train.shape

(1200,)

## Shuffle training data

In [67]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [68]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [69]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [70]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 3.776s



In [71]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 10}

In [72]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 1.0714285714285714, 1: 0.9375})

## Testing

In [73]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.625


## Save best model 

In [74]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_1.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [75]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.2 : RAVDESS noise

## Read dataframes

In [76]:
preprocess_path = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/ravdess"
df_train = pd.read_csv(os.path.join(preprocess_path,"df_train.csv"))
df_val = pd.read_csv(os.path.join(preprocess_path,"df_val.csv"))
df_test = pd.read_csv(os.path.join(preprocess_path,"df_test.csv"))  

## Feature Extraction

In [77]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13) # 13

100%|█████████████████████████████████████| 2400/2400 [00:00<00:00, 2556.93it/s]
100%|███████████████████████████████████████| 2400/2400 [01:09<00:00, 34.45it/s]
100%|██████████████████████████████████████| 2400/2400 [00:03<00:00, 779.70it/s]
100%|███████████████████████████████████████| 120/120 [00:00<00:00, 1431.44it/s]
100%|█████████████████████████████████████████| 120/120 [00:03<00:00, 35.94it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 790.88it/s]
100%|███████████████████████████████████████| 120/120 [00:00<00:00, 1097.22it/s]
100%|█████████████████████████████████████████| 120/120 [00:03<00:00, 31.29it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 577.51it/s]


In [78]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [79]:
np.size(y_val)

120

In [80]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [81]:
X_train.shape

(2400, 13)

## Shuffle training data

In [82]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [83]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [84]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [85]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 7.774s



In [86]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 10}

In [87]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 1.0714285714285714, 1: 0.9375})

## Testing

In [88]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.6083333333333333


## Save best model 

In [89]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_2.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [90]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.3: TESS

In [67]:
df_train = TESS_train
df_test = TESS_test

In [70]:
df_train['emotion_label'].value_counts()

angry       400
surprise    200
sadness     200
fear        200
neutral     200
disgust     200
Name: emotion_label, dtype: int64

In [92]:
df_train.reset_index(drop = True, inplace = True) 
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [71]:
X_train, y_train, X_test, y_test = feature_extractor_tess(df_train,  df_test, 13)

100%|██████████| 1400/1400 [00:23<00:00, 60.11it/s]
100%|██████████| 1400/1400 [00:15<00:00, 92.85it/s] 
100%|██████████| 1400/1400 [00:00<00:00, 1450.61it/s]
100%|██████████| 1400/1400 [00:02<00:00, 541.49it/s]
100%|██████████| 1400/1400 [00:14<00:00, 95.95it/s] 
100%|██████████| 1400/1400 [00:00<00:00, 1404.50it/s]


In [72]:
y_train, y_test = encode_labels_tess(y_train, y_test)

In [73]:
values, counts = np.unique(y_train, return_counts=True)

In [74]:
counts

array([ 400, 1000])

In [95]:
np.size(y_test)

1400

In [96]:
X_train, X_test = standard_scaling_tess(X_train,  X_test)

In [97]:
X_train.shape

(1400, 13)

## Shuffle training data

In [98]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [99]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [100]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [101]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 0.848s



In [102]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 10}

In [103]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 1.75, 1: 0.7})

## Testing

In [104]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5564285714285714


## Save best model 

In [105]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_3.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [106]:
'''
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_3.pkl"

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

'\npkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_3.pkl"\n\n# Load from file\nwith open(pkl_filename, \'rb\') as file:\n    pickle_model = pickle.load(file)\n'

# Experiment 1.4: TESS noise

## Read dataframes

In [107]:
preprocess_path = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/tess"
df_train = pd.read_csv(os.path.join(preprocess_path,"df_train.csv"))
df_test = pd.read_csv(os.path.join(preprocess_path,"df_test.csv"))  

## Feature Extraction

In [108]:
X_train, y_train, X_test, y_test = feature_extractor_tess(df_train, df_test, 13) # 13

100%|███████████████████████████████████████| 2800/2800 [00:47<00:00, 59.41it/s]
100%|███████████████████████████████████████| 2800/2800 [01:12<00:00, 38.37it/s]
100%|█████████████████████████████████████| 2800/2800 [00:02<00:00, 1247.92it/s]
100%|█████████████████████████████████████| 1400/1400 [00:00<00:00, 1836.78it/s]
100%|███████████████████████████████████████| 1400/1400 [00:37<00:00, 37.54it/s]
100%|█████████████████████████████████████| 1400/1400 [00:01<00:00, 1255.27it/s]


In [109]:
y_train,  y_test = encode_labels_tess(y_train,  y_test)

In [110]:
np.size(y_train)

2800

In [111]:
X_train, X_test = standard_scaling_tess(X_train,  X_test)

In [112]:
X_train.shape

(2800, 13)

## Shuffle training data

In [113]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [114]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [115]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [116]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 4.405s



In [117]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 10}

In [118]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 1.75, 1: 0.7})

## Testing

In [119]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5714285714285714


## Save best model 

In [120]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_4.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [121]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.5: SAVEE

In [75]:
df_train = SAVEE_train
df_val = SAVEE_val
df_test = SAVEE_test

In [76]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [124]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13)

100%|█████████████████████████████████████████| 240/240 [00:09<00:00, 24.33it/s]
100%|█████████████████████████████████████████| 240/240 [00:03<00:00, 77.98it/s]
100%|███████████████████████████████████████| 240/240 [00:00<00:00, 1250.73it/s]
100%|█████████████████████████████████████████| 120/120 [00:06<00:00, 17.93it/s]
100%|█████████████████████████████████████████| 120/120 [00:03<00:00, 38.04it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 666.43it/s]
100%|█████████████████████████████████████████| 120/120 [00:04<00:00, 26.48it/s]
100%|█████████████████████████████████████████| 120/120 [00:03<00:00, 30.58it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 629.45it/s]


In [125]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [126]:
np.size(y_val)

120

In [127]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [128]:
X_train.shape

(240, 13)

## Shuffle training data

In [129]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [130]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [131]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [132]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 0.323s



In [133]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 100}

In [134]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=100, class_weight={0: 1.0, 1: 1.0})

## Testing

In [135]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5166666666666667


## Save best model 

In [136]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_5.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [137]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.6: SAVEE noise

## Read dataframes

In [138]:
preprocess_path = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/savee"
df_train = pd.read_csv(os.path.join(preprocess_path,"df_train.csv"))
df_val = pd.read_csv(os.path.join(preprocess_path,"df_val.csv"))
df_test = pd.read_csv(os.path.join(preprocess_path,"df_test.csv"))  

## Feature Extraction

In [139]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13) # 13

100%|█████████████████████████████████████████| 480/480 [00:22<00:00, 21.74it/s]
100%|█████████████████████████████████████████| 480/480 [00:10<00:00, 45.88it/s]
100%|████████████████████████████████████████| 480/480 [00:00<00:00, 947.87it/s]
100%|█████████████████████████████████████████| 120/120 [00:05<00:00, 20.45it/s]
100%|█████████████████████████████████████████| 120/120 [00:03<00:00, 36.62it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 821.49it/s]
100%|█████████████████████████████████████████| 120/120 [00:04<00:00, 25.51it/s]
100%|█████████████████████████████████████████| 120/120 [00:02<00:00, 42.07it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 658.45it/s]


In [140]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [141]:
np.size(y_val)

120

In [142]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [143]:
X_train.shape

(480, 13)

## Shuffle training data

In [144]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [145]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [146]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [147]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 0.648s



In [148]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 10}

In [149]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 1.0, 1: 1.0})

## Testing

In [150]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5


## Save best model 

In [151]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_6.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [152]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.7: CREMA 

In [77]:
df_train = CREMA_train
df_val = CREMA_val
df_test = CREMA_test

In [78]:
df_train['emotion_label'].value_counts()

disgust    64
sadness    64
angry      64
fear       64
happy      64
Name: emotion_label, dtype: int64

In [154]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [155]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13)

100%|███████████████████████████████████████| 320/320 [00:00<00:00, 1807.88it/s]
100%|█████████████████████████████████████████| 320/320 [00:09<00:00, 33.01it/s]
100%|████████████████████████████████████████| 320/320 [00:00<00:00, 932.83it/s]
100%|█████████████████████████████████████████| 60/60 [00:00<00:00, 2039.63it/s]
100%|███████████████████████████████████████████| 60/60 [00:02<00:00, 27.68it/s]
100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 540.56it/s]
100%|█████████████████████████████████████████| 60/60 [00:00<00:00, 1789.19it/s]
100%|███████████████████████████████████████████| 60/60 [00:01<00:00, 32.55it/s]
100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 483.17it/s]


In [156]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [157]:
np.size(y_val)

60

In [158]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [159]:
X_train.shape

(320, 13)

## Shuffle training data

In [160]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [161]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [162]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [163]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 0.403s



In [164]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'linear', 'C': 10}

In [165]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 2.5, 1: 0.625}, kernel='linear')

## Testing

In [166]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.7


## Save best model 

In [167]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_7.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [168]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.8: CREMA - noise

In [63]:
#preprocess_path = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/crema"
preprocess_path = "/Users/helemanc/PycharmProjects/ambient-intelligence/notebooks_binary_model/df_csv_noise/crema" 
df_train = pd.read_csv(os.path.join(preprocess_path,"df_train.csv"))
df_val = pd.read_csv(os.path.join(preprocess_path,"df_val.csv"))
df_test = pd.read_csv(os.path.join(preprocess_path,"df_test.csv"))  

## Feature Extraction

In [64]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13) # 13

100%|██████████| 640/640 [00:01<00:00, 494.62it/s]
100%|██████████| 640/640 [00:06<00:00, 91.44it/s] 
100%|██████████| 640/640 [00:00<00:00, 1453.08it/s]
100%|██████████| 60/60 [00:00<00:00, 334.44it/s]
100%|██████████| 60/60 [00:01<00:00, 59.98it/s]
100%|██████████| 60/60 [00:00<00:00, 943.63it/s]
100%|██████████| 60/60 [00:00<00:00, 387.33it/s]
100%|██████████| 60/60 [00:00<00:00, 83.50it/s]
100%|██████████| 60/60 [00:00<00:00, 997.52it/s]


In [65]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [66]:
np.size(y_val)

60

In [67]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [68]:
X_train.shape

(640, 13)

## Hypeparameter optimization

In [70]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [176]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [177]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 0.346s



In [178]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 10}

In [71]:
'''
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights, probability=True)
best_clf.fit(X_train, y_train)
'''
best_clf =SVC(C=10, kernel='rbf', class_weight=class_weights, probability=True)
best_clf.fit(X_train, y_train)

SVC(C=10, class_weight={0: 2.5, 1: 0.625}, probability=True)

## Testing

In [180]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.8333333333333334


In [90]:
print(best_clf.predict_proba(X_test)[0][1])

0.8107118558926012


In [88]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.58      0.58      0.58        12
           1       0.90      0.90      0.90        48

    accuracy                           0.83        60
   macro avg       0.74      0.74      0.74        60
weighted avg       0.83      0.83      0.83        60



## Save best model 

In [85]:
#pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_8.pkl"
pkl_filename = "/Users/helemanc/Desktop/re_run_best_svm/model_6_8.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [182]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.9: RAVDESS - TESS - SAVEE

In [233]:
df_train = pd.concat([RAV_train, SAVEE_train, TESS_train])
df_val = pd.concat([RAV_val, SAVEE_val])
df_test = pd.concat([RAV_test, SAVEE_test, TESS_test])
#df_test = pd.concat([RAV_train, SAVEE_test])

In [234]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [235]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13)

100%|██████████████████████████████████████| 2840/2840 [00:22<00:00, 124.91it/s]
100%|███████████████████████████████████████| 2840/2840 [00:39<00:00, 71.51it/s]
100%|█████████████████████████████████████| 2840/2840 [00:01<00:00, 1535.13it/s]
100%|█████████████████████████████████████████| 240/240 [00:04<00:00, 58.69it/s]
100%|█████████████████████████████████████████| 240/240 [00:03<00:00, 79.52it/s]
100%|███████████████████████████████████████| 240/240 [00:00<00:00, 1025.97it/s]
100%|██████████████████████████████████████| 1640/1640 [00:03<00:00, 434.14it/s]
100%|███████████████████████████████████████| 1640/1640 [00:22<00:00, 73.15it/s]
100%|█████████████████████████████████████| 1640/1640 [00:00<00:00, 1746.69it/s]


In [236]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [237]:
np.size(y_test)

1640

In [238]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [239]:
X_train.shape

(2840, 13)

## Shuffle training data

In [240]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [241]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [242]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [243]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 6.504s



In [244]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 100}

In [245]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=100, class_weight={0: 1.3148148148148149, 1: 0.8068181818181818})

## Testing

In [246]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5463414634146342


## Save best model 

In [247]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_9.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [248]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 2.10: RAVDESS - TESS - SAVEE noise

## Read dataframes

In [249]:
preprocess_path_rav = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/ravdess"
preprocess_path_savee = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/savee"
preprocess_path_tess = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/tess"

df_train_rav = pd.read_csv(os.path.join(preprocess_path_rav,"df_train.csv"))
df_val_rav = pd.read_csv(os.path.join(preprocess_path_rav,"df_val.csv"))
df_test_rav = pd.read_csv(os.path.join(preprocess_path_rav,"df_test.csv"))  

df_train_tess = pd.read_csv(os.path.join(preprocess_path_tess,"df_train.csv"))
df_test_tess= pd.read_csv(os.path.join(preprocess_path_tess,"df_test.csv"))  

df_train_savee = pd.read_csv(os.path.join(preprocess_path_savee,"df_train.csv"))
df_val_savee = pd.read_csv(os.path.join(preprocess_path_savee,"df_val.csv"))
df_test_savee = pd.read_csv(os.path.join(preprocess_path_savee,"df_test.csv"))  

In [250]:
df_train = pd.concat([df_train_rav, df_train_savee, df_train_tess])
df_val = pd.concat([df_val_rav, df_val_savee])
#df_test = pd.concat([df_test_rav, df_test_savee, df_test_tess])
df_test = pd.concat([RAV_train, SAVEE_test])

In [251]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Shuffle training data

In [252]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [253]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [254]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [255]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 5.314s



In [256]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 100}

In [257]:
best_clf = SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=100, class_weight={0: 1.3148148148148149, 1: 0.8068181818181818})

## Testing

In [258]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5469512195121952


## Save best model 

In [259]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_10.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [212]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.11: RAVDESS - TESS - SAVEE - CREMA

In [199]:
df_train = pd.concat([RAV_train, SAVEE_train, TESS_train, CREMA_train])
df_val = pd.concat([RAV_val, SAVEE_val, CREMA_val])
df_test = pd.concat([RAV_test, SAVEE_test, TESS_test, CREMA_test])
#df_test = pd.concat([RAV_train, SAVEE_test])

In [200]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [201]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13)

100%|███████████████████████████████████████| 3160/3160 [00:31<00:00, 99.43it/s]
100%|███████████████████████████████████████| 3160/3160 [00:58<00:00, 53.72it/s]
100%|█████████████████████████████████████| 3160/3160 [00:02<00:00, 1539.54it/s]
100%|█████████████████████████████████████████| 300/300 [00:04<00:00, 68.05it/s]
100%|█████████████████████████████████████████| 300/300 [00:05<00:00, 54.92it/s]
100%|███████████████████████████████████████| 300/300 [00:00<00:00, 1045.02it/s]
100%|██████████████████████████████████████| 1700/1700 [00:04<00:00, 378.68it/s]
100%|███████████████████████████████████████| 1700/1700 [00:29<00:00, 58.51it/s]
100%|█████████████████████████████████████| 1700/1700 [00:01<00:00, 1513.97it/s]


In [202]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [203]:
np.size(y_test)

1700

In [204]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [205]:
X_train.shape

(3160, 13)

## Shuffle training data

In [206]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [207]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [208]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [209]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 227.239s



In [210]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 100}

In [211]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=100, class_weight={0: 1.381118881118881, 1: 0.7837301587301587})

## Testing

In [212]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.5152941176470588


## Save best model 

In [213]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_11.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [214]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"

# Experiment 1.12:  RAVDESS - TESS - SAVEE - CREMA noise

## Read dataframes

In [215]:
preprocess_path_rav = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/ravdess"
preprocess_path_savee = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/savee"
preprocess_path_tess = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/tess"
preprocess_path_crema = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/crema"

df_train_rav = pd.read_csv(os.path.join(preprocess_path_rav,"df_train.csv"))
df_val_rav = pd.read_csv(os.path.join(preprocess_path_rav,"df_val.csv"))
df_test_rav = pd.read_csv(os.path.join(preprocess_path_rav,"df_test.csv"))  

df_train_tess = pd.read_csv(os.path.join(preprocess_path_tess,"df_train.csv"))
df_test_tess= pd.read_csv(os.path.join(preprocess_path_tess,"df_test.csv"))  

df_train_savee = pd.read_csv(os.path.join(preprocess_path_savee,"df_train.csv"))
df_val_savee = pd.read_csv(os.path.join(preprocess_path_savee,"df_val.csv"))
df_test_savee = pd.read_csv(os.path.join(preprocess_path_savee,"df_test.csv"))  

df_train_crema = pd.read_csv(os.path.join(preprocess_path_crema,"df_train.csv"))
df_val_crema = pd.read_csv(os.path.join(preprocess_path_crema,"df_val.csv"))
df_test_crema = pd.read_csv(os.path.join(preprocess_path_crema,"df_test.csv")) 

In [216]:
df_train = pd.concat([df_train_rav, df_train_savee, df_train_tess, df_train_crema])
df_val = pd.concat([df_val_rav, df_val_savee, df_train_crema, df_train_crema])
#df_test = pd.concat([df_test_rav, df_test_savee, df_test_tess])
#df_test = pd.concat([df_test_rav, df_test_savee, df_test_crema ])
df_test = pd.concat([df_test_rav, df_test_savee ])

In [217]:
df_train.reset_index(drop = True, inplace = True) 
df_val.reset_index(drop = True, inplace = True)
df_test.reset_index(drop = True, inplace = True)

## Feature Extraction

In [218]:
X_train, y_train, X_val, y_val, X_test, y_test = feature_extractor(df_train, df_val, df_test, 13) # 13

100%|███████████████████████████████████████| 6320/6320 [01:13<00:00, 86.21it/s]
100%|███████████████████████████████████████| 6320/6320 [02:58<00:00, 35.47it/s]
100%|█████████████████████████████████████| 6320/6320 [00:05<00:00, 1115.40it/s]
100%|██████████████████████████████████████| 1520/1520 [00:06<00:00, 224.50it/s]
100%|███████████████████████████████████████| 1520/1520 [00:43<00:00, 34.81it/s]
100%|█████████████████████████████████████| 1520/1520 [00:01<00:00, 1126.68it/s]
100%|█████████████████████████████████████████| 240/240 [00:05<00:00, 46.86it/s]
100%|█████████████████████████████████████████| 240/240 [00:06<00:00, 34.73it/s]
100%|████████████████████████████████████████| 240/240 [00:00<00:00, 855.58it/s]


In [219]:
X_train[0].shape

(13,)

In [220]:
y_train, y_val, y_test = encode_labels(y_train, y_val, y_test)

In [221]:
np.size(y_val)

1520

In [222]:
X_train, X_val, X_test = standard_scaling(X_train, X_val, X_test)

In [223]:
X_train.shape

(6320, 13)

## Shuffle training data

In [224]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

## Hypeparameter optimization

In [225]:
# classweight 
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = {l:c for l,c in zip(np.unique(y_train), class_weights)}

In [226]:
rand_search = RandomizedSearchCV(estimator=svc, 
                                 param_distributions=param_grid_clf, 
                                 n_jobs = -1, 
                                 cv=KFold(3))

In [227]:
print("Performing Randomized Search...")
t0 = time()
rand_search.fit(X_train, y_train) 
print("\nDone in %0.3fs" % (time()-t0))
print() 

Performing Randomized Search...

Done in 1072.045s



In [228]:
best_params = rand_search.best_params_
best_clf = rand_search.best_estimator_
best_params

{'kernel': 'rbf', 'C': 100}

In [229]:
best_clf =SVC(C=best_params.get('C'), kernel=best_params.get('kernel'), class_weight=class_weights)
best_clf.fit(X_train, y_train)

SVC(C=100, class_weight={0: 1.381118881118881, 1: 0.7837301587301587})

## Testing

In [230]:
print("Testing...")
pred = best_clf.predict(X_test) 
accuracy = best_clf.score(X_test, y_test) 
print("Accuracy: %s" %str(accuracy))

Testing...
Accuracy: 0.575


## Save best model 

In [231]:
pkl_filename = "/home/helemanc/Desktop/Binary_Model/models_experiments/Experiment_6/model_6_12.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(best_clf, file)

In [232]:
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n"