In [27]:
# libraries
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, Flatten, Input
from tensorflow.keras.layers import Activation, BatchNormalization
from tensorflow.keras.layers import Conv1D, Conv2D, LSTM
from tensorflow.keras.layers import AveragePooling1D, GlobalAveragePooling2D, MaxPooling1D
from tensorflow.keras.models import Model, model_from_json, Sequential

# added this to plot
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [1]:
'''
Data Processing Part 3
Splitting with MFCC 40 features
Labels: Polarity
'''

import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # pickle can execute arbitrary code while loading; only use on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reference table read from the data-path CSV.
ref_data_path = pd.read_csv('./Data_Array_Storage/Data_path.csv')

# Cached MFCC-40 artefacts: features, noise features and the error list
# (meaning taken from the file names).
df_features = _read_pickle('./Data_Array_Storage/data_features_mfcc40.pkl')
df_features_noise = _read_pickle('./Data_Array_Storage/data_features_noise_mfcc40.pkl')
error_list = _read_pickle('./Data_Array_Storage/error_list_mfcc40.pkl')

print("Error list: ", len(error_list))

# NumPy array copies of the two feature collections.
df_features_array, df_features_noise_array = np.array(df_features), np.array(df_features_noise)

Error list:  0


In [2]:
# Labels: X_full below stacks two feature sets, so the reference table is
# stacked twice to provide one label row per feature row.
# NOTE(review): this assumes both feature sets follow the row order of
# ref_data_path - confirm against the notebook that produced the pickles.
df_y_full = pd.concat((ref_data_path, ref_data_path), axis=0)
print('df_y_full: ', df_y_full.shape)

# Features: df_features followed by df_features_noise along the sample axis.
X_full = np.concatenate((df_features, df_features_noise), axis=0)
print('X_full: ', X_full.shape)

# Fail fast if features and labels are misaligned; otherwise every later
# split would silently pair samples with the wrong label.
assert len(df_y_full) == len(X_full), (
    f'label rows ({len(df_y_full)}) != feature rows ({len(X_full)})'
)

# Select the polarity column by name instead of dropping every other column,
# so an added or renamed column in the CSV cannot change the shape of y_full.
y_full = df_y_full['polarity'].to_numpy()

print('y_full after drop: ', y_full[:5])

df_y_full:  (24324, 7)
X_full:  (24324, 216, 40)
y_full after drop:  ['negative' 'negative' 'neutral' 'positive' 'neutral']


In [3]:
# Flatten each sample to one row. The row count is read from the array rather
# than hard-coded, so the cell keeps working when the dataset size changes.
new_X_full = X_full.reshape((X_full.shape[0], -1))

In [4]:
new_X_full.shape

(24324, 8640)

In [5]:
y_full.shape

(24324,)

In [6]:
y_full

array(['negative', 'negative', 'neutral', ..., 'positive', 'neutral',
       'negative'], dtype=object)

In [7]:
# Keep only the first 100 flattened samples as a small working subset.
small_X = new_X_full[:100]

In [8]:
# First 100 labels, i.e. the same rows as the 100-sample feature subset.
small_Y = y_full[:100]

In [18]:
# Counter is imported in this cell because this is its first use in the
# notebook; without it a fresh-kernel "Run All" fails here with a NameError.
from collections import Counter

# Number of samples per label in the full label vector.
counter = Counter(y_full)
print(counter)

Counter({'negative': 15384, 'positive': 5150, 'neutral': 3790})


In [None]:
!pip install imbalanced-learn

In [38]:
from collections import Counter
from imblearn.under_sampling import NearMiss

# NearMiss under-sampler; "not minority" resamples every class except the
# smallest one.
undersample = NearMiss(sampling_strategy="not minority")
X_under, y_under = undersample.fit_resample(small_X, small_Y)

# Per-class sample counts after resampling.
counter = Counter(y_under)
print(counter)

Counter({'negative': 17, 'neutral': 17, 'positive': 17})


In [39]:
X_under.shape

(51, 8640)

In [40]:
# Shuffled hold-out split of the under-sampled data with a 10% test share;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_under, y_under, test_size=0.1, shuffle=True, random_state=42
)

In [13]:
X_train.shape

(45, 8640)

In [14]:
# Restore the 3-D layout of the flattened training rows. All dimensions are
# read from the arrays instead of the literals (45, 216, 40), so the cell does
# not break when the split size or the feature shape changes.
X_train_final = X_train.reshape((X_train.shape[0], X_full.shape[1], X_full.shape[2]))

In [16]:
X_train_final.shape

(45, 216, 40)

In [17]:
X_test.shape

(6, 8640)

In [18]:
# Restore the 3-D layout of the flattened test rows. All dimensions are read
# from the arrays instead of the literals (6, 216, 40), so the cell does not
# break when the split size or the feature shape changes.
X_test_final = X_test.reshape((X_test.shape[0], X_full.shape[1], X_full.shape[2]))

In [19]:
X_test_final.shape

(6, 216, 40)

In [30]:
y_train[:5]

array(['positive', 'positive', 'negative', 'negative', 'negative'],
      dtype=object)

In [31]:
y_test[:5]

array(['negative', 'neutral', 'positive', 'positive', 'negative'],
      dtype=object)

In [26]:
# one hot encode the target 
lb = LabelEncoder()
y_train_hot = tf.keras.utils.to_categorical(lb.fit_transform(y_train)) # tf.keras.utils.to_categorical
y_test_hot = tf.keras.utils.to_categorical(lb.transform(y_test))

In [21]:
y_train_hot[:5]

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

In [22]:
y_test_hot[:5]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)

In [25]:
X_train.shape[0]

45

In [41]:
# Per-sample shape taken from the un-flattened feature array (X_full is 3-D).
sample_shape = X_full.shape[1:]

# Undo the flattening so each sample has its original 2-D layout again.
X_train_final = X_train.reshape((X_train.shape[0],) + sample_shape)
X_test_final = X_test.reshape((X_test.shape[0],) + sample_shape)

In [42]:
X_train_final.shape

(45, 216, 40)

In [44]:
X_test_final.shape

(6, 216, 40)

In [29]:
def model_d_conv1d(input_shape, num_classes=3):
    """Build and compile the 1-D convolutional classifier.

    The network is a Sequential stack of nine Conv1D layers (kernel size 3,
    'same' padding, 32 up to 256 filters) interleaved with ReLU activations,
    batch normalisation, dropout (rate 0.25) and three max-pooling stages
    (pool size 3). The result is flattened and fed to a softmax Dense layer.

    Args:
        input_shape: Shape of one sample without the batch dimension; it is
            passed as `input_shape` to the first Conv1D layer.
        num_classes: Number of units of the final softmax layer. Defaults to
            3, the value that was previously hard-coded.

    Returns:
        The model compiled with categorical cross-entropy loss, an Adam
        optimizer with default settings and the 'accuracy' metric.
    """
    model = Sequential()

    # Block 1: two 32-filter convolutions, then normalise / regularise / pool.
    model.add(Conv1D(32, 3, padding='same', input_shape=input_shape))
    model.add(Activation('relu'))
    model.add(Conv1D(32, 3, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(MaxPooling1D(pool_size=3))

    # Block 2: two 64-filter convolutions; here dropout and pooling come
    # before the activation, exactly as in the original layer order.
    model.add(Conv1D(64, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(Conv1D(64, 3, padding='same'))
    model.add(Dropout(0.25))
    model.add(MaxPooling1D(pool_size=3))
    model.add(Activation('relu'))

    # Block 3: two more 64-filter convolutions with batch norm, dropout, pooling.
    model.add(Conv1D(64, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(Conv1D(64, 3, padding='same'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(MaxPooling1D(pool_size=3))

    # Block 4: widen to 128 and 256 filters without further pooling.
    model.add(Conv1D(128, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(Conv1D(128, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(Conv1D(256, 3, padding='same'))
    model.add(Activation('relu'))

    # Classification head: one unit per target class.
    model.add(Flatten())
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    # Adam with its default hyper-parameters.
    optimizer = tf.keras.optimizers.Adam()

    # categorical_crossentropy expects one-hot encoded targets.
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model

In [31]:
# Shape of a single sample: everything after the leading sample axis of the
# 3-D training array.
input_shape = X_train_final.shape[1:]

In [32]:
# Build and compile the Conv1D classifier for the per-sample input shape.
model = model_d_conv1d(input_shape)
# Unused alternative optimizer, kept for reference:
# optimizer = tf.keras.optimizers.RMSprop(lr=0.000001, decay=1e-6)

print('input shape, model, optimizer loaded')

# Print the layer-by-layer summary of the model.
model.summary()

input shape, model, optimizer loaded
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 216, 32)           3872      
_________________________________________________________________
activation (Activation)      (None, 216, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 216, 32)           3104      
_________________________________________________________________
batch_normalization (BatchNo (None, 216, 32)           128       
_________________________________________________________________
activation_1 (Activation)    (None, 216, 32)           0         
_________________________________________________________________
dropout (Dropout)            (None, 216, 32)           0         
_________________________________________________________________
max_pooling1d (MaxP

In [33]:
checkpoint_path = "./models_saved/model_d_conv1d_mfcc40_undersample100_pol.h5"

# Saves the full model (not only the weights) each time the monitored metric
# improves. No monitor is passed, so the Keras default applies; the training
# log reports it as val_loss.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=False,
    save_best_only=True,
    verbose=1,  # log every save together with its epoch
)

# Stops training once val_accuracy has not improved for 5 epochs and puts the
# best weights seen so far back into the model.
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True,
)

# Halves the learning rate after 2 epochs without val_loss improvement,
# never going below 1e-6.
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    patience=2,
    factor=0.5,
    min_lr=0.000001,
    verbose=1,
)

# NOTE(review): checkpointing and LR reduction follow val_loss while early
# stopping follows val_accuracy, so the saved file and the restored in-memory
# weights can come from different epochs - confirm this is intended.
callbacks = [checkpoint_cb, early_stop_cb, reduce_lr_cb]

print('callbacks and checkpoints set')

callbacks and checkpoints set


In [34]:
batch_size = 16

# Train for up to 150 epochs; the callbacks list decides about checkpointing,
# early stopping and learning-rate reduction.
# NOTE(review): the validation data is the held-out test split, so the same
# samples drive model selection and final evaluation.
model_history = model.fit(
    X_train_final,
    y_train_hot,
    batch_size=batch_size,
    epochs=150,
    validation_data=(X_test_final, y_test_hot),
    verbose=2,
    callbacks=callbacks,
)

Epoch 1/150

Epoch 00001: val_loss improved from inf to 1.29924, saving model to ./models_saved/model_d_conv1d_mfcc40_undersample100_pol.h5
3/3 - 0s - loss: 1.1521 - accuracy: 0.3333 - val_loss: 1.2992 - val_accuracy: 0.3333
Epoch 2/150

Epoch 00002: val_loss improved from 1.29924 to 1.20781, saving model to ./models_saved/model_d_conv1d_mfcc40_undersample100_pol.h5
3/3 - 0s - loss: 0.9590 - accuracy: 0.5333 - val_loss: 1.2078 - val_accuracy: 0.3333
Epoch 3/150

Epoch 00003: val_loss did not improve from 1.20781
3/3 - 0s - loss: 0.5834 - accuracy: 0.7333 - val_loss: 1.6271 - val_accuracy: 0.3333
Epoch 4/150

Epoch 00004: val_loss did not improve from 1.20781

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
3/3 - 0s - loss: 0.3987 - accuracy: 0.9111 - val_loss: 2.6583 - val_accuracy: 0.3333
Epoch 5/150

Epoch 00005: val_loss did not improve from 1.20781
3/3 - 0s - loss: 0.3547 - accuracy: 0.8667 - val_loss: 3.3734 - val_accuracy: 0.3333
Epoch 6/150

Epoch

In [None]:


# Seed NumPy's global RNG so the permutation below is reproducible.
np.random.seed(42)

# Random ordering of all sample indices; the first 80% form the training set.
indices = np.random.permutation(len(X_full))
train_size = 0.8
len_train_set = int(len(X_full) * train_size)

# Apply the same ordering to features and labels so the rows stay paired.
X_shuffle, y_shuffle = X_full[indices], y_full[indices]

X_train, X_test = X_shuffle[:len_train_set], X_shuffle[len_train_set:]
y_train, y_test = y_shuffle[:len_train_set], y_shuffle[len_train_set:]

print('shapes after np.random.permutation splits')
for tag, part in (('X_train: ', X_train), ('y_train: ', y_train),
                  ('X_test: ', X_test), ('y_test: ', y_test)):
    print(tag, part.shape)

print('y_test header 5')
print(y_test[:5])

print('X_test header 3')
print(X_test[:3])

# combining path with features
# changing features to list
# df_path_features = pd.concat([ref_data_path,pd.DataFrame(df_features['feature'].values.tolist())],axis=1)
# df_path_features_noise = pd.concat([ref_data_path,pd.DataFrame(df_features_noise['feature'].values.tolist())],axis=1)

# df_features_all = pd.concat([df_path_features,df_path_features_noise],axis=0,sort=False) # ,df_speedpitch
# df_final = df_features_all.fillna(0)

# df_final = pd.concat([ref_data_path, pd.DataFrame(df_features)],axis=1)

# # Split between train and test 
# X_train, X_test, y_train, y_test = train_test_split(df_final.drop(['gender','emotion','label_emotion','polarity','label_polarity','path','source'],axis=1),
#                                                     df_final.polarity,
#                                                     test_size=0.25,
#                                                     shuffle=True,
#                                                     random_state=42)

# print('Shape after train_test_split')
# print('X_train: ', X_train.shape)
# print('X_test: ', X_test.shape)
# print('y_train: ', y_train.shape)
# print('y_test: ', y_test.shape)

# Data normalization
# Standardise with statistics computed over the sample axis (axis=0) of the
# training set only, and apply the same statistics to the test set.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A position that is constant across the training set has std == 0, and
# dividing by it would fill the arrays with NaN/inf. Using 1 there keeps the
# centred values finite.
std[std == 0] = 1.0

# Alternative (not used): normalize each sample with its own statistics
# mean = np.mean(np.reshape(X_train, (X_train.shape[0], -1)), axis=1)
# std = np.std(np.reshape(X_train, (X_train.shape[0], -1)), axis=1)

X_train = (X_train - mean)/std
X_test = (X_test - mean)/std

print('Shape after data normalization for X_ only')
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

# one hot encode the target
# The encoder is fitted on the training labels only and then reused for the
# test labels. Refitting on the test labels could assign different ids when
# the test split does not contain exactly the same classes.
lb = LabelEncoder()
y_train_ids = lb.fit_transform(y_train)
y_test_ids = lb.transform(y_test)

# Fix the one-hot width to the number of classes seen in training, so both
# matrices have the same number of columns even if a class is absent from
# the test split.
num_classes = len(lb.classes_)
y_train = tf.keras.utils.to_categorical(y_train_ids, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test_ids, num_classes=num_classes)

print('Shape after one hot encode (for y_ only)')
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

print('y_test top 5', y_test[:5])

# Persist y_train, y_test and the label encoder object (lb) for future use,
# one pickle file each, written in this order.
for pkl_path, payload in (
    ('./Data_Array_Storage/y_train_mfcc40_axis0_pol.pkl', y_train),
    ('./Data_Array_Storage/y_test_mfcc40_axis0_pol.pkl', y_test),
    ('./Data_Array_Storage/labels_mfcc40_axis0_pol.pkl', lb),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# expanding X_train and X_test dimensions
# no need to do this for conv1d
# X_train = np.expand_dims(X_train, axis=-1)
# X_test = np.expand_dims(X_test, axis=-1)

# print('Shape after X dimension expansion')
# print('X_train: ', X_train.shape)
# print('X_test: ', X_test.shape)
# print('y_train: ', y_train.shape)
# print('y_test: ', y_test.shape)

# Persist X_train and X_test, one pickle file each.
for pkl_path, payload in (
    ('./Data_Array_Storage/X_train_mfcc40_axis0_pol.pkl', X_train),
    ('./Data_Array_Storage/X_test_mfcc40_axis0_pol.pkl', X_test),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# Final shape report for all four arrays.
print('Pickle files saved. Final shpaes:')
for tag, part in (('X_train: ', X_train), ('X_test: ', X_test),
                  ('y_train: ', y_train), ('y_test: ', y_test)):
    print(tag, part.shape)