In [None]:
import pickle
import logging
import models
import lib
import resources
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from my_classes import DataGenerator
import os

In [None]:
import keras
from keras.applications import vgg16, resnet50, inception_v3
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding, Flatten, Lambda
from keras import layers, models
from keras import backend as K

In [None]:
# Show the installed Keras version as the cell output.
keras.__version__

In [None]:
def extract():
    """
    Run the raw-data extraction step for every data partition.

    For each of the 'training', 'test' and 'validation' partitions the three
    project helpers are called in this order:
    Image - ``lib.extract_images`` with ``num_frames=10``
    Audio - ``lib.extract_audio``
    Text - ``lib.extract_text``
    :return: None
    """
    partitions = ('training', 'test', 'validation')

    for split in partitions:
        # Frames first, then the audio track, then the transcript text
        lib.extract_images(split, num_frames=10)
        lib.extract_audio(split)
        lib.extract_text(split)

In [None]:
# extract()  # Disabled; uncomment to re-run the raw-data extraction step.

In [None]:
def transform(image=False, audio=False, text=True):
    """
    Transform extracted data into model-ready features for every partition.

    With the default arguments only the text transform runs, which is what
    this function did before the flags were added (the image and audio calls
    were commented out).

    Image - ``lib.transform_images(partition=..., num_frames=10)``
    Audio - ``lib.transform_audio(partition=..., n_mfcc=13)``
    Text - ``lib.transform_text(partition=..., word_to_index=...)`` using the
    word index returned by ``resources.create_embedding_matrix()``

    :param image: Whether or not to run the image transform on this run
    :param audio: Whether or not to run the audio transform on this run
    :param text: Whether or not to run the text transform on this run
    :return: None
    """
    word_to_index = None
    if text:
        # Only the word -> index lookup is needed here; the embedding matrix
        # itself is consumed later by the text model.
        _, word_to_index = resources.create_embedding_matrix()

    for partition in ('training', 'test', 'validation'):

        if image:
            # Transform raw jpegs into numpy arrays
            lib.transform_images(partition=partition, num_frames=10)

        if audio:
            # Transform raw audio to feature matrix
            lib.transform_audio(partition=partition, n_mfcc=13)

        if text:
            # Transform text to tokens
            lib.transform_text(partition=partition, word_to_index=word_to_index)

In [None]:
# Run the transform step; as called here, only the text transform is executed.
transform()

# Models (saving only the weights)

In [None]:
def image_lrcn(l_phase=1):
    """
    Build and compile the VGG16 + LSTM video model.

    Every frame of the input clip goes through an ImageNet-pretrained VGG16
    (no classification head, global max pooling). The per-frame feature
    vectors are then fed to an LSTM so the temporal aspect of the video is
    taken into account, followed by dropout and a 6-unit dense output layer.
    :param l_phase: value passed to ``K.set_learning_phase`` before the graph
        is built (Keras convention: 0 = test, 1 = train); defaults to 1
    :return: compiled keras model object
    """

    # Fix the Keras learning phase before any layer is created
    K.set_learning_phase(l_phase)

    # Clip input: variable number of 224x224 RGB frames
    clip_input = layers.Input(shape=(None, 224, 224, 3), name='video_input')

    # Pretrained VGG16 used as the per-frame feature extractor
    backbone = vgg16.VGG16(weights="imagenet", include_top=False, pooling='max')
    backbone.trainable = True

    # The backbone is applied frame by frame via a Lambda inside TimeDistributed.
    # NOTE(review): because the CNN is only called from a Lambda, confirm that its
    # weights are actually tracked/trained by the outer model.
    frame_encoder = layers.TimeDistributed(Lambda(lambda frames: backbone(frames)))
    frame_features = frame_encoder(clip_input)

    # Temporal model over the frame features, light dropout, 6 regression outputs
    clip_features = layers.LSTM(64)(frame_features)
    clip_features = layers.Dropout(.05)(clip_features)
    predictions = layers.Dense(6, activation='relu')(clip_features)

    lrcn = Model(inputs=[clip_input], outputs=predictions)
    optimizer = keras.optimizers.Adam(lr=0.0005, decay=0.001)
    lrcn.compile(optimizer=optimizer, loss='mean_squared_error')

    return lrcn




In [None]:
def image_lrcn_resnet50():
    """
    Build and compile the ResNet50 + LSTM video model.

    Every frame of the input clip goes through an ImageNet-pretrained ResNet50
    (no classification head, global max pooling). The per-frame feature
    vectors are then fed to an LSTM so the temporal aspect of the video is
    taken into account, followed by dropout and a 6-unit dense output layer.
    :return: compiled keras model object
    """

    # The learning phase is intentionally left untouched here
    #K.set_learning_phase(0)

    # Clip input: variable number of 224x224 RGB frames
    clip_input = layers.Input(shape=(None, 224, 224, 3), name='video_input')

    # Pretrained ResNet50 used as the per-frame feature extractor
    backbone = resnet50.ResNet50(weights="imagenet", include_top=False, pooling='max')
    backbone.trainable = True

    # The backbone is applied frame by frame via a Lambda inside TimeDistributed.
    # NOTE(review): because the CNN is only called from a Lambda, confirm that its
    # weights are actually tracked/trained by the outer model.
    frame_encoder = layers.TimeDistributed(Lambda(lambda frames: backbone(frames)))
    frame_features = frame_encoder(clip_input)

    # Temporal model over the frame features, light dropout, 6 regression outputs
    clip_features = layers.LSTM(64)(frame_features)
    clip_features = layers.Dropout(.05)(clip_features)
    predictions = layers.Dense(6, activation='relu')(clip_features)

    lrcn = Model(inputs=[clip_input], outputs=predictions)
    optimizer = keras.optimizers.Adam(lr=0.0005, decay=0.001)
    lrcn.compile(optimizer=optimizer, loss='mean_squared_error')

    return lrcn

In [None]:
def image_lrcn_inceptionv3():
    """
    Build and compile the InceptionV3 + LSTM video model.

    Every frame of the input clip goes through an ImageNet-pretrained
    InceptionV3 (no classification head, global max pooling). The per-frame
    feature vectors are then fed to an LSTM so the temporal aspect of the video
    is taken into account, followed by dropout and a single linear output unit.
    :return: compiled keras model object
    """

    # The learning phase is intentionally left untouched here
    #K.set_learning_phase(0)

    # Clip input: variable number of 224x224 RGB frames
    clip_input = layers.Input(shape=(None, 224, 224, 3), name='video_input')

    # Pretrained InceptionV3 used as the per-frame feature extractor
    backbone = inception_v3.InceptionV3(weights="imagenet", include_top=False, pooling='max')
    backbone.trainable = True

    # The backbone is applied frame by frame via a Lambda inside TimeDistributed.
    # NOTE(review): because the CNN is only called from a Lambda, confirm that its
    # weights are actually tracked/trained by the outer model.
    frame_encoder = layers.TimeDistributed(Lambda(lambda frames: backbone(frames)))
    frame_features = frame_encoder(clip_input)

    # Temporal model over the frame features, light dropout, one linear output.
    # NOTE(review): the VGG16 and ResNet50 variants in this notebook use a 6-unit
    # relu head instead; confirm the single-output head here is intended.
    clip_features = layers.LSTM(64)(frame_features)
    clip_features = layers.Dropout(.05)(clip_features)
    predictions = layers.Dense(1, activation='linear')(clip_features)

    lrcn = Model(inputs=[clip_input], outputs=predictions)
    optimizer = keras.optimizers.Adam(lr=0.0005, decay=0.001)
    lrcn.compile(optimizer=optimizer, loss='mean_squared_error')

    return lrcn

In [None]:
def text_lstm_model(embedding_matrix):
    """
    Build and compile a bidirectional LSTM regression model on top of a frozen
    embedding layer.
    :param embedding_matrix: An embedding matrix, with shape (n,m), where n is the number of words, and m is the
    dimensionality of the embedding
    :return: compiled keras model object
    """

    # Rows = number of words in the lookup index, columns = embedding dimensionality
    vocab_size = embedding_matrix.shape[0]
    embedding_dim = embedding_matrix.shape[1]

    # Every input sequence is expected to hold exactly this many token indices
    sequence_length = 80

    summary_template = 'embedding_input_dim: {}, embedding_output_dim: {}, embedding_input_length: {}'
    print(summary_template.format(vocab_size, embedding_dim, sequence_length))

    # Embedding layer initialised from the given matrix and kept frozen
    frozen_embedding = Embedding(input_dim=vocab_size,
                                 output_dim=embedding_dim,
                                 weights=[embedding_matrix],
                                 input_length=sequence_length,
                                 trainable=False)

    # Architecture: embedding -> dropout -> BiLSTM -> dropout -> dense -> dropout -> linear output
    token_ids = keras.Input(shape=(sequence_length,), dtype='int32')
    hidden = frozen_embedding(token_ids)
    hidden = Dropout(.5)(hidden)
    hidden = Bidirectional(LSTM(64))(hidden)
    hidden = Dropout(.5)(hidden)
    hidden = Dense(units=64, activation='relu')(hidden)
    hidden = Dropout(.5)(hidden)
    score = Dense(units=1, activation='linear')(hidden)

    # Compile architecture
    compiled_model = Model(token_ids, score)
    compiled_model.compile(loss='mse', optimizer='adam')

    return compiled_model

In [None]:
def _print_sample_predictions(keras_model):
    """Print the model's predictions for the first two saved test clips."""
    for sample_path in ('../data/image_data/npy_files/test_data/0.npy',
                        '../data/image_data/npy_files/test_data/1.npy'):
        print(keras_model.predict(np.load(sample_path)))


def _train_image_model():
    """Train the VGG16 + LSTM image model for one epoch and save its weights."""
    # Generator parameters
    params = {'dim': (10, 224, 224),
              'batch_size': 8,
              'n_channels': 3,
              'shuffle': True}

    # Load label sets (the validation partition is used for validation_data)
    with open('../data/image_data/pickle_files/y_5d_training_all.pkl', 'rb') as file:
        training_labels = pickle.load(file)
    with open('../data/image_data/pickle_files/y_5d_validation_all.pkl', 'rb') as file:
        validation_labels = pickle.load(file)

    # Generators
    training_generator = DataGenerator(partition='training',
                                       list_IDs=range(6000),
                                       labels=training_labels, **params)
    validation_generator = DataGenerator(partition='validation',
                                         list_IDs=range(2000),
                                         labels=validation_labels, **params)

    # Create model; to resume from earlier weights, load them here, e.g.
    # image_model.load_weights('../output/image_model_vgg.h5')
    image_model = image_lrcn()

    # Predictions on two sample clips before training, for comparison
    _print_sample_predictions(image_model)

    # Train natively, i.e. without multiprocessing
    image_model.fit_generator(generator=training_generator,
                              validation_data=validation_generator,
                              use_multiprocessing=False,
                              workers=1,
                              epochs=1)

    # Same two sample clips after training
    _print_sample_predictions(image_model)

    image_model.save_weights('../output/image_model_vgg_all.h5')


def _train_audio_model():
    """Fit the audio model on training + validation data and pickle it."""
    # `models` is rebound to keras.models by `from keras import layers, models`
    # in the import cells, so the project-local module is imported explicitly.
    import models as project_models

    target_columns = ['interview_score', 'extraversion', 'agreeableness',
                      'conscientiousness', 'neuroticism', 'openness']

    # Read in audio data
    training_set = pd.read_csv('../data/audio_data/pickle_files/training_df.csv')
    validation_set = pd.read_csv('../data/audio_data/pickle_files/validation_df.csv')

    # Concat data sets in order to use all data for CV
    all_data = pd.concat((training_set, validation_set), axis=0)
    X_all = all_data.drop(target_columns + ['video_id'], axis=1)
    # Selecting several columns needs a list; a bare tuple of names raises KeyError
    y_all = all_data[target_columns]

    logging.info('Start training audio model')

    # Create model and fit to data
    audio_model = project_models.audio_rand_forest()
    audio_model.fit(X_all, y_all)

    # NOTE(review): best_params_ / cv_results_ assume audio_rand_forest() returns a
    # scikit-learn search object, and 'mean_train_score' is only present when the
    # search was created with return_train_score=True — confirm in models.py.
    logging.info(audio_model.best_params_)
    logging.info('Train score with best estimator: {}'.format(max(audio_model.cv_results_['mean_train_score'])))
    logging.info('Validation score with best estimator: {}'.format(max(audio_model.cv_results_['mean_test_score'])))

    # Save to disk
    with open('../output/audio_model.pkl', 'wb') as fid:
        pickle.dump(audio_model, fid)


def _train_text_model():
    """Fit the text LSTM, checkpointing the best model by validation loss."""
    # Load in word embeddings
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    print(len(word_to_index))

    # Load text data (the validation partition is used for validation_data)
    with open('../data/text_data/pickle_files/X_training.pkl', 'rb') as file:
        X_train = pickle.load(file)
    with open('../data/text_data/pickle_files/y_training.pkl', 'rb') as file:
        y_train = pickle.load(file)
    with open('../data/text_data/pickle_files/X_validation.pkl', 'rb') as file:
        X_val = pickle.load(file)
    with open('../data/text_data/pickle_files/y_validation.pkl', 'rb') as file:
        y_val = pickle.load(file)

    print(X_val)

    # Create model object and fit; only the best epoch (lowest val_loss) is kept on disk
    text_model = text_lstm_model(embedding_matrix=embedding_matrix)
    filename = '../output/text_model_2.h5'
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    text_model.fit(X_train, y_train,
                   batch_size=16, epochs=55,
                   validation_data=(X_val, y_val),
                   callbacks=[checkpoint],
                   shuffle=True)


def model(image=True, audio=False, text=False):
    """
    Train the selected models and write their artifacts under ../output.

    Image - VGG16 + LSTM, one epoch, weights saved to image_model_vgg_all.h5
    Audio - model from the project ``models`` module, pickled to audio_model.pkl
    Text - bidirectional LSTM, best checkpoint saved to text_model_2.h5
    :param image: Whether or not to train the image model on this run
    :param audio: Whether or not to train the audio model on this run
    :param text: Whether or not to train the text model on this run
    :return: None
    """
    if image:
        _train_image_model()

    if audio:
        _train_audio_model()

    if text:
        _train_text_model()

In [None]:
# Train with the default flags: image model only (audio and text are off by default).
model()

In [None]:
# Every non-feature column the audio CSVs may contain; the audio model is
# trained without any of them, so prediction must drop the same set.
_ENSEMBLE_AUDIO_NON_FEATURES = ['interview_score', 'extraversion', 'agreeableness',
                                'conscientiousness', 'neuroticism', 'openness', 'video_id']


def _ensemble_load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as file:
        return pickle.load(file)


def _ensemble_first_column(values):
    """Return column 0 of a 2-D array-like, or the values unchanged if 1-D."""
    arr = np.asarray(values)
    return arr[:, 0] if arr.ndim > 1 else arr


def _ensemble_rmse(y_true, y_pred):
    """Root mean squared error between two equally long sequences."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def ensemble():
    """
    Combine the image, audio and text model predictions with an OLS model.

    The three trained models are loaded from ../output, used to predict the
    interview score for the training, test and validation partitions, and the
    per-model predictions are merged on the video id. A linear regression is
    fitted on the training predictions, scored (RMSE) on all three partitions
    and pickled to ../output/ensemble_model.pkl.

    Only the interview score is ensembled: where labels or predictions carry
    several targets per video, column 0 is used, which is the interview score
    in the label matrices written by this notebook.
    :return: None
    """
    partitions = ['training', 'test', 'validation']
    # Number of clips handed to the image generator for each partition
    n_videos = {'training': 6000, 'test': 2000, 'validation': 2000}
    # Short names used in the printed score lines
    set_names = {'training': 'training', 'test': 'test', 'validation': 'val'}

    img_dir = '../data/image_data/pickle_files/'
    aud_dir = '../data/audio_data/pickle_files/'
    text_dir = '../data/text_data/pickle_files/'

    print('Begin Ensemble model building, loading models')

    # Load models (learning phase 0 for the image model: inference only)
    image_model = image_lrcn(0)
    image_model.load_weights('../output/image_model_vgg_all.h5')
    audio_model = _ensemble_load_pickle('../output/audio_model.pkl')
    # NOTE(review): model() checkpoints the text model to '../output/text_model_2.h5';
    # confirm that 'text_model.h5' is the file intended here.
    text_model = load_model('../output/text_model.h5')

    print('Load data files')

    img_labels, img_ids, img_generators = {}, {}, {}
    aud_features, aud_ids = {}, {}
    text_features, text_ids = {}, {}
    for partition in partitions:
        # Image labels, video ids and (unshuffled) frame generators
        img_labels[partition] = _ensemble_load_pickle(img_dir + 'y_5d_{}_all.pkl'.format(partition))
        img_ids[partition] = _ensemble_load_pickle(img_dir + 'vid_ids_5d_{}.pkl'.format(partition))
        img_generators[partition] = DataGenerator(partition=partition,
                                                  list_IDs=range(n_videos[partition]),
                                                  labels=img_labels[partition], batch_size=8,
                                                  n_channels=3, dim=(10, 224, 224),
                                                  shuffle=False)

        # Audio features; errors='ignore' keeps CSVs that hold only some of the label columns usable
        aud_frame = pd.read_csv(aud_dir + '{}_df.csv'.format(partition))
        aud_features[partition] = aud_frame.drop(_ENSEMBLE_AUDIO_NON_FEATURES, axis=1, errors='ignore')
        aud_ids[partition] = aud_frame['video_id']

        # Text features and video ids
        text_features[partition] = _ensemble_load_pickle(text_dir + 'X_{}.pkl'.format(partition))
        text_ids[partition] = _ensemble_load_pickle(text_dir + 'vid_ids_{}.pkl'.format(partition))

    print('Getting predictions for all 3 models')

    img_dfs, aud_dfs, text_dfs = {}, {}, {}
    for partition in partitions:
        img_preds = image_model.predict_generator(img_generators[partition])
        img_dfs[partition] = pd.DataFrame({'img_preds': _ensemble_first_column(img_preds),
                                           'video_ids': img_ids[partition],
                                           'interview_score': _ensemble_first_column(img_labels[partition])})
        aud_preds = audio_model.predict(aud_features[partition])
        aud_dfs[partition] = pd.DataFrame({'aud_preds': _ensemble_first_column(aud_preds),
                                           'video_ids': aud_ids[partition]})
        text_preds = text_model.predict(text_features[partition])
        text_dfs[partition] = pd.DataFrame({'text_preds': _ensemble_first_column(text_preds),
                                            'video_ids': text_ids[partition]})

    print('Merge predictions together into single data frame')

    # Inner joins: only videos present for all three modalities are kept
    merged = {}
    for partition in partitions:
        merged[partition] = (img_dfs[partition]
                             .merge(aud_dfs[partition], on='video_ids')
                             .merge(text_dfs[partition], on='video_ids'))

    # Score the individual models and print the scores to screen
    for model_name, pred_col in [('Image', 'img_preds'), ('Audio', 'aud_preds'), ('Text', 'text_preds')]:
        for partition in partitions:
            score = _ensemble_rmse(merged[partition]['interview_score'], merged[partition][pred_col])
            print('{} score on the {} set: {}'.format(model_name, set_names[partition], score))

    # Split target variable and features
    feature_cols = ['img_preds', 'aud_preds', 'text_preds']
    X = {partition: merged[partition][feature_cols] for partition in partitions}
    y = {partition: merged[partition][['interview_score']] for partition in partitions}

    print('Build OLS model to combine model outputs')

    # Build OLS model on the training predictions only
    ols_model = LinearRegression()
    ols_model.fit(X['training'], y['training'])

    # Score model
    for partition in partitions:
        score = _ensemble_rmse(y[partition], ols_model.predict(X[partition]))
        print('OLS Score on {} set: {}'.format(set_names[partition], score))

    # Save model
    with open('../output/ensemble_model.pkl', 'wb') as fid:
        pickle.dump(ols_model, fid)

    logging.info('Ensemble model saved')

    return

In [None]:
# Build, score and save the OLS ensemble; reads the model files under ../output.
ensemble()

# Extracting targets from annotation files

In [None]:
# Load the validation-partition annotation pickle for inspection.
# encoding='latin1' is passed to pickle.load — presumably the file was written by Python 2; TODO confirm.
with open('../data/meta_data/annotation_validation.pkl','rb') as f:
        label_file = pickle.load(f, encoding='latin1')
        # Trait keys read from this file later in the notebook:
        # extraversion, agreeableness, conscientiousness, neuroticism and openness.

In [None]:
# Inspect the 'openness' entry of the loaded annotations (indexed by '<video id>.mp4' elsewhere in this notebook).
label_file['openness']

In [None]:
# Image data: build and save the (n_videos, 6) target matrix for every partition.
# Column order of the saved matrix: interview score first, then the five traits.
IMAGE_TARGET_KEYS = ['interview', 'extraversion', 'agreeableness',
                     'conscientiousness', 'neuroticism', 'openness']

for partition in ['training', 'validation', 'test']:
    with open('../data/meta_data/annotation_{}.pkl'.format(partition), 'rb') as f:
        label_file = pickle.load(f, encoding='latin1')

    # One directory entry per video; annotations are keyed by '<entry name>.mp4'.
    # NOTE(review): os.listdir returns entries in arbitrary order, so the row order
    # of y follows that order — confirm it matches the ids used by DataGenerator.
    vid_ids = os.listdir('../data/image_data/{}_data'.format(partition))

    # Fill the matrix one target column at a time
    y = np.zeros((len(vid_ids), len(IMAGE_TARGET_KEYS)))
    for col, key in enumerate(IMAGE_TARGET_KEYS):
        y[:, col] = [label_file[key][vid_id + '.mp4'] for vid_id in vid_ids]

    with open('../data/image_data/pickle_files/y_5d_{}_all.pkl'.format(partition), 'wb') as output:
        pickle.dump(y, output, protocol=4)

In [None]:
# Audio data: attach the six target columns and the video id to the audio
# feature matrix of every partition and save the result as CSV.
# (data frame column, annotation key) pairs, in the order the columns are added
AUDIO_TARGET_COLUMNS = [('interview_score', 'interview'),
                        ('extraversion', 'extraversion'),
                        ('agreeableness', 'agreeableness'),
                        ('conscientiousness', 'conscientiousness'),
                        ('neuroticism', 'neuroticism'),
                        ('openness', 'openness')]

for partition in ['training', 'validation', 'test']:
    with open('../data/meta_data/annotation_{}.pkl'.format(partition), 'rb') as f:
        label_file = pickle.load(f, encoding='latin1')

    # Strip the '.wav' extension to get the video stem; annotations are keyed by '<stem>.mp4'
    audio_files = [name.split('.wav')[0] for name in os.listdir('../data/audio_data/{}_data'.format(partition))]
    id_array = [stem + '.mp4' for stem in audio_files]

    # NOTE(review): audio_matrix and cols are not defined anywhere in the visible
    # notebook, so this cell raises NameError on a fresh kernel. They must hold this
    # partition's feature matrix (rows in the same order as audio_files) and its
    # column names — confirm where they come from.
    audio_df = pd.DataFrame(audio_matrix, columns=cols)
    for column, key in AUDIO_TARGET_COLUMNS:
        audio_df[column] = np.array([label_file[key][video_id] for video_id in id_array])

    audio_df['video_id'] = id_array

    audio_df.to_csv('../data/audio_data/pickle_files/{}_df.csv'.format(partition), index=False)