In [1]:
%matplotlib inline

In [2]:
import os
import glob

import numpy as np
import pandas as pd

import tensorflow as tf

from typing import *
from collections import Counter

import sklearn
from sklearn import model_selection


In [64]:
import random
seed = random.randint(1, 500)
random.seed(seed)
test1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape(3, 3)
test2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape(3, 3)
random.shuffle(test1)
random.seed(seed)
random.shuffle(test2)
print(test1, test2)

[[1 2 3]
 [4 5 6]
 [4 5 6]] [[1 2 3]
 [4 5 6]
 [4 5 6]]


In [10]:
# Note: this is a slightly modified version of the code from the preprocessing_pipeline.py 
# script that can stand alone in a notebook

# to skip scaling... 

def get_text_data(input_dir):
    """
    Purpose: preprocess liwc and nrc
    Input
        input_dir {string} : path to input_directory (ex, "~/Train")
    Output:
        id_list {numpy array of strings}: array of user ids sorted alphabetically,
                                        to determine order of features and labels DataFrames
        text_data {pandas DataFrame of float}: unscaled text data (liwc and nrc combined)
    """
    # Load and sort text data
    liwc = pd.read_csv(os.path.join(input_dir, 'Text/liwc.csv'), sep = ',')
    liwc = liwc.sort_values(by=['userId'])

    nrc = pd.read_csv(os.path.join(input_dir, 'Text/nrc.csv'), sep = ',')
    nrc = nrc.sort_values(by=['userId'])

    # Build list of subject ids ordered alphabetically
    # Check if same subject lists in both sorted DataFrames (liwc and nrc)
    if np.array_equal(liwc['userId'], nrc['userId']):
        id_list = liwc['userId'].to_numpy()
    else:
        raise Exception('userIds do not match between liwc and nrc data')

    # merge liwc and nrc DataFrames using userId as index
    liwc.set_index('userId', inplace=True)
    nrc.set_index('userId', inplace=True)

    text_data = pd.concat([liwc, nrc], axis=1, sort=False)

    return id_list, text_data


def get_image_clean(sub_ids, oxford, means):
    '''
    Purpose: preprocess oxford metrics derived from profile pictures (part 2)
    Input:
        sub_ids {numpy array of strings}: ordered list of userIDs
        oxford {pandas DataFrame of floats}: unscaled oxford features of users with 1+ face
        means {list of float}: mean values for each feature averaged from train set,
                    to replace missing values for userids with no face (train and test set)
    Output:
        image_data {pandas DataFrame of float}: unscaled oxford image data
                with mean values replacing missing entries
    '''
    # list of ids with at least one face on image: 7174 out of 9500 in train set
    ox_list = np.sort(oxford['userId'].unique(), axis=None)
    # list of ids in text_list who have no face metrics in oxford.csv (2326 in train set)
    ox_noface = np.setdiff1d(sub_ids, ox_list)

    # Create DataFrame for userids with no face (1 row per userid)
    # values are mean metrics averaged from users with entries (training set)
    ox_nf = pd.DataFrame(ox_noface, columns = ['userId'])
    columns = oxford.columns[2:].tolist()
    for column, mean in zip(columns, means):
        ox_nf.insert(loc=ox_nf.shape[1], column=column, value=mean, allow_duplicates=True)
    # insert column 'noface' = 1 if no face in image, else 0
    ox_nf.insert(loc=ox_nf.shape[1], column='noface', value=1, allow_duplicates=True)
    # insert column 'multiface' = 1 if many faces in image, else 0
    ox_nf.insert(loc=ox_nf.shape[1], column='multiface', value=0, allow_duplicates=True)
    ox_nf.set_index('userId', inplace=True)

    # Format DataFrame from userids with 1+ face
    # insert column 'noface' = 1 if no face in image, else 0
    oxford.insert(loc=oxford.shape[1], column='noface', value=0, allow_duplicates=True)
    # list userIds with multiple faces (714 in train set)
    ox_multiples = oxford['userId'][oxford['userId'].duplicated()].tolist()
    # insert column 'multiface' = 1 if many faces in image, else 0
    oxford.insert(loc=oxford.shape[1], column='multiface', value=0, allow_duplicates=True)
    multi_mask = pd.Series([uid in ox_multiples for uid in oxford['userId']])
    i = oxford[multi_mask].index
    oxford.loc[i, 'multiface'] = 1
    # drop duplicate entries with same userId (keep first entry per userId)
    oxford.drop_duplicates(subset ='userId', keep='first', inplace=True)

    # merge the two DataFrames
    oxford.drop(['faceID'], axis=1, inplace=True)
    oxford.set_index('userId', inplace=True)
    image_data = pd.concat([ox_nf, oxford], axis=0, sort=False).sort_values(by=['userId'])

    if not np.array_equal(image_data.index, sub_ids):
        raise Exception('userIds do not match between oxford file and id list')

    return image_data


def get_image_raw(data_dir):
    '''
    Purpose: preprocess oxford metrics derived from profile pictures (part 1)
    Input
        input_dir {string} : path to input_directory (ex, "~/Train")
    Output:
        image_data {pandas DataFrame of float}: unscaled oxford image data
    '''
    # Load data of oxford features extracted from profile picture (face metrics)
    # 7915 entries; some users have no face, some have multiple faces on image.
    # userids with 1+ face on image: 7174 out of 9500 (train set)
    # duplicated entries (userids with > 1 face on same image): 741 in train set
    oxford = pd.read_csv(os.path.join(data_dir, "Image", "oxford.csv"), sep = ',')
    #oxford = oxford.sort_values(by=['userId'])
    '''
    NOTE: headPose_pitch has NO RANGE, drop that feature
    '''
    oxford.drop(['headPose_pitch'], axis=1, inplace=True)

    return oxford


def get_likes_kept(data_dir, num_features) -> List[str]:
    '''
    Purpose: get list of likes to keep as features
    Input:
        data_dir {str} : the parent input directory
        num_features {int} : the number of likes to keep as features,
                        starting from those with highest frequencies
    Output:
        freq_like_id {List of strings}: frequency of most frequent likes,
                    (number = num_features), in descending ordered, indexed by like_id
    '''
    #Why return frequency?
    relation = pd.read_csv(os.path.join(data_dir, "Relation", "Relation.csv")) #, index_col=1)
    relation = relation.drop(['Unnamed: 0'], axis=1)
    like_ids_to_keep = relation['like_id'].value_counts(sort=True, ascending=False)[:num_features] #This sorts features by frequency

    #sort like indices (which are the keys associated with the values kepts)
    likes_int64_list = sorted(like_ids_to_keep.keys()) # This sorts indices by like_id
    likes_str_list = [str(l) for l in likes_int64_list]
    return likes_str_list


def get_relations(data_dir: str, sub_ids: List[str], like_ids_to_keep: List[str]):
    '''
    Purpose: preprocess relations dataset ('likes')

    Input:
        data_dir {str} -- the parent input directory
        sub_ids {numpy array of strings} -- the ordered list of userids
        like_ids_to_keep {List[str]} -- The list of page IDs to keep.

    Returns:
        relations_data -- multihot matrix of the like_id. Rows are indexed with userid, entries are boolean.
    '''
    relation = pd.read_csv(os.path.join(data_dir, "Relation", "Relation.csv")) #, index_col=1)
    relation = relation.drop(['Unnamed: 0'], axis=1)

    ## One HUGE step:
    # likes_to_keep = like_ids_to_keep.keys()
    # kept_relations = relation[relation.like_id.isin(likes_to_keep)]
    # multi_hot_relations = pd.get_dummies(kept_relations, columns=["like_id"], prefix="")
    # multi_hot = multi_hot_relations.groupby(("userid")).sum()
    # return multi_hot_relations
    ###
    total_num_pages = len(like_ids_to_keep)
    # Create a multihot likes matrix of booleans (rows = userids, cols = likes), by batch
    batch_size = 1000

    # Create empty DataFrame with sub_ids as index list
    relation_data = pd.DataFrame(sub_ids, columns = ['userid'])
    relation_data.set_index('userid', inplace=True)

    for start_index in range(0, total_num_pages, batch_size):
        end_index = min(start_index + batch_size, total_num_pages)

        # sets are better for membership testing than lists.
        like_ids_for_this_batch = set(like_ids_to_keep[start_index:end_index])

        filtered_table = relation[relation['like_id'].isin(like_ids_for_this_batch)]
        ## THIS is the slow part:
        relHot = pd.get_dummies(filtered_table, columns=['like_id'], prefix="", prefix_sep="")
        ##
        relHot = relHot.groupby(['userid']).sum().astype(float) # this makes userid the index

        relation_data = pd.concat([relation_data, relHot], axis=1, sort=True)

    relation_data = relation_data.reindex(like_ids_to_keep, axis=1)
    relation_data.fillna(0.0, inplace=True)
    relation_data = relation_data.astype("bool")

    # will be different if users in relation.csv are not in sub_ids
    if not np.array_equal(relation_data.index, sub_ids):
        raise Exception(f"""userIds do not match between relation file and id list:
    {relation_data.index}
    {sub_ids}

    """)

    return relation_data


def make_label_dict(labels):
    '''
    Purpose: make dictionnary of labels from pandas DataFrame
    Input:
        labels {pandas DataFrame}: labels ordered per userids (alphabetical order)
    Output:
        labels_dict {dictionary of pandas DataFrames}: labels (one entry per metric) ordered alphabetically
                by userid for the training set, with userids as index.

    '''
    gender = labels['gender']

    age_grps = labels[['age_xx_24', 'age_25_34', 'age_35_49', 'age_50_xx']]

    '''
    Note: : each DataFrames (value) is indexed by userid in labels_dict
    '''
    labels_dict = {}
    labels_dict['userid'] = labels.index
    labels_dict['gender'] = gender
    labels_dict['age_grps'] = age_grps
    labels_dict['ope'] = labels['ope']
    labels_dict['con'] = labels['con']
    labels_dict['ext'] = labels['ext']
    labels_dict['agr'] = labels['agr']
    labels_dict['neu'] = labels['neu']

    return labels_dict


def preprocess_labels(data_dir, sub_ids):
    '''
    Purpose: preprocess entry labels from training set
    Input:
        datadir {string} : path to training data directory
        sub_ids {numpy array of strings}: list of subject ids ordered alphabetically
    Output:
        labels {pandas DataFrame}: labels ordered by userid (alphabetically)
                for the training set, with userids as index.

    '''
    labels = pd.read_csv(os.path.join(data_dir, "Profile", "Profile.csv"))

    def age_group_id(age_str: str) -> int:
        """Returns the age group category ID (an integer from 0 to 3) for the given age (string)

        Arguments:
            age_str {str} -- the age

        Returns:
            int -- the ID of the age group: 0 for xx-24, 1 for 25-34, 2 for 35-49 and 3 for 50-xx.
        """
        age = int(age_str)
        if age <= 24:
            return 0
        elif age <= 34:
            return 1
        elif age <= 49:
            return 2
        else:
            return 3

    labels = labels.assign(age_group = lambda dt: pd.Series([age_group_id(age_str) for age_str in dt["age"]]))
    # labels = labels.assign(age_xx_24 = lambda dt: pd.Series([int(age) <= 24 for age in dt["age"]]))
    # labels = labels.assign(age_25_34 = lambda dt: pd.Series([25 <= int(age) <= 34 for age in dt["age"]]))
    # labels = labels.assign(age_35_49 = lambda dt: pd.Series([35 <= int(age) <= 49 for age in dt["age"]]))
    # labels = labels.assign(age_50_xx = lambda dt: pd.Series([50 <= int(age) for age in dt["age"]]))

    labels = labels.sort_values(by=['userid'])
    # check if same subject ids in labels and sub_ids
    if not np.array_equal(labels['userid'].to_numpy(), sub_ids):
        raise Exception('userIds do not match between profiles labels and id list')
    
    labels = labels.drop(['Unnamed: 0'], axis=1)
    labels.set_index('userid', inplace=True)

    return labels


def preprocess_train(data_dir, num_likes=10_000, scaling=True):
    '''
    Purpose: preprocesses training dataset (with labels) and returns scaled features,
    labels and parameters to scale the test data set
    Input
        data_dir {string}: path to ~/Train data directory
        num_likes {int}: number of like_ids to keep as features
        scaling {boolean}: if True, Robust scaling applied to data; no scaling if False                
    Output:
        train_features {pandas DataFrame}: vectorized features scaled between 0 and 1
                for each user id in the training set, concatenated for all modalities
                (order = text + image + relation), with userid as DataFrame index.
        **(updated:)features_q10_q90 {tupple of 2 pandas Series}: series of 10th and 90th quantile values of
                text + image features from train dataset, to be used to scale test data.
                Note that the multihot relation features do not necessitate scaling.
        image_means {list of float}: means from oxford dataset to replace missing entries in oxford test set
        likes_kept {list of strings}: ordered likes_ids to serve as columns for test set relation features matrix
        train_labels {pandas DataFrame}: labels ordered by userid (alphabetically)
                for the training set, with userids as index.

    TO CONSIDER: convert outputted pandas to tensorflow tf.data.Dataset...
    https://www.tensorflow.org/guide/data
    '''
    # sub_ids: a numpy array of subject ids ordered alphabetically.
    # text_data: a pandas DataFrame of unscaled text data (liwc and nrc)
    sub_ids, text_data = get_text_data(data_dir)
    # image_data: pandas dataframe of oxford data
    # image_min_max: a tupple of 2 pandas series, the min and max values from oxford training features
    image_data_raw = get_image_raw(data_dir)
    image_means = image_data_raw.iloc[:, 2:].mean().tolist()
    image_data = get_image_clean(sub_ids, image_data_raw, image_means)

    '''
    Note: Scale the text and image data BEFORE concatenating with relations
    Update: scaling w RobustScaler rather than MinMaxScaler algo, due to outliers
    '''
    features_to_scale = pd.concat([text_data, image_data.iloc[:, :-2]], axis=1, sort=False)
    #feat_min = features_to_scale.min()
    #feat_max = features_to_scale.max()
    feat_q10 = features_to_scale.quantile(q = 0.10)
    feat_q90 = features_to_scale.quantile(q = 0.90)

    #feat_scaled = (features_to_scale - feat_min) / (feat_max - feat_min)
    #features_min_max = (feat_min, feat_max)
    if scaling:
        feat_scaled = (features_to_scale - feat_q10) / (feat_q90 - feat_q10)
    else:
        feat_scaled = features_to_scale
    features_q10_q90 = (feat_q10, feat_q90)

    #if DEBUG:
    #    likes_kept = [str(v) for v in range(num_likes)]
    #else:
    likes_kept = get_likes_kept(data_dir, num_likes)

    # multi-hot matrix of likes from train data
    likes_data = get_relations(data_dir, sub_ids, likes_kept)

    # concatenate all scaled features into a single DataFrame
    train_features = pd.concat([feat_scaled, image_data.iloc[:, -2:], likes_data], axis=1, sort=False)

    # DataFrame of training set labels
    train_labels = preprocess_labels(data_dir, sub_ids)

    #return train_features, features_min_max, image_means, likes_kept, train_labels
    return train_features, features_q10_q90, image_means, likes_kept, train_labels


#def preprocess_test(data_dir, min_max_train, image_means_train, likes_kept_train):
def preprocess_test(data_dir, q10_q90_train, image_means_train, likes_kept_train, scaling=True):
    '''
    Purpose: preprocesses test dataset (no labels)
    Input:
        datadir {string}: path to Test data directory
        (**updated)q10_q90_train {tupple of two numpy arrays}: 10th and 90th quantile values for
                concatenated text and image features (from train set)
        image_means_train {list of float}: means from oxford training dataset to replace
                missing entries in oxford test set
        likes_kept_train {list of strings}: most frequent likes_ids from train set
                (ordered by frequency) to serve as columns in relation features matrix
        scaling {boolean}: if True, Robust scaling applied to data; no scaling if False       
    Output:
        test_features {pandas DataFrame}: vectorized features of test set
    '''
    # sub_ids: a numpy array of subject ids ordered alphabetically.
    # text_data: a pandas DataFrame of unscaled text data (liwc and nrc)
    sub_ids, text_data = get_text_data(data_dir)

    # image_data: pandas dataframe of oxford data
    # image_min_max: a tupple of 2 pandas series, the min and max values from oxford training features
    image_data_raw = get_image_raw(data_dir)
    image_data = get_image_clean(sub_ids, image_data_raw, image_means_train)

    '''
    Note: Scale the text and image data BEFORE concatenating with relations
    '''
    features_to_scale = pd.concat([text_data, image_data.iloc[:, :-2]], axis=1, sort=False)
    #feat_min = min_max_train[0]
    #feat_max = min_max_train[1]
    feat_q10 = q10_q90_train[0]
    feat_q90 = q10_q90_train[1]

    #feat_scaled = (features_to_scale - feat_min) / (feat_max - feat_min)
    if scaling:
        feat_scaled = (features_to_scale - feat_q10) / (feat_q90 - feat_q10)
    else:
        feat_scaled = features_to_scale
    
    # multi-hot matrix of likes from train data
    likes_data = get_relations(data_dir, sub_ids, likes_kept_train)

    # concatenate all scaled features into a single DataFrame
    test_features = pd.concat([feat_scaled, image_data.iloc[:, -2:], likes_data], axis=1, sort=False)

    return test_features


def get_train_val_sets(features, labels, val_prop):
    '''
    Purpose: Splits training dataset into a train and a validation set of
    ratio determined by val_prop (x = features, y = labels)
    Input
        features {pandas DataFrame}: vectorized features scaled between 0 and 1
                for each user id in the training set, concatenated for all modalities
                (order = text + image + relation), with userid as DataFrame index.
        labels {pandas DataFrame}: labels ordered by userid (alphabetically)
                for the training set, with userids as index.
        val_prop {float between 0 and 1}: proportion of sample in validation set
                    (e.g. 0.2 = 20% validation, 80% training)
    Output:
        x_train, x_val {pandas DataFrames}: vectorized features for train and validation sets
        y_train, y_val {pandas DataFrames}: train and validation set labels

    TO DO: convert outputted pandas to tensorflow tf.data.Dataset?...
    https://www.tensorflow.org/guide/data
    '''
    # NOTE: UNUSED
    from sklearn import model_selection
    x_train, x_val, y_train, y_val = model_selection.train_test_split(
        features, # training features to split
        labels, # training labels to split
        test_size = val_prop, # between 0 and 1, proportion of sample in validation set (e.g., 0.2)
        shuffle= True,
        #stratify = y_data[:1],
        # random_state = 42  # can use to always obtain the same train/validation split
        )

    return x_train, x_val, y_train, y_val


In [11]:
# to preprocess the training dataset:
# 1. set path to Train directory
# 2. call preprocess_train

train_path = '../Train' #modify if working from other directory

train_features, features_q10_q90, image_means, likes_kept, train_labels = preprocess_train(train_path, num_likes=10_000, scaling=True)


In [12]:
# split training data into training and validation sets

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    train_features, # training features to split
    train_labels, # training labels to split
    test_size = 0.2, # between 0 and 1, proportion of sample in validation set (e.g., 0.2)
    shuffle= True,
    stratify = train_labels['gender']
    # random_state = 42  # can use to always obtain the same train/validation split
    )


In [13]:
x_train.head()

Unnamed: 0_level_0,WC,WPS,Sixltr,Dic,Numerals,funct,pronoun,ppron,i,we,...,10150131036435262,10150136203045543,10150138573815473,10150145087245298,10150145163490188,10150145214185538,10150147152190368,10150154095435553,10150157058260374,10150169313485249
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8e70cf8c8446f608dbeb35c2b617c4a3,0.568966,2.206897,0.189227,0.502027,0.235521,0.729072,0.395091,0.61117,0.557012,0.0,...,False,False,False,False,False,False,False,False,False,False
0eccafd741e8cccefd9dc6d5ef4a2b1c,0.12069,0.521121,1.136782,0.568063,0.84556,0.464901,0.195566,0.167545,0.296199,0.52518,...,False,False,False,False,False,False,False,False,False,False
6af85b08bd4f37094fa8fad58f08824f,0.603448,0.203592,0.991495,0.852095,0.467181,0.893926,1.009501,1.174921,0.152031,0.0,...,False,False,False,False,False,False,False,False,False,False
097e40a05270cfea6eb9337e6b136362,0.396552,0.139655,0.72927,0.69784,0.250965,0.546314,0.497229,0.532139,0.855832,0.0,...,False,False,False,False,False,False,False,False,False,False
54d202140878ea52e1f77d1bfc14a46f,0.637931,0.164943,0.627923,0.689937,0.0,0.731437,0.660333,0.589041,0.38401,0.0,...,False,False,False,False,False,False,False,False,False,False


In [16]:
 #### Gender Classification
    
# hyper-parameters for gender classifier

batch_size=64
num_layers=1
dense_units=32
activation='tanh'
optimizer='sgd'
learning_rate=0.005
l1_reg=0.005
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
num_text_features = 91
num_image_features = 65 # added back noface and multiface    
num_like_pages = 10000


In [17]:
# build gender model

image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
likes_features = tf.keras.Input([num_like_pages], dtype=tf.bool, name="likes_features")

likes_float = tf.cast(likes_features, tf.float32)
likes_condensing_block = tf.keras.Sequential(name="likes_condensing_block")
for n_units in [256, 128, 64]:
    likes_condensing_block.add(tf.keras.layers.Dense(
        units=n_units,
        activation='tanh', #relu, tanh
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
    ))

condensed_likes = likes_condensing_block(likes_float)

dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        ))
        
    #if hparams.use_batchnorm:
    #    dense_layers.add(tf.keras.layers.BatchNormalization())
        
    #if hparams.use_dropout:
    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))
    
features = dense_layers([text_features, image_features, condensed_likes])

gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)

model_gender = tf.keras.Model(
    inputs=[text_features, image_features, likes_features],
    #outputs=[age_group, gender, ext, ope, agr, neu, con]
    outputs= gender
)    

model_gender.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    #loss_weights = 1.0, #needs to be a dictionnary... check doc for format
    metrics = [tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Recall()]
)


In [24]:
# split data per modality to input model

x_txt = x_train.iloc[:, :91].values
x_img = x_train.iloc[:, 91:156].values
x_lik = x_train.iloc[:, 156:].values

history = model_gender.fit([x_txt, x_img, x_lik], y_train['gender'].values, shuffle=True, batch_size=128, epochs=50, verbose=1, validation_split=0.2)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50


Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [26]:
x_txt_v = x_val.iloc[:, :91].values
x_img_v = x_val.iloc[:, 91:156].values
x_lik_v = x_val.iloc[:, 156:].values

score = model_gender.evaluate([x_txt_v, x_img_v, x_lik_v], y_val['gender'].values, verbose=1)




In [20]:
# hyper-parameters for age classifier

batch_size=64
num_layers=2
dense_units=64
activation='tanh'
optimizer='sgd'
learning_rate=0.00005
l1_reg=0.0025
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
num_text_features = 91
num_image_features = 65 # added back noface and multiface    
num_like_pages = 10000


In [21]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    train_features, # training features to split
    train_labels, # training labels to split
    test_size = 0.2, # between 0 and 1, proportion of sample in validation set (e.g., 0.2)
    shuffle= True,
    stratify = train_labels['age_group']
    # random_state = 42  # can use to always obtain the same train/validation split
    )


In [22]:
# calculating weights for age categories w sklearn
#https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html

a_weights = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced', classes = np.unique(y_train['age_group']), y= y_train['age_group'])

age_weights_dict = {}

for i in range(len(a_weights)):
    age_weights_dict[i] = a_weights[i]
    
print(age_weights_dict)    


{0: 0.4189636163175303, 1: 0.9890681936491411, 2: 2.272727272727273, 3: 6.1688311688311686}


In [23]:
# model category age

image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
likes_features = tf.keras.Input([num_like_pages], dtype=tf.bool, name="likes_features")

likes_float = tf.cast(likes_features, tf.float32)
likes_condensing_block = tf.keras.Sequential(name="likes_condensing_block")
for n_units in [32]:
#for n_units in [256, 128, 64]:
    likes_condensing_block.add(tf.keras.layers.Dense(
        units=n_units,
        activation= 'tanh', #'tanh', #relu, tanh
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.L1L2(l2=l2_reg),
    ))

condensed_likes = likes_condensing_block(likes_float)

dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation= 'tanh', #'tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.L1L2(l2=l2_reg),        
        ))
        
    #if hparams.use_batchnorm:
    #    dense_layers.add(tf.keras.layers.BatchNormalization())
        
    #if hparams.use_dropout:
    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))
    
features = dense_layers([text_features, image_features, condensed_likes])

#gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)
age_group = tf.keras.layers.Dense(units=4, activation="softmax", name="age_group")(features)

model_age = tf.keras.Model(
    inputs=[text_features, image_features, likes_features],
    #outputs=[age_group, gender, ext, ope, agr, neu, con]
    outputs= age_group
)    

model_age.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    #loss_weights = age_weights_dict,
    metrics = ['acc', 'categorical_accuracy']
)


print(model_age.summary())


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
likes_features (InputLayer)     [(None, 10000)]      0                                            
__________________________________________________________________________________________________
tf_op_layer_Cast_2 (TensorFlowO [(None, 10000)]      0           likes_features[0][0]             
__________________________________________________________________________________________________
text_features (InputLayer)      [(None, 91)]         0                                            
__________________________________________________________________________________________________
image_features (InputLayer)     [(None, 65)]         0                                            
____________________________________________________________________________________________

In [28]:
x_txt = x_train.iloc[:, :91].values
x_img = x_train.iloc[:, 91:156].values
x_lik = x_train.iloc[:, 156:].values

y_train_age = tf.keras.utils.to_categorical(y_train['age_group'].values)

#history = model_age.fit([x_txt, x_img, x_lik], y_train_age, shuffle=True, batch_size=64, epochs=50, verbose=1, validation_split=0.2, class_weight=age_weights_dict)

history = model_age.fit([x_txt, x_img, x_lik], y_train_age, shuffle=True, batch_size=64, epochs=50, verbose=1, validation_split=0.2)



Train on 6080 samples, validate on 1520 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [29]:

x_txt_v = x_val.iloc[:, :91].values
x_img_v = x_val.iloc[:, 91:156].values
x_lik_v = x_val.iloc[:, 156:].values

y_val_age = tf.keras.utils.to_categorical(y_val['age_group'].values)

score = model_age.evaluate([x_txt_v, x_img_v, x_lik_v], y_val_age, verbose=1)





In [30]:
# baseline for age group:
y_val['age_group'].value_counts()[0]/y_val.shape[0]

0.5968421052631578

In [31]:
# hyper-parameters for age classifier without likes

batch_size=64
num_layers=2
dense_units=16
activation='tanh'
optimizer='sgd'
learning_rate=0.00005
l1_reg=0.0025
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
num_text_features = 91
num_image_features = 65 # added back noface and multiface    
num_like_pages = 10000


In [32]:
# model category age without likes

image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
#likes_features = tf.keras.Input([num_like_pages], dtype=tf.bool, name="likes_features")

#likes_float = tf.cast(likes_features, tf.float32)
#likes_condensing_block = tf.keras.Sequential(name="likes_condensing_block")
#for n_units in [16]:
#for n_units in [256, 128, 64]:
#    likes_condensing_block.add(tf.keras.layers.Dense(
#        units=n_units,
#        activation='tanh', #relu, tanh
#        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
#        #kernel_regularizer=tf.keras.regularizers.L1L2(l2=l2_reg),
#    ))

#condensed_likes = likes_condensing_block(likes_float)

dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.L1L2(l2=l2_reg),        
        ))
        
    #if hparams.use_batchnorm:
    #    dense_layers.add(tf.keras.layers.BatchNormalization())
        
    #if hparams.use_dropout:
    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))
    
#features = dense_layers([text_features, image_features, condensed_likes])
features = dense_layers([text_features, image_features])

#gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)
age_group = tf.keras.layers.Dense(units=4, activation="softmax", name="age_group")(features)

model_age = tf.keras.Model(
    inputs=[text_features, image_features],
    #outputs=[age_group, gender, ext, ope, agr, neu, con]
    outputs= age_group
)    

model_age.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    #loss_weights = age_weights_dict,
    metrics = ['acc', 'categorical_accuracy']
)


print(model_age.summary())


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_features (InputLayer)      [(None, 91)]         0                                            
__________________________________________________________________________________________________
image_features (InputLayer)     [(None, 65)]         0                                            
__________________________________________________________________________________________________
dense_layers (Sequential)       (None, 16)           2784        text_features[0][0]              
                                                                 image_features[0][0]             
__________________________________________________________________________________________________
age_group (Dense)               (None, 4)            68          dense_layers[0][0]         

In [34]:
# age model without likes

x_txt = x_train.iloc[:, :91].values
x_img = x_train.iloc[:, 91:156].values
#x_lik = x_train.iloc[:, 156:].values

y_train_age = tf.keras.utils.to_categorical(y_train['age_group'].values)

#history = model_age.fit([x_txt, x_img, x_lik], y_train_age, shuffle=True, batch_size=32, epochs=50, verbose=1, validation_split=0.2, class_weight=age_weights_dict)
#history = model_age.fit([x_txt, x_img], y_train_age, shuffle=True, batch_size=128, epochs=50, verbose=1, validation_split=0.2, class_weight=age_weights_dict)

history = model_age.fit([x_txt, x_img], y_train_age, shuffle=True, batch_size=128, epochs=50, verbose=1, validation_split=0.2)




Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50


Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [35]:
x_txt_v = x_val.iloc[:, :91].values
x_img_v = x_val.iloc[:, 91:156].values
#x_lik_v = x_val.iloc[:, 156:].values

y_val_age = tf.keras.utils.to_categorical(y_val['age_group'].values)

#score = model_age.evaluate([x_txt_v, x_img_v, x_lik_v], y_val_age, verbose=1)
score = model_age.evaluate([x_txt_v, x_img_v], y_val_age, verbose=1)



In [29]:
# hyper-parameters for personality classifier

batch_size=64
num_layers=3
dense_units=32
activation='tanh'
optimizer='sgd'
learning_rate=0.00005
l1_reg=0.0025
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
num_text_features = 91
num_image_features = 65 # added back noface and multiface    
num_like_pages = 10000


In [30]:
# model to predict a personality trait

image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
likes_features = tf.keras.Input([num_like_pages], dtype=tf.bool, name="likes_features")

likes_float = tf.cast(likes_features, tf.float32)
likes_condensing_block = tf.keras.Sequential(name="likes_condensing_block")
for n_units in [32, 16]:
    likes_condensing_block.add(tf.keras.layers.Dense(
        units=n_units,
        activation='tanh', #relu, tanh
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
    ))

condensed_likes = likes_condensing_block(likes_float)

dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
        ))
        
    #if hparams.use_batchnorm:
    #    dense_layers.add(tf.keras.layers.BatchNormalization())
        
    #if hparams.use_dropout:
    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))
    
features = dense_layers([text_features, image_features, condensed_likes])

#gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)
def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Returns a layer that scales a sigmoid output [0, 1) output to the desired 'personality' range of [1, 5)
        
    Arguments:
        name {str} -- the name to give to the layer.
        
    Returns:
        tf.keras.layers.Layer -- the layer to use.
    """
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)

ext_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ext_sigmoid")(features)
ext = personality_scaling("ext")(ext_sigmoid)

#ope_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ope_sigmoid")(features)
#ope = personality_scaling("ope")(ope_sigmoid)
    
#agr_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="agr_sigmoid")(features)
#agr = personality_scaling("agr")(agr_sigmoid)
    
#neu_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="neu_sigmoid")(features)
#neu = personality_scaling("neu")(neu_sigmoid)
    
#con_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="con_sigmoid")(features)
#con = personality_scaling("con")(con_sigmoid)

model_perso = tf.keras.Model(
    inputs=[text_features, image_features, likes_features],
    #outputs=[age_group, gender, ext, ope, agr, neu, con]
    outputs= ext
)    

model_perso.compile(
    optimizer = 'adam',
    loss = 'mse',
    #loss_weights = 1.0, #needs to be a dictionnary... check doc for format
    metrics = [tf.keras.metrics.RootMeanSquaredError()]
)


In [31]:
# personality classifier

x_txt = x_train.iloc[:, :91].values
x_img = x_train.iloc[:, 91:156].values
x_lik = x_train.iloc[:, 156:].values

#'ope', 'con', 'ext', 'agr', 'neu'
#history = model_perso.fit([x_txt, x_img, x_lik], y_train['ext'].values, batch_size=7600, epochs=1, verbose=1, validation_split=0.2)

history = model_perso.fit([x_txt, x_img, x_lik], y_train['ext'].values, batch_size=128, epochs=25, verbose=1, validation_split=0.2)



Train on 6080 samples, validate on 1520 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [32]:
x_txt_v = x_val.iloc[:, :91].values
x_img_v = x_val.iloc[:, 91:156].values
x_lik_v = x_val.iloc[:, 156:].values

score = model_perso.evaluate([x_txt_v, x_img_v, x_lik_v], y_val['ext'].values, verbose=1)




In [36]:
# hyper-parameters for personality classifier without likes

batch_size=64
num_layers=1
dense_units=32
activation='tanh'
optimizer='sgd'
learning_rate=0.00005
l1_reg=0.0025
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
num_text_features = 91
num_image_features = 65 # added back noface and multiface    
num_like_pages = 10000



In [37]:
# model to predict a personality trait without likes

image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
#likes_features = tf.keras.Input([num_like_pages], dtype=tf.bool, name="likes_features")

#likes_float = tf.cast(likes_features, tf.float32)
#likes_condensing_block = tf.keras.Sequential(name="likes_condensing_block")
#for n_units in [32, 16]:
#    likes_condensing_block.add(tf.keras.layers.Dense(
#        units=n_units,
#        activation='tanh', #relu, tanh
#        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
#        #kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
#    ))

#condensed_likes = likes_condensing_block(likes_float)

dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
        ))
        
    #if hparams.use_batchnorm:
    #    dense_layers.add(tf.keras.layers.BatchNormalization())
        
    #if hparams.use_dropout:
    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))
    
#features = dense_layers([text_features, image_features, condensed_likes])
features = dense_layers([text_features, image_features])

#gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)
def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Returns a layer that scales a sigmoid output [0, 1) output to the desired 'personality' range of [1, 5)
        
    Arguments:
        name {str} -- the name to give to the layer.
        
    Returns:
        tf.keras.layers.Layer -- the layer to use.
    """
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)

#ext_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ext_sigmoid")(features)
#ext = personality_scaling("ext")(ext_sigmoid)

#ope_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ope_sigmoid")(features)
#ope = personality_scaling("ope")(ope_sigmoid)
    
#agr_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="agr_sigmoid")(features)
#agr = personality_scaling("agr")(agr_sigmoid)
    
#neu_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="neu_sigmoid")(features)
#neu = personality_scaling("neu")(neu_sigmoid)
    
con_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="con_sigmoid")(features)
con = personality_scaling("con")(con_sigmoid)

model_perso = tf.keras.Model(
    inputs=[text_features, image_features],
    #outputs=[age_group, gender, ext, ope, agr, neu, con]
    outputs= con
)    

model_perso.compile(
    optimizer = 'adam',
    loss = 'mse',
    #loss_weights = 1.0, #needs to be a dictionnary... check doc for format
    metrics = [tf.keras.metrics.RootMeanSquaredError()]
)


In [38]:
# personality classifier

x_txt = x_train.iloc[:, :91].values
x_img = x_train.iloc[:, 91:156].values
#x_lik = x_train.iloc[:, 156:].values

#'ope', 'con', 'ext', 'agr', 'neu'

history = model_perso.fit([x_txt, x_img], y_train['con'].values, batch_size=32, epochs=25, verbose=1, validation_split=0.2)


Train on 6080 samples, validate on 1520 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [39]:
x_txt_v = x_val.iloc[:, :91].values
x_img_v = x_val.iloc[:, 91:156].values
#x_lik_v = x_val.iloc[:, 156:].values

#'ope', 'con', 'ext', 'agr', 'neu'

score = model_perso.evaluate([x_txt_v, x_img_v], y_val['con'].values, verbose=1)




In [41]:
print(y_train['con'].mean())
con_rmse = np.sqrt(np.mean((y_val['con'] - y_train['con'].mean())**2))
con_rmse

3.4387907894736838


0.7212658282119913

In [42]:
y_val.shape

(1900, 8)

In [43]:
y_val['con']

userid
29c0092c69d11a04fbed7ba8f1db8e02    4.00
0be33289ee687c5767deb7bd57db87ee    3.50
19ff6d613a39ed66cb9e533c49e93b52    4.25
fbb9331f4415d19775e1bc72137c1f3d    2.90
b3fafed766d54c1154cd714cfe4af175    4.25
                                    ... 
c33224f8546c2904d83033bf42fe5571    3.25
1b022fed34436e3a6f39a6c28bdd4f26    3.25
50011bcbe71fe8cad36c217505d67469    4.00
310f3a49bee4446ebad1977558a919de    3.50
c6be4b2c9902c76ba6274e161ed9b16e    3.55
Name: con, Length: 1900, dtype: float64

In [44]:
# hyper-parameters for personality classifier with text features only

batch_size=64
num_layers=1
dense_units=8
activation='tanh'
optimizer='sgd'
learning_rate=0.00005
l1_reg=0.0025
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
num_text_features = 91
num_image_features = 65 # added back noface and multiface    
num_like_pages = 10000




In [57]:
# model to predict a personality trait without likes

text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")

dense_layers = tf.keras.Sequential(name="dense_layers")
#dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        #kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
        ))
        
    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))
    
#features = dense_layers([text_features, image_features, condensed_likes])
features = dense_layers(text_features)

#gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)
def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Returns a layer that scales a sigmoid output [0, 1) output to the desired 'personality' range of [1, 5)
        
    Arguments:
        name {str} -- the name to give to the layer.
        
    Returns:
        tf.keras.layers.Layer -- the layer to use.
    """
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)

#ext_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ext_sigmoid")(features)
#ext = personality_scaling("ext")(ext_sigmoid)

#ope_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ope_sigmoid")(features)
#ope = personality_scaling("ope")(ope_sigmoid)
    
#agr_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="agr_sigmoid")(features)
#agr = personality_scaling("agr")(agr_sigmoid)
    
#neu_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="neu_sigmoid")(features)
#neu = personality_scaling("neu")(neu_sigmoid)
    
con_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="con_sigmoid")(features)
con = personality_scaling("con")(con_sigmoid)

model_perso = tf.keras.Model(
    inputs=[text_features, image_features],
    #outputs=[age_group, gender, ext, ope, agr, neu, con]
    outputs= con
)    

model_perso.compile(
    optimizer = 'adam',
    loss = 'mse',
    metrics = [tf.keras.metrics.RootMeanSquaredError()]
)


In [58]:
x_txt = x_train.iloc[:, :91].values

#'ope', 'con', 'ext', 'agr', 'neu'
history = model_perso.fit([x_txt, x_img], y_train['con'].values, shuffle=True, batch_size=32, epochs=50, verbose=1, validation_split=0.2)



Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [59]:
x_txt_v = x_val.iloc[:, :91].values

#'ope', 'con', 'ext', 'agr', 'neu'

score = model_perso.evaluate([x_txt_v, x_img_v], y_val['con'].values, verbose=1)

con_rmse = np.sqrt(np.mean((y_val['con'] - y_train['con'].mean())**2))
con_rmse



0.7212658282119913