In [54]:
%matplotlib inline

In [55]:
import os
import glob

import numpy as np
import pandas as pd

import tensorflow as tf

from typing import *
from collections import Counter

import sklearn
from sklearn import model_selection


In [56]:

def get_text_data(input_dir):
    """
    Purpose: load the liwc and nrc text features and combine them in one table
    Input
        input_dir {string} : path to input_directory (ex, "~/Train");
                the files <input_dir>/Text/liwc.csv and <input_dir>/Text/nrc.csv are read
    Output:
        id_list {numpy array of strings}: userIds sorted in ascending order,
                                        to determine order of features and labels DataFrames
        text_data {pandas DataFrame of float}: unscaled text data (liwc columns followed
                                        by nrc columns), indexed by userId
    Raises:
        Exception: when the sorted userId columns of the two files are not identical
    """
    text_dir = os.path.join(input_dir, "Text")

    def load_sorted(file_name):
        # Read one feature file and order its rows by userId
        table = pd.read_csv(os.path.join(text_dir, file_name), sep=',')
        return table.sort_values(by=['userId'])

    liwc = load_sorted("liwc.csv")
    nrc = load_sorted("nrc.csv")

    # Both files must describe exactly the same users, in the same (sorted) order
    if not np.array_equal(liwc['userId'], nrc['userId']):
        raise Exception('userIds do not match between liwc and nrc data')
    id_list = liwc['userId'].to_numpy()

    # Side-by-side merge of the two tables, aligned on userId
    text_data = pd.concat(
        [liwc.set_index('userId'), nrc.set_index('userId')],
        axis=1,
        sort=False,
    )

    return id_list, text_data


def get_image_clean(sub_ids, oxford, means):
    '''
    Purpose: preprocess oxford metrics derived from profile pictures (part 2)
    Input:
        sub_ids {numpy array of strings}: ordered list of userIDs
        oxford {pandas DataFrame of floats}: unscaled oxford features of users with 1+ face.
                    The first two columns are expected to be 'userId' and 'faceID';
                    every following column is treated as a feature.
                    The frame is NOT modified (a copy is processed).
        means {list of float}: mean values for each feature averaged from train set,
                    to replace missing values for userids with no face (train and test set).
                    Paired positionally with the feature columns.
    Output:
        image_data {pandas DataFrame of float}: unscaled oxford image data, one row per
                userid, indexed by userId and sorted by userId, with mean values replacing
                missing entries and two trailing flag columns:
                'noface' (1 = no face found) and 'multiface' (1 = several faces found)
    Raises:
        Exception: when the resulting index differs from sub_ids
    '''
    # Work on a copy: the caller's frame must stay usable after this call
    # (e.g. calling the function twice on the same frame).
    faces = oxford.copy()
    # Feature names, taken BEFORE the flag columns are appended
    feature_columns = faces.columns[2:].tolist()

    # ids with at least one face on image, and ids of sub_ids without any entry
    ids_with_face = np.sort(faces['userId'].unique(), axis=None)
    ids_without_face = np.setdiff1d(sub_ids, ids_with_face)

    # One row per userid without face; features are filled with the supplied means
    no_face = pd.DataFrame(ids_without_face, columns=['userId'])
    for column, mean in zip(feature_columns, means):
        no_face[column] = mean
    no_face['noface'] = 1
    no_face['multiface'] = 0
    no_face = no_face.set_index('userId')

    # Users with 1+ face: flag every userid that occurs more than once, then keep
    # only the first entry of each userid.
    faces['noface'] = 0
    faces['multiface'] = faces['userId'].duplicated(keep=False).astype('int64')
    faces = (
        faces.drop_duplicates(subset='userId', keep='first')
        .drop(columns=['faceID'])
        .set_index('userId')
    )

    # Stack the two groups and restore the userId ordering
    image_data = pd.concat([no_face, faces], axis=0, sort=False).sort_values(by=['userId'])

    if not np.array_equal(image_data.index, sub_ids):
        raise Exception('userIds do not match between oxford file and id list')

    return image_data


def get_image_raw(data_dir):
    '''
    Purpose: preprocess oxford metrics derived from profile pictures (part 1)
    Input
        data_dir {string} : path to input_directory (ex, "~/Train");
                the file <data_dir>/Image/oxford.csv is read
    Output:
        oxford {pandas DataFrame}: unscaled oxford face metrics as stored in the file
                (one row per detected face), without the 'headPose_pitch' column
    '''
    # Author's notes on the train set: 7915 entries; some users have no face, some have
    # several faces on their image (7174 of 9500 userids have 1+ face, 741 rows are
    # additional faces of an already listed userid).
    oxford_path = os.path.join(data_dir, "Image", "oxford.csv")
    oxford = pd.read_csv(oxford_path, sep=',')

    # headPose_pitch has no range (author's note), so the feature is discarded
    return oxford.drop(columns=['headPose_pitch'])


def get_likes_kept(data_dir, num_features) -> List[str]:
    '''
    Purpose: get list of likes to keep as features
    Input:
        data_dir {str} : the parent input directory;
                        the file <data_dir>/Relation/Relation.csv is read
        num_features {int} : the number of likes to keep as features,
                        starting from those with highest frequencies
    Output:
        likes_str_list {List of strings}: the like_ids of the num_features most frequent
                    likes, converted to strings and ordered by ascending like_id
                    (NOT by frequency). Shorter than num_features when the file holds
                    fewer distinct like_ids.
    '''
    relation_path = os.path.join(data_dir, "Relation", "Relation.csv")
    # Only the like_id column is needed to count occurrences
    like_ids = pd.read_csv(relation_path, usecols=['like_id'])['like_id']

    # value_counts orders by descending frequency; .iloc keeps the first num_features rows
    most_frequent = like_ids.value_counts(sort=True, ascending=False).iloc[:num_features]

    # Re-order the kept ids by like_id so that the feature order is stable
    return [str(like_id) for like_id in sorted(most_frequent.index)]


def get_relations(data_dir: str, sub_ids: List[str], like_ids_to_keep: List[str]):
    '''
    Purpose: preprocess relations dataset ('likes')

    Input:
        data_dir {str} -- the parent input directory;
                the file <data_dir>/Relation/Relation.csv is read
        sub_ids {numpy array of strings} -- the ordered list of userids
        like_ids_to_keep {List[str]} -- The list of page IDs to keep, as strings
                (as returned by get_likes_kept). They become the column labels.

    Returns:
        relations_data -- multihot matrix of the like_id. Rows are indexed with userid,
                columns follow like_ids_to_keep, entries are boolean.

    Raises:
        Exception -- when the row index of the matrix differs from sub_ids
                (e.g. a kept like comes from a user that is not in sub_ids)
    '''
    relation = pd.read_csv(os.path.join(data_dir, "Relation", "Relation.csv"))
    relation = relation.drop(['Unnamed: 0'], axis=1)

    # like_id is parsed as integers from the csv while the ids to keep are strings.
    # Series.isin does not coerce between the two, so compare (and label the dummy
    # columns) as strings explicitly.
    relation['like_id'] = relation['like_id'].astype(str)

    total_num_pages = len(like_ids_to_keep)
    # The one-hot encoding is done batch by batch to bound the size of each dummy table
    batch_size = 1000

    # Empty frame carrying sub_ids as index, so that users without any kept like get a row
    pieces = [pd.DataFrame(index=pd.Index(sub_ids, name='userid'))]

    for start_index in range(0, total_num_pages, batch_size):
        end_index = min(start_index + batch_size, total_num_pages)

        # sets are better for membership testing than lists.
        like_ids_for_this_batch = set(like_ids_to_keep[start_index:end_index])
        filtered_table = relation[relation['like_id'].isin(like_ids_for_this_batch)]

        # One column per like_id of the batch (the slow part), then one row per userid
        rel_hot = pd.get_dummies(filtered_table, columns=['like_id'], prefix="", prefix_sep="")
        pieces.append(rel_hot.groupby(['userid']).sum().astype(float))

    # Single outer join of all batches on userid (rows sorted by userid)
    relation_data = pd.concat(pieces, axis=1, sort=True)

    # Fix the column order, and turn missing entries / counts into booleans
    relation_data = relation_data.reindex(like_ids_to_keep, axis=1)
    relation_data = relation_data.fillna(0.0).astype("bool")

    # will be different if users in relation.csv are not in sub_ids
    if not np.array_equal(relation_data.index, sub_ids):
        raise Exception(f"""userIds do not match between relation file and id list:
    {relation_data.index}
    {sub_ids}

    """)

    return relation_data

def get_likes_lists(likes_data, max_num_likes):
    '''
    Purpose: make list of lists of indices of liked pages per user
    Input:
        likes_data {pandas DataFrame}: multihot matrix of the like_id. Rows are indexed with userid, entries are boolean
        max_num_likes {int}: length of every returned row
    Output:
        lists_of_likes {pandas DataFrame of int}: one row per user (indexed by 'userid')
                holding the column positions of the pages liked by that user,
                padded at the end with zeros to length = max_num_likes

    NOTE: positions are 0-based and the padding value is also 0, so a like of the first
    column cannot be told apart from padding in the output.
    NOTE: pad_sequences is called with its default truncation; per the Keras
    documentation a user with more than max_num_likes likes loses entries from the
    START of the list.
    '''
    # Positions of the non-zero entries of each row. Rows are scanned positionally,
    # which avoids one label lookup per user and stays correct with repeated userids.
    index_lists = [np.flatnonzero(row).tolist() for row in likes_data.to_numpy()]

    # pad each list of indices with 0s to set length = max_num_likes
    lists_padded = tf.keras.preprocessing.sequence.pad_sequences(
        index_lists, padding='post', maxlen=max_num_likes)

    # Same row order as likes_data, with the index named 'userid'
    return pd.DataFrame(lists_padded, index=likes_data.index.rename('userid'))

def make_label_dict(labels):
    '''
    Purpose: make dictionary of labels from pandas DataFrame
    Input:
        labels {pandas DataFrame}: labels ordered per userids, with userids as index.
                Must hold 'gender', 'ope', 'con', 'ext', 'agr', 'neu' and either the four
                one-hot age columns ('age_xx_24', 'age_25_34', 'age_35_49', 'age_50_xx')
                or an 'age_group' column with the group id 0..3
                (the format produced by preprocess_labels).
    Output:
        labels_dict {dictionary}: one entry per metric, each indexed by userid:
                'userid' (the index itself), 'gender', 'age_grps' (DataFrame with the four
                one-hot age columns), 'ope', 'con', 'ext', 'agr', 'neu'.
    Raises:
        KeyError: when a required column is missing
    '''
    age_columns = ['age_xx_24', 'age_25_34', 'age_35_49', 'age_50_xx']

    if set(age_columns).issubset(labels.columns):
        age_grps = labels[age_columns]
    elif 'age_group' in labels.columns:
        # Rebuild the one-hot table from the group id: column k is True where age_group == k
        age_grps = pd.DataFrame(
            {column: labels['age_group'] == group_id
             for group_id, column in enumerate(age_columns)},
            columns=age_columns,
        )
    else:
        raise KeyError(f"labels needs the columns {age_columns} or an 'age_group' column")

    # Note: each value is indexed by userid in labels_dict
    labels_dict = {
        'userid': labels.index,
        'gender': labels['gender'],
        'age_grps': age_grps,
    }
    for trait in ('ope', 'con', 'ext', 'agr', 'neu'):
        labels_dict[trait] = labels[trait]

    return labels_dict


def preprocess_labels(data_dir, sub_ids):
    '''
    Purpose: preprocess entry labels from training set
    Input:
        data_dir {string} : path to training data directory;
                the file <data_dir>/Profile/Profile.csv is read
        sub_ids {numpy array of strings}: list of subject ids in sorted order
    Output:
        labels {pandas DataFrame}: labels ordered by userid for the training set,
                with userids as index, an added 'age_group' column (0 to 3) and
                without the 'Unnamed: 0' column of the file.
    Raises:
        Exception: when the sorted userids of the file differ from sub_ids
    '''
    def age_group_id(age_str: str) -> int:
        """Returns the age group category ID (an integer from 0 to 3) for the given age

        Arguments:
            age_str {str} -- the age (anything int() accepts)

        Returns:
            int -- the ID of the age group: 0 for xx-24, 1 for 25-34, 2 for 35-49 and 3 for 50-xx.
        """
        age = int(age_str)
        # Inclusive upper bound of groups 0, 1 and 2; anything above falls in group 3
        for group_id, upper_bound in enumerate((24, 34, 49)):
            if age <= upper_bound:
                return group_id
        return 3

    profile_path = os.path.join(data_dir, "Profile", "Profile.csv")
    labels = pd.read_csv(profile_path)
    labels = labels.assign(age_group=labels["age"].map(age_group_id))

    labels = labels.sort_values(by=['userid'])
    # the file must describe exactly the users of sub_ids, in the same order
    if not np.array_equal(labels['userid'].to_numpy(), sub_ids):
        raise Exception('userIds do not match between profiles labels and id list')

    return labels.drop(['Unnamed: 0'], axis=1).set_index('userid')


def preprocess_train(data_dir, num_likes=10_000, max_num_likes=2145):
    '''
    Purpose: preprocesses training dataset (with labels) and returns scaled features,
    labels and parameters to scale the test data set
    Input
        data_dir {string}: path to ~/Train data directory
        num_likes {int}: number of like_ids to keep as features
        max_num_likes {int}: length of the padded list of liked pages of each user
    Output:
        train_features {pandas DataFrame}: features for each user id in the training set,
                concatenated for all modalities with userid as DataFrame index.
                Column order = scaled text features + scaled image features
                + 'noface' and 'multiface' flags + padded lists of liked-page indices.
                Scaling is (x - q10) / (q90 - q10), so values are NOT bounded to [0, 1].
        features_q10_q90 {tuple of 2 pandas Series}: 10th and 90th quantile values of
                text + image features from train dataset, to be used to scale test data.
                The flags and the likes are not scaled.
        image_means {list of float}: means from oxford dataset to replace missing entries in oxford test set
        likes_kept {list of strings}: ordered likes_ids to serve as columns for test set relation features matrix
        train_labels {pandas DataFrame}: labels ordered by userid
                for the training set, with userids as index.

    TO CONSIDER: convert outputted pandas to tensorflow tf.data.Dataset...
    https://www.tensorflow.org/guide/data
    '''
    # Text modality: sorted user ids + unscaled liwc/nrc features
    sub_ids, text_data = get_text_data(data_dir)

    # Image modality: feature means are taken from the raw face table (all columns after
    # the first two) before it is cleaned, and are used to fill users without a face
    oxford_raw = get_image_raw(data_dir)
    image_means = oxford_raw.iloc[:, 2:].mean().tolist()
    image_data = get_image_clean(sub_ids, oxford_raw, image_means)

    # The last two image columns are the noface / multiface flags: kept aside, unscaled
    face_flags = image_data.iloc[:, -2:]
    continuous_features = pd.concat([text_data, image_data.iloc[:, :-2]], axis=1, sort=False)

    # Quantile-based scaling (10th / 90th percentile) rather than min-max, due to outliers.
    # Text and image data are scaled BEFORE being joined with the relations.
    feat_q10 = continuous_features.quantile(q=0.10)
    feat_q90 = continuous_features.quantile(q=0.90)
    features_q10_q90 = (feat_q10, feat_q90)
    feat_scaled = (continuous_features - feat_q10) / (feat_q90 - feat_q10)

    # Relation modality: most frequent likes -> multi-hot matrix -> padded index lists
    likes_kept = get_likes_kept(data_dir, num_likes)
    likes_data = get_relations(data_dir, sub_ids, likes_kept)
    train_likes_lists = get_likes_lists(likes_data, max_num_likes)

    # concatenate all features into a single DataFrame
    train_features = pd.concat(
        [feat_scaled, face_flags, train_likes_lists], axis=1, sort=False)

    # DataFrame of training set labels
    train_labels = preprocess_labels(data_dir, sub_ids)

    return train_features, features_q10_q90, image_means, likes_kept, train_labels


#def preprocess_test(data_dir, min_max_train, image_means_train, likes_kept_train):
def preprocess_test(data_dir, q10_q90_train, image_means_train, likes_kept_train, max_num_likes=2145):
    '''
    Purpose: preprocesses test dataset (no labels)
    Input:
        data_dir {string}: path to Test data directory
        q10_q90_train {pair}: 10th and 90th quantile values (in that order) for
                concatenated text and image features (from train set)
        image_means_train {list of float}: means from oxford training dataset to replace
                missing entries in oxford test set
        likes_kept_train {list of strings}: likes_ids kept on the train set, in the order
                returned by get_likes_kept, to serve as columns in relation features matrix
        max_num_likes {int}: length of the padded list of liked pages of each user
                (same value as for the train set)
    Output:
        test_features {pandas DataFrame}: features of test set, indexed by userid.
                Column order = scaled text features + scaled image features
                + 'noface' and 'multiface' flags + padded lists of liked-page indices.

    '''
    # Text modality: sorted user ids + unscaled liwc/nrc features
    sub_ids, text_data = get_text_data(data_dir)

    # Image modality: users without a face are filled with the TRAIN means
    oxford_raw = get_image_raw(data_dir)
    image_data = get_image_clean(sub_ids, oxford_raw, image_means_train)

    # The last two image columns are the noface / multiface flags: kept aside, unscaled
    face_flags = image_data.iloc[:, -2:]
    continuous_features = pd.concat([text_data, image_data.iloc[:, :-2]], axis=1, sort=False)

    # Scale text and image data with the train quantiles, BEFORE joining the relations
    feat_q10 = q10_q90_train[0]
    feat_q90 = q10_q90_train[1]
    feat_scaled = (continuous_features - feat_q10) / (feat_q90 - feat_q10)

    # multi-hot matrix of the likes kept on the train set, for the users of this data set
    likes_data = get_relations(data_dir, sub_ids, likes_kept_train)

    # list of lists of indices corresponding to pages liked
    # each padded with 0s (list's length = max_num_likes)
    test_likes_lists = get_likes_lists(likes_data, max_num_likes)

    # concatenate all features into a single DataFrame
    test_features = pd.concat(
        [feat_scaled, face_flags, test_likes_lists], axis=1, sort=False)

    return test_features


def get_train_val_sets(features, labels, val_prop, random_state=None, stratify=None):
    '''
    Purpose: Splits training dataset into a train and a validation set of
    ratio determined by val_prop (x = features, y = labels)
    Input
        features {pandas DataFrame}: features for each user id in the training set,
                concatenated for all modalities, with userid as DataFrame index.
        labels {pandas DataFrame}: labels ordered by userid
                for the training set, with userids as index.
        val_prop {float between 0 and 1}: proportion of sample in validation set
                    (e.g. 0.2 = 20% validation, 80% training)
        random_state {int or None}: seed of the shuffle; pass an int to always obtain
                    the same train/validation split (default None = different every call)
        stratify {array-like or None}: class labels whose proportions must be preserved
                    in both sets, e.g. labels['gender'] (default None = no stratification)
    Output:
        x_train, x_val {pandas DataFrames}: features for train and validation sets
        y_train, y_val {pandas DataFrames}: train and validation set labels

    TO DO: convert outputted pandas to tensorflow tf.data.Dataset?...
    https://www.tensorflow.org/guide/data
    '''
    # NOTE: this helper is not called by the cells of this notebook, which call
    # model_selection.train_test_split directly.
    from sklearn import model_selection
    x_train, x_val, y_train, y_val = model_selection.train_test_split(
        features,  # training features to split
        labels,  # training labels to split
        test_size=val_prop,  # proportion of sample in validation set (e.g., 0.2)
        shuffle=True,
        random_state=random_state,
        stratify=stratify,
    )

    return x_train, x_val, y_train, y_val


In [57]:
# Preprocess the training dataset:
#   1. point train_path at the Train directory
#   2. call preprocess_train

train_path = '../Train'  # modify if working from another directory

(
    train_features,
    features_q10_q90,
    image_means,
    likes_kept,
    train_labels,
) = preprocess_train(train_path, num_likes=10_000, max_num_likes=2145)


In [None]:
## TODO: save (as .csv) the values that preprocess_test needs as arguments in the submission
# (features_q10_q90, image_means, likes_kept), and load them in the test script that feeds the model.
# Also save the trained TensorFlow models with the submission:
# https://www.tensorflow.org/guide/saved_model

In [58]:
# NOTE(review): preprocess_test is run here on train_path (the Train directory), using the
# quantiles, image means and kept likes computed on that same data. Presumably a smoke test
# of the test pipeline; point it at the Test directory for real predictions (TODO confirm).
test_features = preprocess_test(train_path, features_q10_q90, image_means, likes_kept, max_num_likes=2145)


In [59]:
# Fraction of column labels that are identical, position by position, in the train and
# test feature tables (1.0 = same columns in the same order).
(train_features.columns == test_features.columns).mean()

1.0

In [60]:
# GENDER: hold out 20% of the training data as a validation set, keeping the same
# gender proportions in both parts.
# No random_state is passed, so every run produces a different split
# (pass e.g. random_state=42 to always obtain the same one).

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    shuffle=True,
    stratify=train_labels['gender'],
)


In [61]:
# Peek at the first rows of the training split (rows are indexed by userid)
x_train.head()

Unnamed: 0,WC,WPS,Sixltr,Dic,Numerals,funct,pronoun,ppron,i,we,...,2135,2136,2137,2138,2139,2140,2141,2142,2143,2144
279b998c643287bc9203863c1cc8b518,0.603448,0.004454,0.862509,0.805691,0.0,0.668603,0.721298,0.791359,0.866317,0.0,...,0,0,0,0,0,0,0,0,0,0
68e6966dc2e2ac6586fc80ac46a6c70f,1.0,0.202586,0.259391,1.141225,0.204633,1.181744,1.133808,1.051633,0.610747,0.381295,...,0,0,0,0,0,0,0,0,0,0
186df19f865ca34d27c916104eaec120,0.896552,0.126437,0.378455,0.939038,0.42471,0.78515,0.052257,0.227608,0.49017,0.395683,...,0,0,0,0,0,0,0,0,0,0
2e9e3b192d396ccd8e652df1e99f4143,0.931034,0.242672,0.286322,0.88779,0.208494,0.932437,0.65004,0.336143,0.12713,0.784173,...,0,0,0,0,0,0,0,0,0,0
33ac4a867d7b79b2a09042c7e7f6bfa0,0.637931,0.104885,0.71297,0.751128,1.15444,0.529086,0.471101,0.71549,0.619921,0.0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# GENDER MODEL
# Hyper-parameters for the gender classifier that uses an embedding of the likes.
# NOTE(review): not every value below is necessarily consumed by the model / fit cells;
# check where a value is actually read before tuning it.

batch_size=64
num_layers=1
dense_units=32
activation='tanh'
optimizer='sgd'
learning_rate=0.0005
l1_reg=0.005
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

# one weight per age group (4 groups)
age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
# Number of feature columns per modality in the feature tables:
# 91 text columns + 65 image columns = 156, the offset at which the likes columns start.
num_text_features = 91
num_image_features = 65 # includes the noface and multiface flags
num_like_pages = 10000


In [63]:
# Gender classifier: text and image features concatenated with a learned embedding
# of the liked pages. Reads its settings from the hyper-parameter cell above.

max_len = 2145  # length of the padded likes lists (max_num_likes used in preprocessing)

image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
likes_features = tf.keras.Input([max_len], dtype=tf.int32, name="likes_features")

# Every liked-page index is mapped to an 8-dimensional vector; the max_len vectors of a
# user are then flattened into one long vector.
likes_embedding_block = tf.keras.Sequential(name="likes_embedding_block")
likes_embedding_block.add(tf.keras.layers.Embedding(num_like_pages, 8, input_length=max_len))
likes_embedding_block.add(tf.keras.layers.Flatten())

condensed_likes = likes_embedding_block(likes_features)

# Shared trunk: concatenation of the three modalities followed by the dense layer(s)
dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation=activation,
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        ))
    if use_dropout:
        dense_layers.add(tf.keras.layers.Dropout(dropout_rate))

features = dense_layers([text_features, image_features, condensed_likes])

# Single sigmoid unit: probability of gender == 1
gender = tf.keras.layers.Dense(units=1, activation="sigmoid", name="gender")(features)

model_gender = tf.keras.Model(
    inputs=[text_features, image_features, likes_features],
    outputs=gender
)

# NOTE: the optimizer is Adam (the `optimizer` entry of the hyper-parameter cell is not used)
model_gender.compile(
    optimizer=tf.keras.optimizers.get({"class_name": 'ADAM',
                               "config": {"learning_rate": learning_rate}}),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Recall()]
)

# summary() prints the table itself and returns None, so it is not wrapped in print()
model_gender.summary()


Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
likes_features (InputLayer)     [(None, 2145)]       0                                            
__________________________________________________________________________________________________
text_features (InputLayer)      [(None, 91)]         0                                            
__________________________________________________________________________________________________
image_features (InputLayer)     [(None, 65)]         0                                            
__________________________________________________________________________________________________
likes_embedding_block (Sequenti (None, 17160)        80000       likes_features[0][0]             
___________________________________________________________________________________________

In [64]:
# Split the feature table back into its three modalities. Column layout:
# [0, num_text_features) text, then num_image_features image columns, then the likes.
text_end = num_text_features
image_end = num_text_features + num_image_features

x_train_txt = x_train.iloc[:, :text_end].values
x_train_img = x_train.iloc[:, text_end:image_end].values
x_train_lik = x_train.iloc[:, image_end:].values

# validation_split holds out a further 20% of x_train for monitoring during training;
# x_val stays untouched for the final evaluation.
history_gender = model_gender.fit(
    [x_train_txt, x_train_img, x_train_lik],
    y_train['gender'].values,
    shuffle=True,
    batch_size=batch_size,
    epochs=50,
    verbose=1,
    validation_split=0.2,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50


Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [65]:
# Evaluate the gender model on the held-out validation split.
# Column layout: [0, num_text_features) text, then num_image_features image columns,
# then the likes.
text_end = num_text_features
image_end = num_text_features + num_image_features

x_val_txt = x_val.iloc[:, :text_end].values
x_val_img = x_val.iloc[:, text_end:image_end].values
x_val_lik = x_val.iloc[:, image_end:].values

score_gender = model_gender.evaluate([x_val_txt, x_val_img, x_val_lik], y_val['gender'].values, verbose=1)

# Baseline for gender: accuracy obtained by always predicting the most frequent class
y_val['gender'].value_counts(normalize=True).max()




0.5773684210526315

In [115]:
# Saving / serializing Keras models:
# https://www.tensorflow.org/guide/keras/save_and_serialize

#!mkdir saved_models
# Shell escape: list the working directory (checks that saved_models exists)
!ls
#model_gender.save('saved_models/gender_model_embedding.h5')


NB_regression.ipynb	     model_tests_4.ipynb  x_train.csv
Preprocessing_pourIsa.ipynb  model_tests_5.ipynb  y_train.csv
model_tests_2.ipynb	     saved_models
model_tests_3.ipynb	     ttests_gender.csv


In [66]:
# Personality models: new train / validation split (20% validation), balanced on the
# age groups (train_labels['gender'] can be used for stratification instead).
# This overwrites x_train, x_val, y_train and y_val from the gender split.
# No random_state is passed, so every run produces a different split.

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    shuffle=True,
    stratify=train_labels['age_group'],
)


In [67]:
# Hyper-parameters for the personality models that use the text features only
# (these models are trained with an 'mse' loss, i.e. as regressors).
# NOTE(review): learning_rate is 0.00005 here, while the compile calls of the Ope and Con
# models pass a literal learning rate of 0.0005; likewise batch_size is 64 here, while
# their fit calls pass batch_size=32. Confirm which values are intended.

batch_size=64
num_layers=1
dense_units=8
activation='tanh'
optimizer='sgd'
learning_rate=0.00005
l1_reg=0.0025
l2_reg=0.005
#num_like_pages=5000
use_dropout=True
dropout_rate=0.1
use_batchnorm=False

gender_loss_weight = 1.0
age_loss_weight = 1.0

# one weight per age group (4 groups)
age_weights = [0.42100598, 0.98445596, 2.27817746, 5.88235294]
    
# Number of feature columns per modality in the feature tables
num_text_features = 91
num_image_features = 65 # includes the noface and multiface flags
num_like_pages = 10000


In [68]:
# Model predicting one personality trait ('ope') from the text features only.
# The other traits (ext, agr, neu, con) use the same head: a Dense(1, sigmoid) layer
# named "<trait>_sigmoid" followed by personality_scaling("<trait>").

# Ope model
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")

# Hidden block: num_layers x (regularized tanh Dense layer + Dropout)
hidden_layers = []
for i in range(num_layers):
    hidden_layers.append(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
    ))
    hidden_layers.append(tf.keras.layers.Dropout(dropout_rate))
dense_layers = tf.keras.Sequential(hidden_layers, name="dense_layers")

features = dense_layers(text_features)

def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Returns a layer that maps a sigmoid output (between 0 and 1) onto the
    'personality' score range (between 1 and 5) with x * 4 + 1.

    Arguments:
        name {str} -- the name to give to the layer.

    Returns:
        tf.keras.layers.Layer -- the scaling layer.
    """
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)

ope_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ope_sigmoid")(features)
ope = personality_scaling("ope")(ope_sigmoid)

model_ope = tf.keras.Model(inputs=text_features, outputs=ope)

# Adam with a literal learning rate of 0.0005 (the `optimizer` and `learning_rate`
# entries of the hyper-parameter cell are not read here)
adam_config = {"class_name": 'ADAM', "config": {"learning_rate": 0.0005}}
model_ope.compile(
    optimizer=tf.keras.optimizers.get(adam_config),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)


In [69]:
# Ope model: training on the text features (first 91 columns of the feature table)
x_txt = x_train.iloc[:, :91].values

# Target column is one of 'ope', 'con', 'ext', 'agr', 'neu'.
# validation_split holds out a further 20% of x_train for monitoring during training.
history_ope = model_ope.fit(
    x_txt,
    y_train['ope'].values,
    shuffle=True,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_split=0.2,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [70]:
# Ope model: evaluation on the validation split (text features = first 91 columns)
x_txt_v = x_val.iloc[:, :91].values

# Target column is one of 'ope', 'con', 'ext', 'agr', 'neu'
score_ope = model_ope.evaluate(x_txt_v, y_val['ope'].values, verbose=1)

# Baseline RMSE: always predict the mean 'ope' score of the training split
ope_errors = y_val['ope'] - y_train['ope'].mean()
ope_rmse = np.sqrt((ope_errors ** 2).mean())
ope_rmse




0.6235189956610291

In [135]:
# Save the trained Ope model to an HDF5 file inside the saved_models directory
model_ope.save('saved_models/ope_model.h5')


y_train.csv	   model_tests_5.ipynb	Preprocessing_pourIsa.ipynb
x_train.csv	   model_tests_4.ipynb	NB_regression.ipynb
ttests_gender.csv  model_tests_3.ipynb
saved_models	   model_tests_2.ipynb


In [71]:
# Model predicting one personality trait ('con') from the text features only.
# Same architecture as the Ope model cell above.

# Con model
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")

# Hidden block: num_layers x (regularized tanh Dense layer + Dropout)
dense_layers = tf.keras.Sequential(name="dense_layers")
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        ))

    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))

features = dense_layers(text_features)

# personality_scaling (sigmoid output x -> x * 4 + 1) is defined once, in the Ope model
# cell above; it is reused here instead of being defined a second time.
con_sigmoid = tf.keras.layers.Dense(units=1, activation="sigmoid", name="con_sigmoid")(features)
con = personality_scaling("con")(con_sigmoid)

model_con = tf.keras.Model(
    inputs=text_features,
    outputs=con
)

# Adam with a literal learning rate of 0.0005 (the `optimizer` and `learning_rate`
# entries of the hyper-parameter cell are not read here)
model_con.compile(
    optimizer=tf.keras.optimizers.get({"class_name": 'ADAM',
                               "config": {"learning_rate": 0.0005}}),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)


In [72]:
# Con model train
# Only the first 91 columns of x_train are fed to the model as its text input.
x_txt = x_train.iloc[:, :91].to_numpy()

# Regression target is the 'con' column; Keras sets aside 20% of the rows
# passed to fit() for validation.
con_targets = y_train['con'].to_numpy()
history_con = model_con.fit(
    x_txt,
    con_targets,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [73]:
# Con model test
# Score the model on the validation split, using the same 91 leading columns
# that were used for training.
x_txt_v = x_val.iloc[:, :91].to_numpy()
con_val_targets = y_val['con']

score_con = model_con.evaluate(x_txt_v, con_val_targets.to_numpy(), verbose=1)

# Baseline for comparison: RMSE of a constant predictor that always outputs
# the training-set mean of 'con'. Note that con_rmse holds this baseline,
# not the model's own error (the model's loss and RMSE are in score_con).
con_train_mean = y_train['con'].mean()
con_rmse = np.sqrt(np.mean((con_val_targets - con_train_mean) ** 2))
con_rmse




0.7135370266073664

In [125]:
# Persist the trained 'con' model as an HDF5 file.
# The target directory is created first so that the save does not depend on
# 'saved_models/' already existing in the working directory.
os.makedirs('saved_models', exist_ok=True)
model_con.save('saved_models/con_model.h5')


In [74]:
# Text-only model for a single personality trait: 'ext'

# Ext model
text_features = tf.keras.Input(
    [num_text_features], dtype=tf.float32, name="text_features"
)

# Hidden stack: num_layers repetitions of (regularised tanh Dense -> Dropout).
hidden_stack = []
for layer_idx in range(num_layers):
    hidden_stack.append(
        tf.keras.layers.Dense(
            units=dense_units,
            activation='tanh',
            kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        )
    )
    hidden_stack.append(tf.keras.layers.Dropout(dropout_rate))

dense_layers = tf.keras.Sequential(hidden_stack, name="dense_layers")
features = dense_layers(text_features)


def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Build a layer that maps a sigmoid output onto the 'personality' score range.

    The layer computes ``x * 4.0 + 1.0``, so a sigmoid activation in (0, 1)
    becomes a value in (1, 5).

    Arguments:
        name {str} -- the name to give to the layer.

    Returns:
        tf.keras.layers.Layer -- the Lambda layer performing the rescaling.
    """
    # Kept as an inline lambda so the Lambda layer is serialised the same way
    # when the model is saved.
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)


# Single output head: one sigmoid unit, rescaled to the score range.
ext_head = tf.keras.layers.Dense(units=1, activation="sigmoid", name="ext_sigmoid")
ext_sigmoid = ext_head(features)
ext = personality_scaling("ext")(ext_sigmoid)

model_ext = tf.keras.Model(inputs=text_features, outputs=ext)

# Adam with a fixed learning rate; MSE loss, RMSE reported as the metric.
adam_spec = {"class_name": 'ADAM', "config": {"learning_rate": 0.0005}}
model_ext.compile(
    optimizer=tf.keras.optimizers.get(adam_spec),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)



In [75]:
# Ext model train
# Only the first 91 columns of x_train are fed to the model as its text input.
x_txt = x_train.iloc[:, :91].to_numpy()

# Regression target is the 'ext' column; Keras sets aside 20% of the rows
# passed to fit() for validation.
ext_targets = y_train['ext'].to_numpy()
history_ext = model_ext.fit(
    x_txt,
    ext_targets,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [76]:
# Ext model test
# Score the model on the validation split, using the same 91 leading columns
# that were used for training.
x_txt_v = x_val.iloc[:, :91].to_numpy()
ext_val_targets = y_val['ext']

score_ext = model_ext.evaluate(x_txt_v, ext_val_targets.to_numpy(), verbose=1)

# Baseline for comparison: RMSE of a constant predictor that always outputs
# the training-set mean of 'ext'. Note that ext_rmse holds this baseline,
# not the model's own error (the model's loss and RMSE are in score_ext).
ext_train_mean = y_train['ext'].mean()
ext_rmse = np.sqrt(np.mean((ext_val_targets - ext_train_mean) ** 2))
ext_rmse




0.7998928590867689

In [124]:
# Persist the trained 'ext' model as an HDF5 file.
# The target directory is created first so that the save does not depend on
# 'saved_models/' already existing in the working directory.
os.makedirs('saved_models', exist_ok=True)
model_ext.save('saved_models/ext_model.h5')


In [77]:
# Text-only model for a single personality trait: 'agr'

# Agr model
text_features = tf.keras.Input(
    [num_text_features], dtype=tf.float32, name="text_features"
)

# Hidden stack: num_layers repetitions of (regularised tanh Dense -> Dropout).
hidden_stack = []
for layer_idx in range(num_layers):
    hidden_stack.append(
        tf.keras.layers.Dense(
            units=dense_units,
            activation='tanh',
            kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        )
    )
    hidden_stack.append(tf.keras.layers.Dropout(dropout_rate))

dense_layers = tf.keras.Sequential(hidden_stack, name="dense_layers")
features = dense_layers(text_features)


def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Build a layer that maps a sigmoid output onto the 'personality' score range.

    The layer computes ``x * 4.0 + 1.0``, so a sigmoid activation in (0, 1)
    becomes a value in (1, 5).

    Arguments:
        name {str} -- the name to give to the layer.

    Returns:
        tf.keras.layers.Layer -- the Lambda layer performing the rescaling.
    """
    # Kept as an inline lambda so the Lambda layer is serialised the same way
    # when the model is saved.
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)


# Single output head: one sigmoid unit, rescaled to the score range.
agr_head = tf.keras.layers.Dense(units=1, activation="sigmoid", name="agr_sigmoid")
agr_sigmoid = agr_head(features)
agr = personality_scaling("agr")(agr_sigmoid)

model_agr = tf.keras.Model(inputs=text_features, outputs=agr)

# Adam with a fixed learning rate; MSE loss, RMSE reported as the metric.
adam_spec = {"class_name": 'ADAM', "config": {"learning_rate": 0.0005}}
model_agr.compile(
    optimizer=tf.keras.optimizers.get(adam_spec),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)


In [78]:
# Agr model train
# Only the first 91 columns of x_train are fed to the model as its text input.
x_txt = x_train.iloc[:, :91].to_numpy()

# Regression target is the 'agr' column; Keras sets aside 20% of the rows
# passed to fit() for validation.
agr_targets = y_train['agr'].to_numpy()
history_agr = model_agr.fit(
    x_txt,
    agr_targets,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [79]:
# Agr model test
# Score the model on the validation split, using the same 91 leading columns
# that were used for training.
x_txt_v = x_val.iloc[:, :91].to_numpy()
agr_val_targets = y_val['agr']

score_agr = model_agr.evaluate(x_txt_v, agr_val_targets.to_numpy(), verbose=1)

# Baseline for comparison: RMSE of a constant predictor that always outputs
# the training-set mean of 'agr'. Note that agr_rmse holds this baseline,
# not the model's own error (the model's loss and RMSE are in score_agr).
agr_train_mean = y_train['agr'].mean()
agr_rmse = np.sqrt(np.mean((agr_val_targets - agr_train_mean) ** 2))
agr_rmse




0.6592729203387896

In [123]:
# Persist the trained 'agr' model as an HDF5 file.
# The target directory is created first so that the save does not depend on
# 'saved_models/' already existing in the working directory.
os.makedirs('saved_models', exist_ok=True)
model_agr.save('saved_models/agr_model.h5')


In [80]:
# Text-only model for a single personality trait: 'neu'

# Neu model
text_features = tf.keras.Input(
    [num_text_features], dtype=tf.float32, name="text_features"
)

# Hidden stack: num_layers repetitions of (regularised tanh Dense -> Dropout).
hidden_stack = []
for layer_idx in range(num_layers):
    hidden_stack.append(
        tf.keras.layers.Dense(
            units=dense_units,
            activation='tanh',
            kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        )
    )
    hidden_stack.append(tf.keras.layers.Dropout(dropout_rate))

dense_layers = tf.keras.Sequential(hidden_stack, name="dense_layers")
features = dense_layers(text_features)


def personality_scaling(name: str) -> tf.keras.layers.Layer:
    """Build a layer that maps a sigmoid output onto the 'personality' score range.

    The layer computes ``x * 4.0 + 1.0``, so a sigmoid activation in (0, 1)
    becomes a value in (1, 5).

    Arguments:
        name {str} -- the name to give to the layer.

    Returns:
        tf.keras.layers.Layer -- the Lambda layer performing the rescaling.
    """
    # Kept as an inline lambda so the Lambda layer is serialised the same way
    # when the model is saved.
    return tf.keras.layers.Lambda(lambda x: x * 4.0 + 1.0, name=name)


# Single output head: one sigmoid unit, rescaled to the score range.
neu_head = tf.keras.layers.Dense(units=1, activation="sigmoid", name="neu_sigmoid")
neu_sigmoid = neu_head(features)
neu = personality_scaling("neu")(neu_sigmoid)

model_neu = tf.keras.Model(inputs=text_features, outputs=neu)

# Adam with a fixed learning rate; MSE loss, RMSE reported as the metric.
adam_spec = {"class_name": 'ADAM', "config": {"learning_rate": 0.0005}}
model_neu.compile(
    optimizer=tf.keras.optimizers.get(adam_spec),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)



In [81]:
# Neu model train
# Only the first 91 columns of x_train are fed to the model as its text input.
x_txt = x_train.iloc[:, :91].to_numpy()

# Regression target is the 'neu' column; Keras sets aside 20% of the rows
# passed to fit() for validation.
neu_targets = y_train['neu'].to_numpy()
history_neu = model_neu.fit(
    x_txt,
    neu_targets,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [83]:
# Neu model test
# Score the model on the validation split, using the same 91 leading columns
# that were used for training.
x_txt_v = x_val.iloc[:, :91].to_numpy()
neu_val_targets = y_val['neu']

score_neu = model_neu.evaluate(x_txt_v, neu_val_targets.to_numpy(), verbose=1)

# Baseline for comparison: RMSE of a constant predictor that always outputs
# the training-set mean of 'neu'. Note that neu_rmse holds this baseline,
# not the model's own error (the model's loss and RMSE are in score_neu).
neu_train_mean = y_train['neu'].mean()
neu_rmse = np.sqrt(np.mean((neu_val_targets - neu_train_mean) ** 2))
neu_rmse




0.7774626662037993

In [122]:
# Persist the trained 'neu' model as an HDF5 file.
# The target directory is created first so that the save does not depend on
# 'saved_models/' already existing in the working directory.
os.makedirs('saved_models', exist_ok=True)
model_neu.save('saved_models/neu_model.h5')


In [99]:
# Train / validation split for the age-group classifier.
#
# The split is stratified on 'age_group' so both parts keep the same class
# proportions, with 20% of the rows going to the validation set.
# No random_state is passed, so every run produces a different split; pass
# e.g. random_state=42 to always obtain the same train/validation split.
# NOTE: this rebinds x_train, x_val, y_train and y_val, which the
# personality-model cells earlier in the notebook also read.
age_strata = train_labels['age_group']

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    shuffle=True,
    stratify=age_strata,
)


In [100]:
# Hyper-parameters for the age model with embedded likes

# --- network shape ---
num_layers = 2
dense_units = 64
activation = 'tanh'
use_batchnorm = False

# --- regularisation ---
l1_reg = 0.0025
l2_reg = 0.005
use_dropout = True
dropout_rate = 0.1

# --- optimisation ---
# NOTE(review): the age-model cells in this notebook hard-code Adam with
# learning_rate 0.0005, 'tanh' and batch_size 64 rather than reading
# optimizer / learning_rate / activation / batch_size from here -- confirm
# whether these values are consumed anywhere else.
batch_size = 64
optimizer = 'sgd'
learning_rate = 0.00005

# --- input widths (number of columns per feature family) ---
num_text_features = 91
num_image_features = 65  # added back noface and multiface


In [101]:
# Balanced class weights for the age categories, computed with scikit-learn:
# https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html

age_classes = np.unique(y_train['age_group'])

# a_weights[k] is the weight of age_classes[k] (same ordering as `classes`).
a_weights = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced',
    classes=age_classes,
    y=y_train['age_group'],
)

# Key every weight by its actual class label instead of by its position in
# the array, so the mapping stays correct even if the labels are not exactly
# 0..n-1. Labels and weights are converted to plain Python int / float.
age_weights_dict = {
    int(label): float(weight) for label, weight in zip(age_classes, a_weights)
}

print(age_weights_dict)


{0: 0.4189636163175303, 1: 0.9890681936491411, 2: 2.272727272727273, 3: 6.1688311688311686}


In [87]:
# Test model: likes only (embedded)
#
# Each input row is a sequence of max_len integer ids. Every id is looked up
# in an embedding table with 10000 entries of size 8, the embedded sequence
# is flattened, and a 4-way softmax predicts the age group.
max_len = 2145

model_test = tf.keras.Sequential()
model_test.add(tf.keras.layers.Embedding(10000, 8, input_length=max_len))
model_test.add(tf.keras.layers.Flatten())
model_test.add(tf.keras.layers.Dense(units=4, activation='softmax'))

model_test.compile(
    optimizer = tf.keras.optimizers.get({"class_name": 'ADAM',
                               "config": {"learning_rate": 0.0005}}),
    loss = 'categorical_crossentropy',
    # NOTE(review): 'acc' and 'categorical_accuracy' look redundant with a
    # categorical cross-entropy loss -- confirm both are needed.
    metrics = ['acc', 'categorical_accuracy'])

# summary() prints the table itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" to the output.
model_test.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 2145, 8)           80000     
_________________________________________________________________
flatten_8 (Flatten)          (None, 17160)             0         
_________________________________________________________________
dense_20 (Dense)             (None, 4)                 68644     
Total params: 148,644
Trainable params: 148,644
Non-trainable params: 0
_________________________________________________________________
None


In [89]:
# Train the likes-only model.
# Columns 156 onward of x_train are used as the likes input.
x_train_lik = x_train.iloc[:, 156:].to_numpy()

# Transform the age-group labels into one-hot vectors, as required by the
# categorical cross-entropy loss.
age_group_codes = y_train['age_group'].to_numpy()
y_train_age = tf.keras.utils.to_categorical(age_group_codes)

# age_weights_dict is not passed as class_weight here; Keras sets aside 20%
# of the rows passed to fit() for validation.
history_age_likes = model_test.fit(
    x_train_lik,
    y_train_age,
    batch_size=64,
    epochs=30,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)



Train on 6080 samples, validate on 1520 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [91]:
# Evaluate the likes-only model on the validation split.
x_val_lik = x_val.iloc[:, 156:].values

# One-hot encode the validation labels the same way as the training labels.
y_val_age = tf.keras.utils.to_categorical(y_val['age_group'].values)

score_age_likes = model_test.evaluate(x_val_lik, y_val_age, verbose=1)

# Baseline for age group: accuracy of always predicting the most frequent
# class. value_counts() sorts by descending count, so the first row
# (positional .iloc[0]) is the majority class; indexing with [0] would instead
# look up the class *labelled* 0, which is only right when that class happens
# to be the most frequent one.
y_val['age_group'].value_counts().iloc[0] / y_val.shape[0]




0.5968421052631578

In [108]:
# Age-category model: text and image features plus embedded likes.

max_len = 2145

# Three inputs: dense image and text feature vectors, and a sequence of
# max_len integer like ids.
image_features = tf.keras.Input([num_image_features], dtype=tf.float32, name="image_features")
text_features  = tf.keras.Input([num_text_features], dtype=tf.float32, name="text_features")
likes_features = tf.keras.Input([max_len], dtype=tf.int32, name="likes_features")

# Likes branch: embed every id (table of 10000 entries, 8 dimensions each)
# and flatten the result into a single vector of max_len * 8 values.
likes_embedding_block = tf.keras.Sequential(name="likes_embedding_block")
likes_embedding_block.add(tf.keras.layers.Embedding(10000, 8, input_length=max_len))
likes_embedding_block.add(tf.keras.layers.Flatten())

condensed_likes = likes_embedding_block(likes_features)

# Trunk: concatenate the three feature vectors, then num_layers repetitions
# of (regularised tanh Dense -> Dropout).
dense_layers = tf.keras.Sequential(name="dense_layers")
dense_layers.add(tf.keras.layers.Concatenate())
for i in range(num_layers):
    dense_layers.add(tf.keras.layers.Dense(
        units=dense_units,
        activation= 'tanh',
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1_reg, l2=l2_reg),
        ))

    dense_layers.add(tf.keras.layers.Dropout(dropout_rate))

features = dense_layers([text_features, image_features, condensed_likes])

# Output head: 4-way softmax over the age groups.
age_group = tf.keras.layers.Dense(units=4, activation="softmax", name="age_group")(features)

# The order of `inputs` here is the order in which arrays must be passed to
# fit() / evaluate(): text, image, likes.
model_age = tf.keras.Model(
    inputs=[text_features, image_features, likes_features],
    outputs= age_group
)

model_age.compile(
    optimizer = tf.keras.optimizers.get({"class_name": 'ADAM',
                               "config": {"learning_rate": 0.0005}}),
    loss = 'categorical_crossentropy',
    metrics = ['acc', 'categorical_accuracy']
)

# summary() prints the table itself and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" to the output.
model_age.summary()


Model: "model_22"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
likes_features (InputLayer)     [(None, 2145)]       0                                            
__________________________________________________________________________________________________
text_features (InputLayer)      [(None, 91)]         0                                            
__________________________________________________________________________________________________
image_features (InputLayer)     [(None, 65)]         0                                            
__________________________________________________________________________________________________
likes_embedding_block (Sequenti (None, 17160)        80000       likes_features[0][0]             
___________________________________________________________________________________________

In [109]:
# Slice x_train into the three inputs of model_age by column position:
# text = first 91 columns, image = next 65 columns, likes = the remainder.
x_train_txt = x_train.iloc[:, :91].to_numpy()
x_train_img = x_train.iloc[:, 91:156].to_numpy()
x_train_lik = x_train.iloc[:, 156:].to_numpy()

# One-hot encode the age-group labels for the categorical cross-entropy loss.
age_group_codes = y_train['age_group'].to_numpy()
y_train_age = tf.keras.utils.to_categorical(age_group_codes)

# Inputs are passed in the order text, image, likes. age_weights_dict is not
# passed as class_weight here; Keras sets aside 20% of the rows for validation.
age_model_inputs = [x_train_txt, x_train_img, x_train_lik]
history_age = model_age.fit(
    age_model_inputs,
    y_train_age,
    batch_size=64,
    epochs=30,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)


Train on 6080 samples, validate on 1520 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [110]:
# Slice x_val into the three inputs of model_age, using the same column
# ranges as for training (text, image, likes).
x_val_txt = x_val.iloc[:, :91].values
x_val_img = x_val.iloc[:, 91:156].values
x_val_lik = x_val.iloc[:, 156:].values

# One-hot encode the validation labels the same way as the training labels.
y_val_age = tf.keras.utils.to_categorical(y_val['age_group'].values)

score_age = model_age.evaluate([x_val_txt, x_val_img, x_val_lik], y_val_age, verbose=1)

# Baseline for age group: accuracy of always predicting the most frequent
# class. value_counts() sorts by descending count, so the first row
# (positional .iloc[0]) is the majority class; indexing with [0] would instead
# look up the class *labelled* 0, which is only right when that class happens
# to be the most frequent one.
y_val['age_group'].value_counts().iloc[0] / y_val.shape[0]




0.5968421052631578

In [121]:
# Persist the trained age-group model (with embedded likes) as an HDF5 file.
# The target directory is created first so that the save does not depend on
# 'saved_models/' already existing in the working directory.
os.makedirs('saved_models', exist_ok=True)
model_age.save('saved_models/age_model_embedding.h5')
