# Python Code for Author Profiling (Machine Learning)

First, let's import the classes and libraries needed for the project and define the helpers that load the dataset; a Word2Vec model is then created from the tokenized documents:

In [None]:
from gensim.models.word2vec import Word2Vec
import xml.etree.ElementTree as ET
import os
import re
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from collections import Counter, defaultdict
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, cross_val_score
from multiprocessing import cpu_count

class User:
    """One author of the dataset together with the texts attributed to them.

    Attributes are plain and public:
      * ``user_id``   -- identifier passed to the constructor.
      * ``documents`` -- list of documents in the order they were added.
    """

    def __init__(self, user_id):
        # Start with no documents; they are added one by one afterwards.
        self.documents = list()
        self.user_id = user_id

    def add_document(self, text):
        """Append ``text`` to this user's documents (no validation is done)."""
        self.documents += [text]

    def get_documents(self):
        """Return the internal document list itself (not a copy)."""
        return self.documents


def load_users(path):
    """Load every user XML file found in ``path`` into ``User`` objects.

    Each ``*.xml`` file is parsed; the root element's ``id`` attribute
    becomes the user id and the text of every child element of the root is
    added as one document.

    Args:
        path: Directory holding the dataset (with or without a trailing
            path separator).

    Returns:
        A list of ``User`` objects, in the order ``os.listdir`` yields the
        files.

    Raises:
        KeyError: If a root element has no ``id`` attribute.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    users = []
    for xml_filename in os.listdir(path):
        # Only user files are XML. Filtering by extension skips 'truth.txt'
        # (without failing when it is absent) and any other stray file.
        if not xml_filename.lower().endswith('.xml'):
            continue

        root = ET.parse(os.path.join(path, xml_filename)).getroot()
        user = User(root.attrib['id'])

        # add tweets to user
        for child in root:
            # An empty element has text None; store '' instead so later
            # string processing of the documents does not fail on it.
            user.add_document(child.text or '')

        users.append(user)

    return users

def get_users_truth(path):
    """Read ``truth.txt`` from ``path`` and return the labels of every user.

    Each non-blank line is split on ``':::'``. The first field is the user
    id and the following seven fields are stored, as strings, as that
    user's labels.

    Args:
        path: Directory containing ``truth.txt`` (with or without a
            trailing path separator).

    Returns:
        dict mapping user id -> list of the seven label strings.

    Raises:
        ValueError: If a non-blank line has fewer than eight fields.
    """
    users_truth = {}
    # The context manager guarantees the file is closed even on errors.
    with open(os.path.join(path, 'truth.txt'), 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Strip the line terminator so the last label is clean, and
            # tolerate blank lines (e.g. a trailing newline at end of file).
            line = line.strip()
            if not line:
                continue
            toks = line.split(':::')
            if len(toks) < 8:
                raise ValueError(
                    'truth.txt line %d has %d fields, expected at least 8'
                    % (line_no, len(toks)))
            users_truth[toks[0]] = toks[1:8]

    return users_truth


# `path` must hold the directory where the dataset is saved
# (the value below ends with '/').
# `users` holds the Twitter posts of every author and `truth` the labels of the samples.
# The whole dataset is tokenized later on to create the Word2Vec embeddings.

path = '/working directory/pan15-author-profiling-training-dataset-english-2015-03-02/'
users = load_users(path)
truth = get_users_truth(path)

def get_all_documents(users):
    """Tokenize every user's documents into one flat word list per user.

    Words are the matches of the regular expression ``\\w+``; case is left
    untouched.

    Args:
        users: Iterable of objects exposing ``get_documents()``, which
            returns an iterable of strings (``None`` entries are ignored).

    Returns:
        dict mapping each user object -> list of word tokens taken from all
        of that user's documents, in document order.
    """
    find_words = re.compile(r"\w+").findall
    docs = {}

    for user in users:
        tokens = []
        for text in user.get_documents():
            # Empty or missing documents contribute no tokens; skipping them
            # also avoids a TypeError from the regex on None.
            if text:
                # extend() grows the list in place instead of copying it on
                # every document.
                tokens.extend(find_words(text))

        docs[user] = tokens

    return docs


# NOTE(review): this module-level dict is not read anywhere in the visible
# code (get_all_documents builds its own local `docs`) -- confirm before removing.
docs = {}


def get_tokens_per_gender(users):
    """Collect lower-cased tokens per gender and per user.

    Reads the module-level ``truth`` mapping: a user whose first label is
    ``'M'`` contributes to the male pool, every other user to the female
    pool.

    Args:
        users: Users passed on to ``get_all_documents``.

    Returns:
        A three-element list ``[male_tokens, female_tokens, per_user]``
        where the first two are flat token lists and ``per_user`` holds one
        lower-cased token list per user.
    """
    male_tokens, female_tokens, per_user = [], [], []

    for author, words in get_all_documents(users).items():
        lowered = [word.lower() for word in words]
        per_user.append(lowered)

        is_male = truth[author.user_id][0] == 'M'
        pool = male_tokens if is_male else female_tokens
        # Copies the tokens into the pool; the per-user list stays separate.
        pool.extend(lowered)

    return [male_tokens, female_tokens, per_user]


def get_tokens(users):
    """Return the lower-cased token list of every user.

    Args:
        users: Users passed on to ``get_all_documents``.

    Returns:
        A one-element list whose only item is the list of per-user token
        lists (one lower-cased token list per user).
    """
    tokens_by_user = get_all_documents(users)
    lowered_per_user = [
        [word.lower() for word in words]
        for words in tokens_by_user.values()
    ]
    return [lowered_per_user]

def get_trgts(users, truth, trgt):
    """Extract one target label per user.

    Args:
        users: Iterable of objects with a ``user_id`` attribute.
        truth: Mapping user id -> sequence of labels.
        trgt: Index of the label to extract. Indices greater than 1 are
            converted with ``float``; indices 0 and 1 are returned as stored.

    Returns:
        A one-element list containing the list of labels, in user order.
    """
    raw_values = (truth[person.user_id][trgt] for person in users)

    if trgt > 1:
        return [[float(value) for value in raw_values]]
    return [list(raw_values)]


Next, the Word2Vec model is created and the classes that vectorize the documents of the samples are defined.
In "y_raw = get_trgts(users, truth, 1)", the 1 selects the 2nd label; labels are drawn from the "truth" array, indexed from 0 to 6:
0 - Gender, 1 - Age range, 2-6 - Personality traits (float values): 2-(E), 3-(S), 4-(A), 5-(C), 6-(O).

In [None]:
class MeanEmbeddings(object):
    """Transformer that represents a document by the mean of its word vectors.

    Args:
        word2vec: Non-empty mapping word -> vector. All vectors are assumed
            to share the length of the first one.

    Raises:
        ValueError: If ``word2vec`` is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Peek at a single vector to learn the dimension, without copying
        # every value into a temporary list.
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            raise ValueError("word2vec mapping is empty")
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``y`` is accepted but ignored."""
        return self

    def transform(self, X):
        """Average the known word vectors of every document.

        Args:
            X: Iterable of documents, each an iterable of word tokens.

        Returns:
            numpy array with one row per document. A document without any
            known word is represented by a zero vector of length ``dim``.
        """
        return np.array([
            # `or` falls back to a single zero vector when the list of known
            # word vectors is empty, so np.mean never sees an empty input.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


class TfidfEmbeddings(object):
    """Transformer that averages word vectors weighted by their IDF.

    Args:
        word2vec: Non-empty mapping word -> vector. All vectors are assumed
            to share the length of the first one.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled by fit(): word -> IDF weight.
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the tokenized documents ``X``.

        Args:
            X: Iterable of documents, each an iterable of word tokens.
            y: Accepted but ignored.

        Returns:
            ``self``.
        """
        # Documents are already tokenized, so the analyzer returns them as is.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # Read the IDF array once instead of once per vocabulary entry.
        idf = tfidf.idf_
        # A word never seen during fit must be at least as infrequent as any
        # known word, so it defaults to the largest IDF observed.
        max_idf = max(idf)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, idf[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return the mean of the IDF-weighted word vectors of each document.

        Args:
            X: Iterable of documents, each an iterable of word tokens.

        Returns:
            numpy array with one row per document. A document without any
            word known to ``word2vec`` is represented by a zero vector of
            length ``dim``.
        """
        return np.array([
            # `or` falls back to a single zero vector when no word is known.
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])



# `all_text` holds one lower-cased token list per user; `m` and `f` (the
# per-gender token pools) are not used again in the visible code.
m, f, all_text = get_tokens_per_gender(users)
y_raw = get_trgts(users, truth, 1) # get each target one at a time to create different classifiers
y = y_raw[0]
X = all_text


# size : embedding dimension
# min_count : ignore words that appear fewer than this many times
# sg : 0 for CBOW; 1 for skip-gram
# NOTE(review): `size`, `index2word` and `syn0` look like pre-4.0 gensim names -- confirm the installed version.
model = Word2Vec(all_text, size=500, min_count=30, sg=1, workers=cpu_count(), window=10) # parameters found through a grid search
# Plain dict word -> embedding vector, the form expected by the embedding transformers.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))


Next, the models are defined, each as a pipeline that feeds the Word2Vec embeddings to a learning algorithm.
There are 2 types of algorithms, Classifiers (Cs) and Regressors (Rs): Cs are used for labels 0 and 1, and Rs for labels 2-6. For the Cs, a One-vs-the-Rest strategy is used to tackle multiclass problems. A grid search is performed for the SVM classifier.
Note: this code must be executed once for each label.

Let's see the Cs first:

In [None]:
# Classifier pipelines: every pipeline first turns the tokenized documents
# into TF-IDF weighted Word2Vec averages and then applies a One-vs-Rest
# classifier.

# Random forest
OVR_RF = OneVsRestClassifier(RandomForestClassifier(n_estimators=200))
CRnd_Frst_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Random Forest Cl tfidf", OVR_RF)])

# Extremely randomized trees
OVR_ET = OneVsRestClassifier(ExtraTreesClassifier(n_estimators=200))
CExt_Trees_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Extra Trees Cl tfidf", OVR_ET)])

# SVM: the grid search is wrapped by One-vs-Rest, so it is run separately
# for every binary sub-problem.
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':[0.5, 0.9, 1, 10], 'degree':[1, 3]}
svm = SVC()
clf = GridSearchCV(svm, parameters)
clf = OneVsRestClassifier(clf)
CSVM_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("SVM tfidf", clf)])

# Gaussian naive Bayes. The step used to be labelled "SVM tfidf", a
# copy-paste leftover from the SVM pipeline above.
OVR_GNB = OneVsRestClassifier(GaussianNB())
CGNB_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Gaussian NB tfidf", OVR_GNB)])

Next the Rs:


In [None]:
# Regressor pipelines: every pipeline first turns the tokenized documents
# into TF-IDF weighted Word2Vec averages and then applies a regressor.

# Random forest
RRnd_Frst_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Random Forest Rg tfidf", RandomForestRegressor(n_estimators=100))])

# Extremely randomized trees. `max_features='auto'` was dropped: for this
# regressor it meant "use all features", which is also the default, and
# recent scikit-learn releases no longer accept the string 'auto'.
RExt_Trees_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Extra Trees Rg tfidf", ExtraTreesRegressor(n_estimators=100))])

# Support vector regression tuned with a grid search.
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':[0.5, 0.9, 1, 10], 'degree':[1, 3], 'epsilon':[0.1, 0.2, 0.3]}
svr = SVR()
svr = GridSearchCV(svr, parameters)
RSVR_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Support Vector Rg tfidf", svr)])

# Bayesian ridge regression
RBay_Rdg_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddings(w2v)),
                        ("Bayesian Ridge Rg tfidf", linear_model.BayesianRidge())])

Finally, the algorithms can be evaluated on the dataset. A 10-fold cross-validation was performed, so every sample is used for testing exactly once.

In [None]:
#Classifiers

# Mean accuracy over the cross-validation folds, one line per classifier
# pipeline (random forest, extra trees, SVM, Gaussian naive Bayes).
# `y` must hold the labels of a classification target (index 0 or 1).
scoring = 'accuracy'
folds = 10

print(cross_val_score(CRnd_Frst_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())
print(cross_val_score(CExt_Trees_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())
print(cross_val_score(CSVM_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())
print(cross_val_score(CGNB_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())



In [None]:
#Regressors

# RMSE of every regressor pipeline: the scorer returns the negated MSE of
# each fold, so the mean over the folds is made positive before taking the
# square root.
# `y` must hold the values of a regression target (index 2 to 6).
scoring = 'neg_mean_squared_error'
folds = 10

# np.sqrt replaces np.math.sqrt: `np.math` was only an alias of the
# standard `math` module and is gone in NumPy 2.0.
print(np.sqrt(abs(cross_val_score(RRnd_Frst_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())))
print(np.sqrt(abs(cross_val_score(RExt_Trees_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())))
print(np.sqrt(abs(cross_val_score(RSVR_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())))
print(np.sqrt(abs(cross_val_score(RBay_Rdg_w2v_tfidf, X, y, cv=folds, scoring=scoring).mean())))


For the Recurrent Neural Network (RNN-LSTM) a few modifications to the code above were made. First the Word2Vec model was not averaged to represent each document from the samples. The RNN approach used each document converted word by word to their Word2Vec numerical vector and then partitioned into Train and Test slices, using the same 10-fold cross validation.


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.wrappers.scikit_learn import KerasRegressor
from numpy.ma import zeros

# Seed NumPy's global random generator with a fixed value.
seed = 7
np.random.seed(seed)


Sequential models in Keras only accept numeric values, so the labels of the classification sub-problems must be converted to numbers (0/1) for the One-vs-the-Rest strategy:


In [None]:
def get_trgts_num(users, truth, trgt, positive_age_group='50-XX'):
    """Extract one numeric target per user.

    Args:
        users: Iterable of objects with a ``user_id`` attribute.
        truth: Mapping user id -> sequence of labels.
        trgt: Index of the label to extract.
            * greater than 1: the label is converted with ``float``;
            * 0 (gender): ``'M'`` becomes 0, anything else 1;
            * otherwise (age range): the label becomes 1 when it equals
              ``positive_age_group`` and 0 for the other age groups
              (one-vs-rest encoding).
        positive_age_group: Age group encoded as 1. Defaults to
            ``'50-XX'``, the group the function previously hard-coded.

    Returns:
        A one-element list containing the list of numeric labels, one per
        user and in user order.

    Raises:
        ValueError: If ``positive_age_group`` or a user's age label is not
            one of the known age groups. Skipping such a user instead would
            silently shift every following label onto the wrong user.
    """
    age_groups = ('18-24', '25-34', '35-49', '50-XX')

    labels = []
    for user in users:
        value = truth[user.user_id][trgt]
        if trgt > 1:
            labels.append(float(value))
        elif trgt == 0:
            labels.append(0 if value == 'M' else 1)
        else:
            if positive_age_group not in age_groups:
                raise ValueError(
                    'unknown positive age group: %r' % (positive_age_group,))
            if value not in age_groups:
                raise ValueError(
                    'unknown age group %r for user %r' % (value, user.user_id))
            labels.append(1 if value == positive_age_group else 0)
    return [labels]



LSTM models in Keras only accept fixed-length inputs:


In [None]:
# Word2Vec number of features
# Word2Vec number of features
num_features = 500
# Limit each document to a fixed number of words
document_max_num_words = 100
num_categories = 1 # 2, 3, etc.

# NOTE(review): these two dicts are not used in the visible code -- confirm before removing.
document_X = {}
document_Y = {}

w2v_model = Word2Vec(all_text, size=num_features, min_count=5, sg=1, window=10, workers=cpu_count())
w2v_model.init_sims(replace=True)
num_samples = len(all_text)

# One (words x features) matrix per document, and one label row per document.
X = zeros(shape=(num_samples, document_max_num_words, num_features)).astype('float32')
Y = zeros(shape=(num_samples, num_categories)).astype('float32')

empty_word = zeros(num_features).astype('float32')


for idx, document in enumerate(all_text):
    # Only the first `document_max_num_words` words of a document are kept.
    for jdx, word in enumerate(document[:document_max_num_words]):
        # Vectors are looked up through `.wv`, like the first Word2Vec model
        # in this file. Positions of unknown words, and the padding after a
        # short document, keep the zeros X was created with.
        if word in w2v_model.wv:
            X[idx, jdx, :] = w2v_model.wv[word]


# `labels` was never defined at module level, so the loop below raised a
# NameError. Build the numeric labels here.
# NOTE(review): index 1 mirrors the target used for `y` earlier -- set
# `target_index` to the label being modelled (this code runs once per label).
target_index = 1
labels = get_trgts_num(users, truth, target_index)[0]
for idx, key in enumerate(labels):
    Y[idx, :] = key
    

Two types of LSTM models need to be defined in order to solve the Classification and Regression problems.
The Classification sub-problem is handled by this model:


In [None]:
# 10-fold cross-validation of the LSTM classifier: a fresh network is built,
# trained and evaluated for every fold, and the fold accuracies are averaged.
kf = KFold(n_splits=10) 
cvscores = []
for train, test in kf.split(X, Y):
    # NOTE: `model` rebinds the module-level name that held the first
    # Word2Vec model.
    model = Sequential()
    # The LSTM layer has 1.5 x document_max_num_words units.
    model.add(LSTM(int(document_max_num_words*1.5), input_shape=(document_max_num_words, num_features)))
    model.add(Dropout(0.35))
    model.add(Dense(num_categories))

    # Sigmoid output with binary cross-entropy loss.
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adamax', metrics=['accuracy'])

    # Train model
    model.fit(X[train], Y[train], epochs=10, batch_size=16, verbose=0)

    # evaluate() returns the loss followed by the compiled metrics (accuracy).
    score, acc = model.evaluate(X[test], Y[test], verbose=0)
    cvscores.append(acc)

# Mean accuracy over the folds.
print(np.mean(cvscores))


And finally the LSTM model for regression is as follows:


In [None]:
# define base model
def baseline_model():
    """Build and compile the LSTM network used for the regression targets.

    The network reads the module-level ``document_max_num_words`` and
    ``num_features``: an LSTM layer with 1.5 x ``document_max_num_words``
    units, a dropout of 0.35 and a single-unit dense output, compiled with
    mean squared error loss and the Adamax optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    lstm_units = int(document_max_num_words * 1.5)
    layers = [
        LSTM(lstm_units, input_shape=(document_max_num_words, num_features)),
        Dropout(0.35),
        Dense(1),
    ]

    network = Sequential(layers)
    network.compile(optimizer="adamax", loss="mean_squared_error")
    return network


# Wrap the Keras model so scikit-learn can cross-validate it. `epochs` is
# the keyword model.fit() is called with in the classification loop of this
# file.
# NOTE(review): the previous `nb_epoch` looks like the Keras 1 spelling --
# confirm against the installed Keras version.
estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=16, verbose=0)


# `random_state` was dropped: without shuffle=True it has no effect, and
# current scikit-learn raises a ValueError for that combination. The folds
# stay unshuffled, like in the classification loop.
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X, Y, cv=kfold)
rmse = 0.0
for x in results:
    # np.sqrt replaces np.math.sqrt (`np.math` is gone in NumPy 2.0).
    fold_rmse = np.sqrt(abs(x))
    print(fold_rmse)
    rmse += fold_rmse
# Average of the per-fold RMSE values.
rmse /= len(results)
print("Results: %.4f (%.4f) MSE" % (results.mean(), results.std()))
print("Results: %.4f RMSE" % (rmse))

