# Part 1: Environment Setup

In [15]:
import os
os.getcwd()
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from IPython.display import display
import pickle
from sklearn.model_selection import train_test_split

def reset_random_seed(seed=42):
    """Seed the random sources this notebook uses so runs are repeatable.

    Args:
        seed: Integer seed given to Python's ``random`` module and to NumPy's
            global generator, and exported as text in ``PYTHONHASHSEED``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter
    # starts, so setting it here most likely only reaches child processes --
    # confirm this is the intended effect.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


# Seed everything once with the default seed before any data is touched.
reset_random_seed()

In [16]:
# Locations of the two raw corpora, relative to the current working directory.
essays_path = "./raw/essays.csv"
mbti_path = "./raw/mbti.csv"

In [17]:
# Locations of the per-corpus feature tables, relative to the working directory.
# NOTE(review): the file names suggest labelled Mairesse-style psycholinguistic
# features -- confirm against the files themselves.
essays_psychofeature_path = "./psychofeature/essays_mairesse_labeled.csv"
mbti_psychofeature_path = "./psychofeature/mbti_mairesse_labeled.csv"

# Part 2: Load dataset

In [18]:
# Read both raw corpora into DataFrames. The essays file is decoded as
# ISO-8859-1 rather than with the pandas default encoding.
essays_dataset = pd.read_csv(essays_path, encoding="iso-8859-1")
mbti_dataset = pd.read_csv(mbti_path)

In [19]:
# Read the per-corpus feature tables with the default pandas CSV settings.
essays_psychofeature = pd.read_csv(essays_psychofeature_path)
mbti_psychofeature = pd.read_csv(mbti_psychofeature_path)

# Part 3: Organise dataset

In [20]:
# Rename the seven raw columns positionally to short identifiers; the five
# label columns get the single-letter trait codes used below.
essays_dataset.columns = ["user", "text", "E", "N", "A", "C", "O"]

# Bit order of the combined personality type, most significant trait first.
essays_traits = ["O", "C", "E", "A", "N"]

# Recode the 'y'/'n' labels as the strings '1'/'0'.
for dim in essays_traits:
    essays_dataset[dim] = essays_dataset[dim].map({"y": "1", "n": "0"})

# Any label other than 'y'/'n' was turned into NaN by the mapping above.
# Fail here with a readable message instead of deep inside the bit conversion.
essays_unmapped = essays_dataset[essays_traits].isna().any()
if essays_unmapped.any():
    raise ValueError(
        "essays labels must be 'y' or 'n'; unexpected values in column(s): "
        + ", ".join(essays_unmapped[essays_unmapped].index)
    )

# ptype is the decimal string of the five-bit number O C E A N
# (for example bits "10110" give "22"). Computed for all rows at once as a
# weighted sum, which also works when the frame has no rows.
essays_bit_weights = 2 ** np.arange(len(essays_traits) - 1, -1, -1)  # 16, 8, 4, 2, 1
essays_dataset["ptype"] = (
    essays_dataset[essays_traits].astype(int).dot(essays_bit_weights).astype(str)
)

# Keep only the label columns, the combined type and the text. The "user"
# column is dropped here, so it is not rewritten beforehand.
essays_dataset = essays_dataset[["O", "C", "E", "A", "N", "ptype", "text"]]

In [21]:
# Name the three raw columns positionally.
mbti_dataset.columns = ["user", "type", "text"]

# Derive a 0/1 flag per trait from the letters of the MBTI type string.
# For each trait the first letter codes 0 and the second codes 1; np.select
# takes the first matching condition and falls back to its default of 0 when
# neither letter occurs.
mbti_letter_pairs = {
    "O": ("S", "N"),
    "C": ("P", "J"),
    "E": ("I", "E"),
    "A": ("T", "F"),
}
mbti_type_contains = mbti_dataset["type"].str.contains
for trait, (zero_letter, one_letter) in mbti_letter_pairs.items():
    mbti_dataset[trait] = np.select(
        [mbti_type_contains(zero_letter), mbti_type_contains(one_letter)],
        [0, 1],
    )

# ptype is the decimal string of the four-bit number O C E A
# (for example bits "1101" give "13"). Computed for all rows at once as a
# weighted sum, which also works when the frame has no rows.
mbti_traits = list(mbti_letter_pairs)
mbti_bit_weights = 2 ** np.arange(len(mbti_traits) - 1, -1, -1)  # 8, 4, 2, 1
mbti_dataset["ptype"] = mbti_dataset[mbti_traits].dot(mbti_bit_weights).astype(str)

# Keep only the trait flags, the combined type and the text. The "user" and
# "type" columns are dropped here, so "user" is not rewritten beforehand.
mbti_dataset = mbti_dataset[["O", "C", "E", "A", "ptype", "text"]]

# Part 5: Dataset Splitting

In [22]:

def split(dataset, *, test_size=0.2, random_state=42):
    """Split ``dataset`` into two parts, stratified on its ``ptype`` column.

    Args:
        dataset: DataFrame that has a ``ptype`` column; the class proportions
            of that column are preserved in both parts.
        test_size: Passed to ``train_test_split``; the default holds out 20 %
            of the rows for the second part.
        random_state: Passed to ``train_test_split`` so that the same input
            always yields the same split.

    Returns:
        The two-element result of ``train_test_split``: the larger part first,
        the held-out part second.
    """
    return train_test_split(
        dataset,
        test_size=test_size,
        random_state=random_state,
        stratify=dataset["ptype"],
    )

# Two-stage split per corpus: first hold out the evaluation set, then split
# what remains into the training and validation sets.
essays_otrainset, essays_evaluationset = split(essays_dataset)
essays_trainset, essays_validationset = split(essays_otrainset)

mbti_otrainset, mbti_evaluationset = split(mbti_dataset)
mbti_trainset, mbti_validationset = split(mbti_otrainset)

In [23]:
# Save every split as CSV, without the index column.
# to_csv does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs("./working", exist_ok=True)

split_exports = [
    (essays_trainset, "./working/essays_trainset.csv"),
    (essays_validationset, "./working/essays_validationset.csv"),
    (essays_evaluationset, "./working/essays_evaluationset.csv"),
    (mbti_trainset, "./working/mbti_trainset.csv"),
    (mbti_validationset, "./working/mbti_validationset.csv"),
    (mbti_evaluationset, "./working/mbti_evaluationset.csv"),
]
for split_frame, csv_path in split_exports:
    split_frame.to_csv(csv_path, index=False)