In [None]:
import pandas as pd
import numpy as np
import os # DEBUG
from glob import glob
from pandas_profiling import ProfileReport
import yaml
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import category_encoders
%matplotlib inline

In [None]:
# Working directory of the kernel and its parent directory.
PATH_ABS_SRC = os.getcwd()
PATH_REL = os.path.split(os.getcwd())[0]

# Mapping File

In [None]:
# Parse naming.yaml with the safe loader; `naming` holds the lookup tables
# that the per-column cells below index by key.
with open("naming.yaml") as stream:
    naming = yaml.load(stream, Loader=yaml.SafeLoader)

In [None]:
# Raw profile data; the per-column cells below each work on their own copy of it.
df = pd.read_csv(filepath_or_buffer='../data/profiles_revised.csv')

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Build the profiling report for the raw data, render it inline in the
# notebook and also save it as a standalone HTML file.
profile = ProfileReport(df, title='Pandas Profiling Report')  # title typo fixed ("Profilign")
#profile.to_widgets() # does not show anything
profile.to_notebook_iframe()
profile.to_file("pandas_profiling_data_report.html")

In [None]:
def print_col_values(list, filename):
    """Write every element of ``list`` on its own line to ``<filename>.txt``.

    Each element is rendered with ``str.format`` and followed by a newline.
    The target file is opened in write mode, so existing content is replaced.
    The parameter is named ``list`` because callers pass it by keyword.
    """
    target_path = r'{}.txt'.format(filename)
    with open(target_path, 'w') as fp:
        fp.writelines("{}\n".format(element) for element in list)

In [None]:
# Independent (deep) copy of the raw frame.
df_master = df.copy(deep=True)

# Zodiac Sign

In [None]:
# Extract Col: dump the distinct raw values of `sign` to zodiacs.txt for inspection
zodiacs = df.sign.unique()
print_col_values(list=zodiacs, filename='zodiacs')

# HTML entity that the raw data contains in place of an apostrophe (')
ZODIAC_STRING_REPLACMENT = '&rsquo;'

# Clean
zodiacs = [z for z in zodiacs if pd.notna(z)] # remove nan values (missing signs)
zodiacs = [v.replace(ZODIAC_STRING_REPLACMENT, '\'') for v in zodiacs] # replace the entity with '

# Check: dump the cleaned values to zodiacs-cleaned.txt
print_col_values(list=zodiacs, filename='zodiacs-cleaned')



In [None]:
# copy by value, keeping only the rows that have a sign
df_zodiac = df.dropna(subset=['sign']).copy()  # expected shape (48890, 19), same as profiler

# split the raw value once; the first token is the sign, the rest is the modifier
sign_tokens = df_zodiac['sign'].str.split(' ')

# extract only sign
df_zodiac['sign-extracted'] = sign_tokens.str[0]

# extract sign modifier: re-join the remaining tokens ('' when there are none)
# and replace the HTML entity with an apostrophe. regex=False makes the
# replacement a literal one regardless of the pandas version's default.
df_zodiac['sign-modifier-extracted'] = (
    sign_tokens.str[1:]
    .str.join(' ')
    .str.replace(ZODIAC_STRING_REPLACMENT, '\'', regex=False)
)

# map sign modifier + ordinal classifier
# modifiers without an entry in the mapping keep their original string
mapper_naming_dict = naming['zodiac_hierarchy']
df_zodiac['sign-modifier-extracted-ordinal'] = (
    df_zodiac['sign-modifier-extracted']
    .map(mapper_naming_dict)
    .fillna(df_zodiac['sign-modifier-extracted'])
)

# encode signs as integer labels
sign_encoder = LabelEncoder()
encoded_col_sign = sign_encoder.fit_transform(df_zodiac['sign-extracted'])
df_zodiac['sign-extracted-categorical'] = encoded_col_sign
df_zodiac

# Languages

In [None]:
# Extract Col: dump the distinct raw values of `speaks` to languages.txt
lng = df['speaks'].unique()
print_col_values(filename='languages', list=lng)

In [None]:
# copy by value, keeping only the rows that list at least one language
df_languages = df.dropna(subset=['speaks']).copy()  # expected shape (59896, 19), same as profiler

df_languages['spoken_languages'] = np.nan
# Raw structure of `speaks`: "language (level), language2 (level), ..." or "language, language2, ...".
# Persons and languages are in an n:m relationship, so the languages are one-hot encoded.
# To not overload the main dataframe, the encoding is kept in a separate frame that uses
# the index of the main dataframe as ID. Intended layout:

########################################################
#  ID #  english  #  italian  #  spanish  # ... other # number of languages spoken
#  1       1           0           0         0    1           2
#  2       1           1           1         1    1           5
#  3       0           1           1         0    1           3
#  4       1           0           0         0    0           1
#  5       1           0           1         0    1           3
#
# NOTE: the "number of languages spoken" column is not computed in this cell.
#
# extract language without level => split by comma, strip, split by space, use first token
#

def _language_names(speaks):
    """Return the language names of one `speaks` value, dropping the level suffix."""
    return [entry.strip().split(' ')[0] for entry in speaks.split(',')]

# allLanguages: one list of language names per person, e.g. [english, french, spanish]
allLanguages = [_language_names(speaks) for speaks in df_languages['speaks']]
# allIds: the person's index repeated once per language, e.g. [6, 6, 6]
allIds = [[user_id] * len(names) for user_id, names in zip(df_languages.index, allLanguages)]
# distinct language names, sorted
languageColumns = sorted({name for names in allLanguages for name in names})

df_languages['language'] = allLanguages

# long format: one row per (person, language) pair => df goes from 60k rows to 110k
encoded_df_languages = pd.DataFrame({
    'language': np.concatenate(allLanguages),
    'userID': np.concatenate(allIds),
})

# creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

# perform one-hot encoding on 'language' column; the column labels are taken from the
# fitted encoder so they always match the order of the encoded columns
encoder_df = pd.DataFrame(
    encoder.fit_transform(encoded_df_languages[['language']]).toarray(),
    columns=encoder.categories_[0],
)

# join the new encoded df with the language one and drop the raw language column
encoded_df_languages = encoded_df_languages.join(encoder_df).drop(columns=['language'])

# group by user so that all languages of a person are in one row => back to 60k rows;
# the result is stored (not only displayed) so later cells can use it
encoded_df_languages_per_user = encoded_df_languages.groupby('userID').sum()
encoded_df_languages_per_user

# Body Type

In [None]:
# Extract Col: dump the distinct raw values of `body_type` to body_type.txt
body_type = pd.unique(df['body_type'])
print_col_values(filename='body_type', list=body_type)


In [None]:
# copy by value, keeping only the rows that have a body type
# (the filter previously used the `sign` column, which dropped rows for an
# unrelated reason and kept rows whose body type was missing until the second dropna)
df_bodyt = df.dropna(subset=['body_type']).copy()

# map body type to its ordinal value; values without an entry in the mapping
# keep their original string
mapper_body_type_dict = naming['body_type']
df_bodyt['body_type_ordinal'] = (
    df_bodyt['body_type']
    .map(mapper_body_type_dict)
    .fillna(df_bodyt['body_type'])
)
df_bodyt = df_bodyt.dropna(subset=['body_type_ordinal']) # todo better solution?
df_bodyt

# todo Consultation whether mapping (clearly unhealthy => -1, not optimal/unknown => 0, else => +1) justifiable 

# Diet

In [None]:
# Extract Col: dump the distinct raw values of `diet` to diet.txt
diet = df['diet'].unique()
print_col_values(filename='diet', list=diet)

In [None]:
# copy by value, without the rows that have no diet
# NOTE(review): the shape noted here, (48890, 19), is the same figure as in the
# zodiac cell and may be a copy-paste leftover -- confirm against the profiler.
df_diet = df.dropna(subset=['diet']).copy()

# split the raw value once into space-separated tokens
diet_tokens = df_diet['diet'].str.split(' ')

# extract only the diet (last token)
df_diet['diet_extracted'] = diet_tokens.str[-1]

# extract the diet modifier: the first token when there is more than one, else ''
df_diet['diet_modifier_extracted'] = diet_tokens.apply(
    lambda tokens: tokens[0] if len(tokens) > 1 else ''
)
df_diet
# todo Consultation whether further mapping makes sense

# Drugs

In [None]:
# Extract Col: dump the distinct raw values of `drugs` to drugs.txt
drugs = df['drugs'].unique()
print_col_values(filename='drugs', list=drugs)

In [None]:
# copy by value, without the rows that have no drugs entry
df_drugs = df.dropna(subset=['drugs']).copy()  # expected shape (45866, 19), same as profiler

# map ordinal classifier: look each value up in the mapping and fall back to the
# raw string where the mapping has no entry ...
mapper_drugs_dict = naming['drugs']
drugs_mapped = df_drugs['drugs'].map(mapper_drugs_dict).fillna(df_drugs['drugs'])
# ... then turn everything that is not one of -2, -1, 1 into 0
df_drugs['drugs_ordinal'] = drugs_mapped.apply(
    lambda value: value if (value in [-2, -1, 1]) else 0
)

# todo Consultation whether mapping (clearly unhealthy => -2, not optimal => -1, optimal => +1, unknown => 0) justifiable

# Drinks

In [None]:
# Extract Col: dump the distinct raw values of `drinks` to drinks.txt
drinks = df['drinks'].unique()
print_col_values(filename='drinks', list=drinks)

In [None]:
# copy by value, without the rows that have no drinks entry
df_drinks = df.dropna(subset=['drinks']).copy()  # expected shape (56961, 19), same as profiler

# map ordinal classifier; values without an entry in the mapping keep their raw string
mapper_drinks_dict = naming['drinks']
drinks_mapped = df_drinks['drinks'].map(mapper_drinks_dict)
df_drinks['drinks_ordinal'] = drinks_mapped.fillna(df_drinks['drinks'])
# df_drinks['drinks_ordinal'] = df_drinks['drinks_ordinal'].apply(lambda y: y if (y in [-2, -1, 0, 1]) else 0) # todo: check if needed # map empty to 0
# todo Consultation whether mapping (clearly unhealthy => -2, "social drinker" => 0, optimal => +1) justifiable

# Education

In [None]:
# Extract Col: dump the distinct raw values of `education` to education.txt
education = df['education'].unique()
print_col_values(filename='education', list=education)

In [None]:
# `naming` is loaded once from naming.yaml in the mapping-file cell at the top of the
# notebook; the temporary debug reload that was flagged for deletion here is removed.

# copy by value, keeping only the rows that have an education entry
df_education = df.dropna(subset=['education']).copy()  # expected shape (53318, 19), same as profiler

# extract only education institution and status 
# todo find better solution to use the dedicated mapper in naming.yaml
def educationa_status_mapper(x):
    """Return the first status phrase contained in ``x``.

    The phrases are tried in this order: 'dropped out of', 'working on',
    'graduated from'. Returns None when ``x`` contains none of them.
    """
    status_phrases = ('dropped out of', 'working on', 'graduated from')
    return next((phrase for phrase in status_phrases if phrase in x), None)

def educationa_institution_mapper(x):
    """Return the first institution name contained in ``x``.

    The names are tried in the order listed below. Returns None when ``x``
    contains none of them.
    """
    institutions = (
        'college/university',
        'two-year college',
        'masters program',
        'ph.d program',
        'high school',
        'law school',
        'med school',
        'space camp',
    )
    for institution in institutions:
        if institution in x:
            return institution
    return None

# split the raw education string into its status part and its institution part
raw_education = df_education['education']
df_education['education_status_extracted'] = raw_education.apply(educationa_status_mapper)
df_education['education_instituation_extracted'] = raw_education.apply(educationa_institution_mapper)


# map ordinal classifier; statuses without an entry in the mapping keep their extracted value
mapper_education_status_hierarchy_dict = naming['education_status_hierarchy']
df_education['education_status_ordinal'] = (
    df_education['education_status_extracted']
    .map(mapper_education_status_hierarchy_dict)
    .fillna(df_education['education_status_extracted'])
)

# Ethnicity

In [None]:
# Extract Col: dump the distinct raw values of `ethnicity` to ethnicity.txt
ethnicity = df['ethnicity'].unique()
print_col_values(filename='ethnicity', list=ethnicity)

# Income

In [None]:
# Extract Col: dump the distinct raw values of `income` to income.txt
income = df['income'].unique()
print_col_values(filename='income', list=income)

In [None]:
# copy by value, keeping only rows with a usable income:
# not NaN and not -1 (the -1 values are treated as missing)
has_income = df['income'].notna() & (df['income'] != -1)
df_income = df[has_income].copy()  # expected shape (11504, 19), MAX REDUCTION!

# bar chart of the number of profiles per income value, ordered by income
df_income['income'].value_counts().sort_index().plot.bar()

# Age

In [None]:
# Extract Col: dump the distinct raw values of `age` to age.txt
age = df['age'].unique()
print_col_values(filename='age', list=age)

In [None]:
# copy by value, without the rows that have no age
# NOTE(review): the shape noted here, "(11504, 19) , MAX REDUCTION!", is identical
# to the note in the income cell and may be a copy-paste leftover -- confirm.
df_age = df.dropna(subset=['age']).copy()

#df_age['age'].value_counts().sort_index().plot(kind='bar', figsize=(12,4))
# box plot of the age distribution
df_age['age'].plot.box()

# Job

In [None]:
# Extract Col: dump the distinct raw values of `job` to job.txt
job = df['job'].unique()
print_col_values(filename='job', list=job)

# Offspring

In [None]:
# Extract Col: dump the distinct raw values of `offspring` to offspring.txt
offspring = df['offspring'].unique()
print_col_values(filename='offspring', list=offspring)

# Orientation

In [None]:
# Extract Col: dump the distinct raw values of `orientation` to orientation.txt
orientation = df['orientation'].unique()
print_col_values(filename='orientation', list=orientation)

# Pets

In [None]:
# Extract Col: dump the distinct raw values of `pets` to pets.txt
pets = df['pets'].unique()
print_col_values(filename='pets', list=pets)

# Religion

In [None]:
# Extract Col: dump the distinct raw values of `religion` to religion.txt
religion = df['religion'].unique()
print_col_values(filename='religion', list=religion)

# Smokes

In [None]:
# Extract Col: dump the distinct raw values of `smokes` to smokes.txt
smokes = df['smokes'].unique()
print_col_values(filename='smokes', list=smokes)

In [None]:
# copy by value, without the rows that have no smokes entry
df_smokes = df.dropna(subset=['smokes']).copy()  # expected shape (54434, 19), same as profiler

# ordinal-encode `smokes` with the mapping stored under naming['smokes']
# NOTE(review): presumably that entry has the {'col': ..., 'mapping': ...} layout
# category_encoders expects -- confirm against naming.yaml.
smokes_encoder = category_encoders.OrdinalEncoder(
    mapping=[naming['smokes']],
    cols=['smokes'],
    return_df=True,
)

smokes_encoded = smokes_encoder.fit_transform(df_smokes['smokes'])
df_smokes['smokes_ordinal'] = smokes_encoded
df_smokes[['smokes_ordinal', 'smokes']]

# Gender

In [None]:
# Extract Col: dump the distinct raw values of `sex` to sex.txt
sex = df['sex'].unique()
print_col_values(filename='sex', list=sex)

# copy by value, without the rows that have no sex entry
df_sex = df.dropna(subset=['sex']).copy()  # expected shape (59946, 19), same as profiler

# encode the values as integer labels and show them next to the raw values
sex_encoder = LabelEncoder()
sex_encoder.fit(df_sex['sex'])
df_sex['sex_categorical'] = sex_encoder.transform(df_sex['sex'])
df_sex[['sex_categorical', 'sex']]

# Status

In [None]:
# Extract Col: dump the distinct raw values of `status` to status.txt
status = df['status'].unique()
print_col_values(filename='status', list=status)

# copy by value, without the rows that have no status entry
# NOTE(review): the shape noted here, (59946, 19), is the same figure as in the
# gender cell -- confirm against the profiler.
df_status = df.dropna(subset=['status']).copy()

# encode the values as integer labels and show them next to the raw values
status_encoder = LabelEncoder()
status_encoder.fit(df_status['status'])
df_status['status_categorical'] = status_encoder.transform(df_status['status'])
df_status[['status_categorical', 'status']]

# Height