# Imports

In [517]:
import pandas as pd
import numpy as np
import os # DEBUG
from glob import glob
from pandas_profiling import ProfileReport
import yaml
import re
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import category_encoders
%matplotlib inline

# Config

In [518]:
# Directory the notebook is executed from
PATH_ABS_SRC = os.getcwd()
# Parent directory of the working directory
PATH_REL = os.path.dirname(PATH_ABS_SRC)

In [519]:
# Parse naming.yaml into `naming`. The encoding is passed explicitly so that
# reading the file does not depend on the platform's default text encoding.
with open("naming.yaml", encoding="utf-8") as stream:
    naming = yaml.safe_load(stream)

# Load df

In [520]:
# Read the profile data; the path is resolved relative to the current working directory
df = pd.read_csv('../data/profiles_revised.csv')

In [521]:
# CONSTS
# '&rsquo;' is the HTML entity that appears in the raw text where an
# apostrophe (') is meant; it is swapped back during cleaning.
ZODIAC_STRING_REPLACMENT = '&rsquo;'
OFFSPRING_STRING_REPLACMENT = '&rsquo;'

# Column labels of the loaded frame (shown as the cell output)
cols = list(df.columns)
cols

['age',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'height',
 'income',
 'job',
 'offspring',
 'orientation',
 'pets',
 'religion',
 'sex',
 'sign',
 'smokes',
 'speaks',
 'status']

# Cleaning

In [522]:
# Using standard scaler
def std_scaler(df, col_names):
    """Return a copy of ``df`` with the given columns scaled by ``StandardScaler``.

    The frame passed in is not modified: the scaler is fitted on the selected
    columns of a copy and its output is written back into that copy.

    Args:
        df: DataFrame containing the columns to scale.
        col_names: List of column labels to scale.

    Returns:
        A new DataFrame in which ``col_names`` hold the scaled values.
    """
    result = df.copy()
    raw_values = result[col_names].values
    # fit_transform fits the scaler and applies it to the same values
    result[col_names] = StandardScaler().fit_transform(raw_values)
    return result


# Using min/max scaler
def minmax_scaler(df, col_names):
    """Return a copy of ``df`` with the given columns scaled by ``MinMaxScaler``.

    The frame passed in is not modified: the scaler is fitted on the selected
    columns of a copy and its output is written back into that copy.

    Args:
        df: DataFrame containing the columns to scale.
        col_names: List of column labels to scale.

    Returns:
        A new DataFrame in which ``col_names`` hold the scaled values.
    """
    result = df.copy()
    raw_values = result[col_names].values
    # fit_transform fits the scaler and applies it to the same values
    result[col_names] = MinMaxScaler().fit_transform(raw_values)
    return result

## Age

In [523]:
# Rows without an age cannot be scaled, so they are discarded first
df.dropna(subset=['age'], inplace=True)

# Scale the column with the StandardScaler helper
df = std_scaler(df, col_names=['age'])


# Body Type

In [524]:
# Discard profiles that do not state a body type
df.dropna(subset=['body_type'], inplace=True)

# Replace the body type labels by integer codes; the fitted encoder stays
# available as `body_type_encoder`
body_type_encoder = LabelEncoder()
encoded_col_body_type = body_type_encoder.fit_transform(df['body_type'])
df['body_type'] = encoded_col_body_type

# Todo: Discuss whether an ordinal mapping (clearly unhealthy => -1, not optimal/unknown => 0, else => +1) is justifiable

# Diet

In [525]:
# Discard profiles that do not state a diet
df.dropna(subset=['diet'], inplace=True)

# Each diet entry is split on spaces: the last word is taken as the diet,
# the first of any preceding words as its modifier ('' when there is none)
diet_words = df['diet'].str.split(' ')
df['diet_extracted'] = diet_words.str[-1]
df['diet_modifier_extracted'] = diet_words.str[:-1].apply(
    lambda words: words[0] if words else ''
)

# Todo: Discuss whether a further mapping makes sense

# Encode diet
diet_encoder = LabelEncoder()
encoded_col_diet = diet_encoder.fit_transform(df['diet_extracted'])
df['diet'] = encoded_col_diet

# Encode diet modifier
diet_modifier_encoder = LabelEncoder()
encoded_col_diet_modifier = diet_modifier_encoder.fit_transform(df['diet_modifier_extracted'])
df['diet_modifier'] = encoded_col_diet_modifier


# Drop the helper columns; their codes now live in 'diet' and 'diet_modifier'
df = df.drop(['diet_extracted', 'diet_modifier_extracted'], axis=1)

# Drinks

In [526]:
# Discard profiles that do not state their drinking habit
df.dropna(subset=['drinks'], inplace=True)

# Replace the drinks labels by integer codes
drinks_encoder = LabelEncoder()
encoded_col_drinks = drinks_encoder.fit_transform(df['drinks'])
df['drinks'] = encoded_col_drinks

# Drugs

In [527]:
# Discard profiles that do not state their drug use
df.dropna(subset=['drugs'], inplace=True)

# Replace the drugs labels by integer codes. The encoder gets its own name:
# binding it to `drinks_encoder` would overwrite the encoder fitted on the
# 'drinks' column, so those codes could no longer be mapped back.
drugs_encoder = LabelEncoder()
encoded_col_drugs = drugs_encoder.fit_transform(df['drugs'])
df['drugs'] = encoded_col_drugs

# Education

In [528]:
# Remove rows without an education entry; the mappers below run substring
# checks on the value, which would fail on NaN
df.dropna(inplace=True, subset=['education'])


# Extract only the education institution
# Todo: find a better solution that uses the dedicated mapper in naming.yaml
def education_institution_mapper(x):
    """Return the education institution mentioned in ``x``.

    Args:
        x: Education string, e.g. 'graduated from college/university'.

    Returns:
        The first known institution that occurs as a substring of ``x``,
        or ``None`` when none of them occurs.
    """
    # Candidates are tested in this order; the first hit wins
    known_institutions = (
        'college/university',
        'two-year college',
        'masters program',
        'ph.d program',
        'high school',
        'law school',
        'med school',
        'space camp',
    )
    return next(
        (institution for institution in known_institutions if institution in x),
        None,
    )

# Extract only education status
def education_status_mapper(x):
    """Return the education status phrase mentioned in ``x``.

    Args:
        x: Education string, e.g. 'working on masters program'.

    Returns:
        The first known status phrase that occurs as a substring of ``x``,
        or ``None`` when none of them occurs.
    """
    # Candidates are tested in this order; the first hit wins
    known_statuses = ('dropped out of', 'working on', 'graduated from')
    return next(
        (status for status in known_statuses if status in x),
        None,
    )


# Split every education entry into its status part and its institution part
df['education_status_extracted'] = df['education'].apply(education_status_mapper)
df['education_institution_extracted'] = df['education'].apply(education_institution_mapper)


# Encode education status
education_status_encoder = LabelEncoder()
encoded_col_education_status = education_status_encoder.fit_transform(df['education_status_extracted'])
df['education_status_extracted'] = encoded_col_education_status

# Encode education institution
education_institution_encoder = LabelEncoder()
encoded_col_education_institution = education_institution_encoder.fit_transform(df['education_institution_extracted'])
df['education_institution_extracted'] = encoded_col_education_institution

# Drop the raw column now that both parts are encoded
df = df.drop(columns='education')
df

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,income,job,offspring,...,pets,religion,sex,sign,smokes,speaks,status,diet_modifier,education_status_extracted,education_institution_extracted
0,-1.089092,0,0,4,0,"asian, white",75.0,-1,transportation,"doesn&rsquo;t have kids, but might want them",...,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single,2,2,0
1,0.386157,2,3,2,2,white,70.0,80000,hospitality / travel,"doesn&rsquo;t have kids, but might want them",...,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single,1,2,6
6,-0.035343,4,0,4,0,"white, other",65.0,-1,,,...,likes dogs and likes cats,,f,virgo,,english,single,2,1,0
7,-0.246093,2,0,4,0,white,65.0,-1,artistic / musical / writer,"doesn&rsquo;t have kids, but wants them",...,likes dogs and likes cats,christianity,f,sagittarius,no,"english, spanish (okay)",single,1,1,0
9,0.491531,1,0,1,0,white,65.0,-1,student,,...,likes dogs and likes cats,atheism and laughing about it,m,cancer but it doesn&rsquo;t matter,no,english (fluently),single,1,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59931,-0.772967,4,5,3,0,indian,72.0,-1,science / tech / engineering,,...,likes dogs,agnosticism but not too serious about it,m,,no,"english (fluently), hindi (poorly), french (po...",single,2,1,0
59936,-0.772967,2,0,4,0,asian,61.0,-1,other,doesn&rsquo;t have kids,...,,atheism,f,virgo but it doesn&rsquo;t matter,no,"english (fluently), chinese (fluently)",single,1,1,0
59942,-0.878342,4,0,2,2,"white, other",72.0,-1,entertainment / media,doesn&rsquo;t have kids,...,likes dogs and likes cats,agnosticism,m,leo but it doesn&rsquo;t matter,no,english (fluently),single,1,2,0
59943,0.913031,2,0,1,0,asian,71.0,100000,construction / craftsmanship,doesn&rsquo;t have kids,...,,christianity but not too serious about it,m,sagittarius but it doesn&rsquo;t matter,no,english (fluently),single,1,1,3


# Ethnicity

In [529]:
# Extract all ethnicity categories.
# An 'ethnicity' entry may list several base ethnicities separated by ', ',
# so every distinct non-missing entry is split before de-duplicating.
# The result is sorted because a plain set of strings has no stable order
# between interpreter runs; without sorting, the generated columns could
# appear in a different order on every run.
ethnicities = sorted({
    base
    for entry in df['ethnicity'].dropna().unique()
    for base in entry.split(', ')
})

# Generate new header for encoded categories
ethnicities_encoded_header = ['ethnicities_{}'.format(e.replace(' ', '_')) for e in ethnicities]


# Remove nan's
df.dropna(inplace=True, subset=['ethnicity'])

# Add col header (placeholder values, filled by the encoding step)
for eth_col in ethnicities_encoded_header:
    df[eth_col] = np.nan

# Filter
def filter_ethnicities(col, row_ethnicities):
    """Tell whether ``col`` is one of the ethnicities listed in ``row_ethnicities``.

    Args:
        col: Base ethnicity to look for.
        row_ethnicities: Ethnicities of one row, separated by ', '.

    Returns:
        1 if ``col`` equals one of the listed ethnicities, otherwise 0.
    """
    # Whole entries are compared, so 'islander' does not match 'pacific islander'
    return 1 if col in row_ethnicities.split(', ') else 0

# One-hot encode every base ethnicity into its own 0/1 column.
# Only the 'ethnicity' column is read, so the function is applied to that
# Series directly instead of building a row object for every row with
# DataFrame.apply(axis=1).
for (ethnicities_encoded_header_col, e) in zip(ethnicities_encoded_header, ethnicities):
    df[ethnicities_encoded_header_col] = df['ethnicity'].apply(
        # e is bound as a default so the lambda does not depend on the loop variable later
        lambda row_value, e=e: filter_ethnicities(e, row_value)
    )

# Drop redundant cols
df = df.drop('ethnicity', axis=1)

# Height

In [None]:
# Rows without a height cannot be scaled, so they are discarded first
df.dropna(subset=['height'], inplace=True)

# Scale the column with the StandardScaler helper
df = std_scaler(df, col_names=['height'])

# Income

In [None]:
# Income entries of -1 are treated as missing: turn them into NaN
df['income'] = df['income'].replace(-1, np.nan)
# Todo: Maybe insert the average income instead of nan (only 5k values after that)

# Discard the rows without an income
df.dropna(subset=['income'], inplace=True)

# Scale the column with the StandardScaler helper
df = std_scaler(df, col_names=['income'])
df

# Job

In [None]:
# Discard profiles that do not state a job
df.dropna(subset=['job'], inplace=True)

# Replace the job labels by integer codes
job_encoder = LabelEncoder()
encoded_col_job = job_encoder.fit_transform(df['job'])
df['job'] = encoded_col_job
df

# Offspring

In [None]:
# Phrases used to recognise the parts of an 'offspring' entry
# todo: automate

# STATUS
OFFSPRING_STATUS_ORIG = ["doesn't have kids", "has a kid", "has kids"]

# FUTURE
OFFSPRING_FUTURE_ORIG = [
    "and doesn't want any",
    "doesn't want kids",
    "but doesn't want more",
    "but might want them",
    "might want kids",
    "and might want more",
    "wants kids",
    "but wants them",
    "and wants more",
]

# Shortened phrases that the FUTURE phrases above contain
OFFSPRING_FUTURE = ["doesn't want", "might want", "wants"]

In [None]:
# Discard profiles without an offspring entry
df.dropna(subset=['offspring'], inplace=True)

# The raw text carries the HTML entity where an apostrophe is meant; put the
# apostrophe back so the entries can be compared with the offspring phrases
df['offspring'] = df['offspring'].str.replace(OFFSPRING_STRING_REPLACMENT, "'")

offspring_encoded_header = ['offspring_status', 'offspring_future']

# Add col header (placeholder values, filled further down)
for off_col in offspring_encoded_header:
    df[off_col] = np.nan

# Filter
def filter_offspring_status(row_offspring):
    """Return the offspring status phrase found in ``row_offspring``.

    Args:
        row_offspring: Offspring text of one row.

    Returns:
        The first entry of ``OFFSPRING_STATUS_ORIG`` that occurs as a
        substring of ``row_offspring``, or ``np.nan`` when none occurs.
    """
    return next(
        (status for status in OFFSPRING_STATUS_ORIG if status in row_offspring),
        np.nan,
    )

# Filter
def filter_offspring_future(row_offspring):
    """Return the future-plans phrase found in ``row_offspring``.

    Args:
        row_offspring: Offspring text of one row.

    Returns:
        The first entry of ``OFFSPRING_FUTURE`` that occurs as a substring
        of ``row_offspring``, or ``np.nan`` when none occurs.
    """
    return next(
        (future for future in OFFSPRING_FUTURE if future in row_offspring),
        np.nan,
    )

# Extract the status phrase and the future-plans phrase of every entry.
# Only the 'offspring' column is read, so the filters are applied to that
# Series directly instead of building a row object for every row with
# DataFrame.apply(axis=1).
df['offspring_status'] = df['offspring'].apply(filter_offspring_status)
df['offspring_future'] = df['offspring'].apply(filter_offspring_future)

# Keep only rows in which both parts were recognised
df.dropna(inplace=True, subset=['offspring_status', 'offspring_future'])


# Encode offspring_status
offspring_status_encoder = LabelEncoder()
encoded_col_offspring_status = offspring_status_encoder.fit_transform(df['offspring_status'])
df['offspring_status'] = encoded_col_offspring_status

# Encode offspring_future
offspring_future_encoder = LabelEncoder()
encoded_col_offspring_future = offspring_future_encoder.fit_transform(df['offspring_future'])
df['offspring_future'] = encoded_col_offspring_future


# Drop redundant cols
df = df.drop('offspring', axis=1)
df

# Orientation

In [None]:
# Discard profiles that do not state an orientation
df.dropna(subset=['orientation'], inplace=True)

# Replace the orientation labels by integer codes
orientation_encoder = LabelEncoder()
encoded_col_orientation = orientation_encoder.fit_transform(df['orientation'])
df['orientation'] = encoded_col_orientation
df

# Pets

# Religion

# Sex

In [None]:
# Discard profiles that do not state a sex
df.dropna(subset=['sex'], inplace=True)

# Replace the sex labels by integer codes
sex_encoder = LabelEncoder()
encoded_col_sex = sex_encoder.fit_transform(df['sex'])
df['sex'] = encoded_col_sex
df

# Sign

In [None]:
# Discard profiles that do not state a sign
df.dropna(subset=['sign'], inplace=True)


# Each sign entry is split on spaces once and reused below
sign_words = df['sign'].str.split(' ')

# The first word is taken as the sign itself
df['sign_extracted'] = sign_words.str[0]

# The remaining words form the modifier ('' when nothing follows); the HTML
# entity in it is turned back into an apostrophe
sign_modifier = sign_words.str[1:].apply(' '.join)
df['sign_modifier_extracted'] = sign_modifier.str.replace(ZODIAC_STRING_REPLACMENT, "'")

# Encode sign
sign_encoder = LabelEncoder()
encoded_col_sign = sign_encoder.fit_transform(df['sign_extracted'])
df['sign_extracted'] = encoded_col_sign

# Encode sign modifier
sign_modifier_encoder = LabelEncoder()
encoded_col_sign_modifier = sign_modifier_encoder.fit_transform(df['sign_modifier_extracted'])
df['sign_modifier_extracted'] = encoded_col_sign_modifier

# Drop the raw column now that both parts are encoded
df = df.drop(columns='sign')
df

# Smokes

In [None]:
# Discard profiles that do not state their smoking habit
df.dropna(subset=['smokes'], inplace=True)

# Replace the smokes labels by integer codes
smokes_encoder = LabelEncoder()
encoded_col_smokes = smokes_encoder.fit_transform(df['smokes'])
df['smokes'] = encoded_col_smokes
df

# Speaks

# Status

In [None]:
# Discard profiles that do not state a relationship status
df.dropna(subset=['status'], inplace=True)

# Replace the status labels by integer codes
status_encoder = LabelEncoder()
encoded_col_status = status_encoder.fit_transform(df['status'])
df['status'] = encoded_col_status
df