In [None]:
import pandas as pd
import numpy as np
import os # DEBUG
from glob import glob
from pandas_profiling import ProfileReport
import yaml
import re


In [None]:
# Directory the notebook runs from, and the directory one level above it.
PATH_ABS_SRC = os.getcwd()
PATH_REL = os.path.split(PATH_ABS_SRC)[0]

# Mapping File

In [None]:
# Load the naming/mapping tables from the YAML file in the working directory;
# later cells look up entries such as naming['zodiac_hiearchy'].
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 on every system.
with open("naming.yaml", encoding="utf-8") as stream:
    naming = yaml.safe_load(stream)

In [None]:
# Read the revised profiles data set into the raw frame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../data/profiles_revised.csv')

In [None]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Build an automated profiling report for the raw frame, render it inline and
# also write it to an HTML file in the working directory.
profile = ProfileReport(df, title='Pandas Profiling Report')
# NOTE: profile.to_widgets() did not show anything here, so the iframe view is used.
profile.to_notebook_iframe()
profile.to_file("pandas_profiling_data_report.html")

In [None]:
def print_col_values(list, filename):
    """Dump every element of ``list`` into ``<filename>.txt``, one element per line.

    The file is created (or overwritten) relative to the working directory unless
    ``filename`` already carries a directory part. Each element is rendered with
    ``str.format`` and followed by a newline.

    Note: the parameter name shadows the built-in ``list``; it is kept because
    callers pass it by keyword.
    """
    target = '{}.txt'.format(filename)
    with open(target, 'w') as out:
        out.writelines('{}\n'.format(item) for item in list)

# Zodiac Sign

In [None]:
# Extract Col
zodiacs = df.sign.unique()
print_col_values(list=zodiacs, filename='zodiacs')  # raw values, for manual inspection

ZODIAC_STRING_REPLACMENT = '&rsquo;' # HTML entity that stands for the apostrophe (')

# Clean: drop missing values and decode the HTML apostrophe entity.
# pd.notna is used instead of comparing str(z) to 'nan' so that None is dropped
# as well and a real string is never mistaken for a missing value.
zodiacs = [
    z.replace(ZODIAC_STRING_REPLACMENT, '\'')
    for z in zodiacs
    if pd.notna(z)
]

# Check
print_col_values(list=zodiacs, filename='zodiacs-cleaned')



In [None]:
# Work on an independent copy without the rows that have no sign, so the raw
# frame `df` stays untouched.
df_zodiac = df.dropna(subset=['sign']).copy()
# shape noted by the original author: (48890, 19), same as profiler

# Split every sign string once: the first token is the sign itself, the
# remaining tokens form the modifier.
sign_tokens = df_zodiac['sign'].str.split(' ')

# extract only sign
df_zodiac['sign-extracted'] = sign_tokens.str[0]

# extract sign modifier: re-join the remaining tokens (an empty remainder
# becomes '') and decode the HTML apostrophe entity as a literal, non-regex match
df_zodiac['sign-modifier-extracted'] = (
    sign_tokens.str[1:]
    .str.join(' ')
    .str.replace(ZODIAC_STRING_REPLACMENT, '\'', regex=False)
)

# map sign modifier + ordinal classifier; modifiers missing from the mapping
# keep their original text
mapper_naming_dict = naming['zodiac_hiearchy']
df_zodiac['sign-modifier-extracted-ordinal'] = (
    df_zodiac['sign-modifier-extracted']
    .map(mapper_naming_dict)
    .fillna(df_zodiac['sign-modifier-extracted'])
)
df_zodiac[['sign', 'sign-extracted', 'sign-modifier-extracted', 'sign-modifier-extracted-ordinal']]

# Languages

In [None]:
# Work on an independent copy without the rows that have no language
# information, so the raw frame `df` stays untouched.
df_languages = df.dropna(subset=['speaks']).copy()
# shape noted by the original author: (59896, 19), same as profiler

# structure of `speaks`: "language (level), language2 ..." or "language, language2, ..."
# Due to the n:m relationship between persons and languages we will choose a one-hot (indicator) encoding for the data.
# To not overload the main dataframe we will create a separate df with the language information and use the same ID as in the main dataframe. The structure of the new df will look as follows:

########################################################
#  ID #  english  #  italian  #  spanish  # ... other # number of languages spoken
#  1       1           0           0         0    1           2
#  2       1           1           1         1    1           5
#  3       0           1           1         0    1           3
#  4       1           0           0         0    0           1
#  5       1           0           1         0    1           3
#
# extract language, without level => split by comma => multiple values, split by space, use first
#
# One list of language names per person, in row order.
allLanguages = [
    [entry.strip().split(' ')[0] for entry in speaks.split(',')]
    for speaks in df_languages['speaks']
]
# Flat list of every language mention across all persons (used by later cells).
allLng = [language for languages in allLanguages for language in languages]

# Wrap in a Series carrying the frame's own index so each row receives its list.
df_languages['spoken_languages'] = pd.Series(allLanguages, index=df_languages.index)
df_languages['spoken_languages']

In [None]:
# One row per language mention, followed by the sorted list of distinct languages.
t_df = pd.DataFrame(data=allLng)
print(sorted({*allLng}))

In [None]:
# Bar chart of how often each language is mentioned. The single column is
# selected first so the counts carry a plain Index; DataFrame.value_counts()
# returns a MultiIndex, which labels every bar as a tuple such as "(english,)".
t_df[0].value_counts().plot(
    kind='bar',
    figsize=(16,6),
    title='Language mentions across all profiles',
    xlabel='language',
    ylabel='number of mentions',
)