# Imports

In [77]:
import pandas as pd
import numpy as np
import os # DEBUG
from glob import glob
from pandas_profiling import ProfileReport
import yaml
import re
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from pandas_profiling import ProfileReport
import json
import category_encoders
%matplotlib inline

# Config

In [78]:
PATH_ABS_SRC = os.getcwd()
PATH_REL = os.path.dirname(os.getcwd())

# Load df

In [127]:
df = pd.read_csv('../data/profiles_revised.csv')

In [80]:
# CONSTS
ZODIAC_STRING_REPLACMENT = '&rsquo;' # corresponds to " ' "
OFFSPRING_STRING_REPLACMENT = '&rsquo;' # corresponds to " ' "
cols = df.columns.tolist()
cols

['age',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'height',
 'income',
 'job',
 'offspring',
 'orientation',
 'pets',
 'religion',
 'sex',
 'sign',
 'smokes',
 'speaks',
 'status']

# Preprocessing

In [81]:
if not os.path.exists('preprocessing'):
    os.makedirs('preprocessing')

In [82]:
def print_col_values(list, filename):
    file_path = os.path.relpath("preprocessing")
    with open(file_path+'/{}.txt'.format(filename), 'w') as fp:
        for element in list:
            fp.write("{}\n".format(element))

In [83]:
# Extract unique col values
for col in cols:
    print_col_values(list=df[col].unique(), filename=col)


# Analysis

In [84]:
# profile = ProfileReport(df, title='Pandas Profilign Report')
# profile.to_notebook_iframe()
# profile.to_file("pandas_profiling_data_report.html")

# Cleaning

In [85]:
# Using standard scaler
def std_scaler(df, col_names):
    scaled_features = df.copy()
 
    features = scaled_features[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
 
    scaled_features[col_names] = features

    return scaled_features


# Using min/max scaler
def minmax_scaler(df, col_names):
    scaled_features = df.copy()
 
    features = scaled_features[col_names]
    scaler = MinMaxScaler().fit(features.values)
    features = scaler.transform(features.values)
 
    scaled_features[col_names] = features

    return scaled_features

## Age

In [86]:
# Remove nan's
df.dropna(inplace=True, subset=['age'])

# Scale
df = std_scaler(df, ['age'])


# Body Type

In [87]:
# Remove nan's
df.dropna(inplace=True, subset=['body_type'])

# Encode body type
body_type_encoder = LabelEncoder()
body_type_encoder.fit(df['body_type'])
encoded_col_body_type = body_type_encoder.transform(df['body_type'])
df['body_type'] = encoded_col_body_type

# Todo: Consultation whether mapping (clearly unhealthy => -1, not optimal/unknown => 0, else => +1) justifiable 

# Diet

In [88]:
# Remove nan's
df.dropna(inplace=True, subset=['diet'])

# Extract only diet
df['diet_extracted'] = df['diet'].str.split(' ').str[-1]

# Extract diet modifier
df['diet_modifier_extracted'] = df['diet'].str.split(' ').str[:-1]
df['diet_modifier_extracted'] = df['diet_modifier_extracted'].apply(lambda y: '' if len(y)==0 else y[0]) # replace empty lists with '' and extract term from list

# Todo: Consultation whether further mapping makes sense

# Encode diet
diet_encoder = LabelEncoder()
diet_encoder.fit(df['diet_extracted'])
encoded_col_diet = diet_encoder.transform(df['diet_extracted'])
df['diet'] = encoded_col_diet

# Encode diet modifier
diet_modifier_encoder = LabelEncoder()
diet_modifier_encoder.fit(df['diet_modifier_extracted'])
encoded_col_diet_modifier = diet_modifier_encoder.transform(df['diet_modifier_extracted'])
df['diet_modifier'] = encoded_col_diet_modifier


# Drop reduandant cols
df = df.drop('diet_extracted', axis=1)
df = df.drop('diet_modifier_extracted', axis=1)

# Drinks

In [89]:
# Remove nan's
df.dropna(inplace=True, subset=['drinks'])

# Encode drinks modifier
drinks_encoder = LabelEncoder()
drinks_encoder.fit(df['drinks'])
encoded_col_drinks = drinks_encoder.transform(df['drinks'])
df['drinks'] = encoded_col_drinks

# Drugs

In [90]:
# Remove nan's
df.dropna(inplace=True, subset=['drugs'])

# Encode drugs modifier
drinks_encoder = LabelEncoder()
drinks_encoder.fit(df['drugs'])
encoded_col_drugs = drinks_encoder.transform(df['drugs'])
df['drugs'] = encoded_col_drugs

# Education

In [129]:
# Remove nan's
df.dropna(inplace=True, subset=['education'])


# Extract only education institution
# todo find better solution to use the dedicated mapper in naming.yaml
def education_institution_mapper(x):
    if 'college/university' in x:
        return 'college/university'
    if 'two-year college' in x:
        return 'two-year college'
    if 'masters program' in x:
        return 'masters program'
    if 'ph.d program' in x:
        return 'ph.d program'
    if 'high school' in x:
        return 'high school'
    if 'law school' in x:
        return 'law school'
    if 'med school' in x:
        return 'med school'
    if 'space camp' in x:
        return 'space camp'

# Extract only education status
def education_status_mapper(x):
    if 'dropped out of' in x:
        return 'dropped out of'
    if 'working on' in x:
        return 'working on'
    if 'graduated from' in x:
        return 'graduated from'


df['education_status_extracted'] = df['education'].apply(lambda x: education_status_mapper(x))
df['education_institution_extracted'] = df['education'].apply(lambda x: education_institution_mapper(x))


# Encode education_status
education_status_encoder = LabelEncoder()
education_status_encoder.fit(df['education_status_extracted'])
encoded_col_education_status = education_status_encoder.transform(df['education_status_extracted'])
df['education_status_extracted'] = encoded_col_education_status

# Encode diet modifier
education_institution_encoder = LabelEncoder()
education_institution_encoder.fit(df['education_institution_extracted'])
encoded_col_education_institution = education_institution_encoder.transform(df['education_institution_extracted'])
df['education_institution_extracted'] = encoded_col_education_institution

# Drop reduandant cols
df = df.drop('education', axis=1)
df

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,job,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,education_status_extracted,education_institution_extracted
0,22,a little extra,strictly anything,socially,never,"asian, white",75.0,transportation,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single,2,0
1,36,average,mostly other,often,sometimes,white,70.0,hospitality / travel,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single,2,6
7,30,average,mostly anything,socially,never,white,65.0,artistic / musical / writer,"doesn&rsquo;t have kids, but wants them",straight,likes dogs and likes cats,christianity,f,sagittarius,no,"english, spanish (okay)",single,1,0
14,29,thin,mostly anything,socially,never,"hispanic / latin, white",62.0,other,"doesn&rsquo;t have kids, but wants them",straight,likes dogs and has cats,catholicism,f,taurus,no,english,single,2,0
19,34,athletic,mostly anything,socially,never,white,72.0,science / tech / engineering,doesn&rsquo;t have kids,straight,likes dogs and likes cats,catholicism but not too serious about it,m,pisces and it&rsquo;s fun to think about,no,english (fluently),single,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59904,32,a little extra,mostly other,not at all,never,white,66.0,clerical / administrative,has kids,straight,likes cats,other and very serious about it,f,cancer,no,english (fluently),single,1,7
59907,25,skinny,anything,socially,sometimes,white,72.0,artistic / musical / writer,doesn&rsquo;t have kids,straight,likes dogs,atheism and somewhat serious about it,m,taurus and it&rsquo;s fun to think about,sometimes,"english (fluently), spanish (okay)",single,1,0
59913,29,full figured,mostly anything,socially,never,black,64.0,executive / management,doesn&rsquo;t have kids,straight,likes dogs and likes cats,agnosticism and laughing about it,f,libra but it doesn&rsquo;t matter,no,"english (fluently), spanish (poorly)",single,1,0
59942,24,fit,mostly anything,often,sometimes,"white, other",72.0,entertainment / media,doesn&rsquo;t have kids,straight,likes dogs and likes cats,agnosticism,m,leo but it doesn&rsquo;t matter,no,english (fluently),single,2,0


# Ethnicity

In [92]:
# Extract all ethnicities categories
# Get all distinct values for the ethnicity  col
ethnicities = df.ethnicity.unique()

# Clean
ethnicities = [e for e in ethnicities if str(e) != 'nan'] # remove nan values

# Extract all ethnicities combinations 
ethnicities = ', '.join(ethnicities)
ethnicities = ethnicities.split(', ') 
ethnicities = [*set(ethnicities)] # create list of "base" ethnicities

# Generate new header for encoded categories
ethnicities_encoded_header = ['ethnicities_{}'.format(e.replace(' ', '_')) for e in ethnicities]


# Remove nan's
df.dropna(inplace=True, subset=['ethnicity'])

# Add col header
for eth_col in ethnicities_encoded_header:
    df[eth_col] = np.nan

# Filter
def filter_ethnicities(col, row_ethnicities):
    # extract all ethnicities from the col 'ethnicity'
    row_ethnicities = row_ethnicities.split(', ')
    
    # compare all extracted to current row in df
    for re in row_ethnicities:
        # match
        if re == col:
            return 1
    # no match
    return 0

# Hot encoding for all ethnicities cols
for (ethnicities_encoded_header_col, e) in zip(ethnicities_encoded_header, ethnicities):
    df[ethnicities_encoded_header_col] = df.apply(lambda x: filter_ethnicities(e, x['ethnicity']), axis=1)

# Drop reduandant cols
df = df.drop('ethnicity', axis=1)

# Height

In [93]:
# Remove nan's
df.dropna(inplace=True, subset=['height'])

# Scale
df = std_scaler(df, ['height'])

# Income

SKIP INCOME

In [94]:
# Replace -1 entries
#df['income'] = df['income'].apply(lambda y: np.nan if y==-1 else y) # replace -1 with nan
# Todo: Maybe insert non nan but average income (only 5k values after that)

# Remove nan's
#df.dropna(inplace=True, subset=['income'])

# Scale
#df = std_scaler(df, ['income'])
#df

df = df.drop('income', axis=1)

# Job

In [95]:
# Remove nan's
df.dropna(inplace=True, subset=['job'])

# Encode drugs modifier
job_encoder = LabelEncoder()
job_encoder.fit(df['job'])
encoded_col_job = job_encoder.transform(df['job'])
df['job'] = encoded_col_job
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,offspring,orientation,pets,...,education_institution_extracted,ethnicities_native_american,ethnicities_indian,ethnicities_asian,ethnicities_white,ethnicities_hispanic_/_latin,ethnicities_black,ethnicities_other,ethnicities_pacific_islander,ethnicities_middle_eastern
0,-1.089092,0,0,4,0,1.690080,19,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,...,0,0,0,1,1,0,0,0,0,0
1,0.386157,2,3,2,2,0.423755,8,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,...,6,0,0,0,1,0,0,0,0,0
7,-0.246093,2,0,4,0,-0.842570,0,"doesn&rsquo;t have kids, but wants them",straight,likes dogs and likes cats,...,0,0,0,0,1,0,0,0,0,0
9,0.491531,1,0,1,0,-0.842570,18,,straight,likes dogs and likes cats,...,7,0,0,0,1,0,0,0,0,0
11,-0.562217,2,0,4,0,0.930285,1,,straight,likes cats,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59931,-0.772967,4,5,3,0,0.930285,17,,straight,likes dogs,...,0,0,1,0,0,0,0,0,0,0
59936,-0.772967,2,0,4,0,-1.855631,12,doesn&rsquo;t have kids,straight,,...,0,0,0,1,0,0,0,0,0,0
59942,-0.878342,4,0,2,2,0.930285,6,doesn&rsquo;t have kids,straight,likes dogs and likes cats,...,0,0,0,0,1,0,0,1,0,0
59943,0.913031,2,0,1,0,0.677020,4,doesn&rsquo;t have kids,straight,,...,3,0,0,1,0,0,0,0,0,0


# Offspring

In [96]:
# Extract all offspring categories
# todo: automate

OFFSPRING_STATUS_ORIG = [
    'doesn\'t have kids', 'has a kid', 'has kids'] # STATUS


OFFSPRING_FUTURE_ORIG = [
    'and doesn\'t want any', 'doesn\'t want kids', 'but doesn\'t want more',
    'but might want them', 'might want kids', 'and might want more',
    'wants kids', 'but wants them', 'and wants more'] # FUTURE

OFFSPRING_FUTURE = [
    'doesn\'t want',
    'might want',
    'wants'
]

In [138]:
# Remove nan's
df.dropna(inplace=True, subset=['offspring'])

df['offspring'] = df['offspring'].str.replace(OFFSPRING_STRING_REPLACMENT,'\'')  # replace 

offspring_encoded_header = ['offspring_status', 'offspring_future']

# Add col header
for off_col in offspring_encoded_header:
    df[off_col] = np.nan

# Filer
def filter_offspring_status(row_offspring):    
    # compare all extracted to current row in df
    for status in OFFSPRING_STATUS_ORIG:
        if status in row_offspring:
            # match
            return status
    # no match
    return np.nan

# Filter
def filter_offspring_future(row_offspring):    
    # compare all extracted to current row in df
    for future in OFFSPRING_FUTURE:
        if future in row_offspring:
            # match
            return future
    # no match
    return np.nan

# Hot encoding for both offspring cols
df['offspring_status'] = df.apply(lambda x: filter_offspring_status(x['offspring']), axis=1)
df['offspring_future'] = df.apply(lambda x: filter_offspring_future(x['offspring']), axis=1)

df.dropna(inplace=True, subset=['offspring_status'])
df.dropna(inplace=True, subset=['offspring_future'])


# Encode offspring_status
offspring_status_encoder = LabelEncoder()
offspring_status_encoder.fit(df['offspring_status'])
encoded_col_offspring_status = offspring_status_encoder.transform(df['offspring_status'])
df['offspring_status'] = encoded_col_offspring_status

# Encode offspring_future
offspring_future_encoder = LabelEncoder()
offspring_future_encoder.fit(df['offspring_future'])
encoded_col_offspring_future = offspring_future_encoder.transform(df['offspring_future'])
df['offspring_future'] = encoded_col_offspring_future


# Drop reduandant cols
df = df.drop('offspring', axis=1)
df

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,job,orientation,sex,...,education_status_extracted,education_institution_extracted,religion_type,religion_modifier,sign_extracted,sign_modifier_extracted,pets_cats,pets_dogs,offspring_status,offspring_future
0,22,a little extra,strictly anything,socially,never,"asian, white",75.0,transportation,straight,m,...,2,0,agnosticism,very serious about it,gemini,,likes cats,likes dogs,doesn't have kids,might want
1,36,average,mostly other,often,sometimes,white,70.0,hospitality / travel,straight,m,...,2,6,agnosticism,not too serious about it,cancer,,likes cats,likes dogs,doesn't have kids,might want
7,30,average,mostly anything,socially,never,white,65.0,artistic / musical / writer,straight,f,...,1,0,christianity,,sagittarius,,likes cats,likes dogs,doesn't have kids,wants
14,29,thin,mostly anything,socially,never,"hispanic / latin, white",62.0,other,straight,f,...,2,0,catholicism,,taurus,,has cats,likes dogs,doesn't have kids,wants
19,34,athletic,mostly anything,socially,never,white,72.0,science / tech / engineering,straight,m,...,1,3,catholicism,not too serious about it,pisces,and it's fun to think about,likes cats,likes dogs,doesn't have kids,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59896,25,curvy,mostly vegetarian,often,sometimes,white,69.0,education / academia,straight,f,...,1,0,agnosticism,,sagittarius,but it doesn't matter,likes cats,likes dogs,doesn't have kids,
59902,25,curvy,mostly other,rarely,never,"native american, hispanic / latin, white",67.0,student,straight,f,...,1,6,other,not too serious about it,scorpio,and it's fun to think about,has cats,has dogs,doesn't have kids,might want
59913,29,full figured,mostly anything,socially,never,black,64.0,executive / management,straight,f,...,1,0,agnosticism,laughing about it,libra,but it doesn't matter,likes cats,likes dogs,doesn't have kids,
59942,24,fit,mostly anything,often,sometimes,"white, other",72.0,entertainment / media,straight,m,...,2,0,agnosticism,,leo,but it doesn't matter,likes cats,likes dogs,doesn't have kids,


# Orientation

In [98]:
# Remove nan's
df.dropna(inplace=True, subset=['orientation'])

# Encode orientation
orientation_encoder = LabelEncoder()
orientation_encoder.fit(df['orientation'])
encoded_col_orientation = orientation_encoder.transform(df['orientation'])
df['orientation'] = encoded_col_orientation
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,orientation,pets,religion,...,ethnicities_indian,ethnicities_asian,ethnicities_white,ethnicities_hispanic_/_latin,ethnicities_black,ethnicities_other,ethnicities_pacific_islander,ethnicities_middle_eastern,offspring_status,offspring_future
0,-1.089092,0,0,4,0,1.690080,19,2,likes dogs and likes cats,agnosticism and very serious about it,...,0,1,1,0,0,0,0,0,0,1
1,0.386157,2,3,2,2,0.423755,8,2,likes dogs and likes cats,agnosticism but not too serious about it,...,0,0,1,0,0,0,0,0,0,1
7,-0.246093,2,0,4,0,-0.842570,0,2,likes dogs and likes cats,christianity,...,0,0,1,0,0,0,0,0,0,2
14,-0.351468,10,0,4,0,-1.602366,12,2,likes dogs and has cats,catholicism,...,0,0,1,1,0,0,0,0,0,2
22,-0.140718,4,0,4,0,0.170490,7,2,likes dogs and likes cats,agnosticism and somewhat serious about it,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59873,0.280782,2,0,4,0,-0.589305,5,2,,,...,0,0,1,0,0,0,0,0,0,2
59902,-0.772967,3,3,3,0,-0.336040,18,2,has dogs and has cats,other but not too serious about it,...,0,0,1,1,0,0,0,0,0,1
59903,0.386157,4,0,4,0,-1.855631,10,2,likes dogs,,...,0,0,1,0,0,0,0,0,0,1
59923,-1.089092,7,0,4,0,0.677020,18,2,,other and very serious about it,...,0,0,0,0,1,0,0,0,0,2


# Pets

In [99]:
# Extract all pets categories
# todo: automate

PETS_CATS = [
    'has cats', 'likes cats', 'dislikes cats']

PETS_DOGS = [
    'has dogs', 'likes dogs', 'dislikes dogs']

In [137]:
# Remove nan's
df.dropna(inplace=True, subset=['pets'])


pets_encoded_header = ['pets_cats', 'pets_dogs']

# Add col header
for pets_col in pets_encoded_header:
    df[pets_col] = np.nan

# Filer
def filter_pets_cats(row_pets):    
    # compare all extracted to current row in df
    for relation in PETS_CATS:
        if relation in row_pets:
            # match
            return relation
    # no match
    return np.nan

# Filer
def filter_pets_dogs(row_pets):    
    # compare all extracted to current row in df
    for relation in PETS_DOGS:
        if relation in row_pets:
            # match
            return relation
    # no match
    return np.nan


# Hot encoding for both offspring cols
df['pets_cats'] = df.apply(lambda x: filter_pets_cats(x['pets']), axis=1)
df['pets_dogs'] = df.apply(lambda x: filter_pets_dogs(x['pets']), axis=1)

df.dropna(inplace=True, subset=['pets_cats'])
df.dropna(inplace=True, subset=['pets_dogs'])


# Encode pets_cats
pets_cats_encoder = LabelEncoder()
pets_cats_encoder.fit(df['pets_cats'])
encoded_col_pets_cats = pets_cats_encoder.transform(df['pets_cats'])
df['pets_cats'] = encoded_col_pets_cats

# Encode pets_dogs
pets_dogs_encoder = LabelEncoder()
pets_dogs_encoder.fit(df['pets_dogs'])
encoded_col_pets_dogs = pets_dogs_encoder.transform(df['pets_dogs'])
df['pets_dogs'] = encoded_col_pets_dogs


# Drop reduandant cols
df = df.drop('pets', axis=1)
df

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,job,offspring,orientation,...,speaks,status,education_status_extracted,education_institution_extracted,religion_type,religion_modifier,sign_extracted,sign_modifier_extracted,pets_cats,pets_dogs
0,22,a little extra,strictly anything,socially,never,"asian, white",75.0,transportation,"doesn&rsquo;t have kids, but might want them",straight,...,english,single,2,0,agnosticism,very serious about it,gemini,,likes cats,likes dogs
1,36,average,mostly other,often,sometimes,white,70.0,hospitality / travel,"doesn&rsquo;t have kids, but might want them",straight,...,"english (fluently), spanish (poorly), french (...",single,2,6,agnosticism,not too serious about it,cancer,,likes cats,likes dogs
7,30,average,mostly anything,socially,never,white,65.0,artistic / musical / writer,"doesn&rsquo;t have kids, but wants them",straight,...,"english, spanish (okay)",single,1,0,christianity,,sagittarius,,likes cats,likes dogs
14,29,thin,mostly anything,socially,never,"hispanic / latin, white",62.0,other,"doesn&rsquo;t have kids, but wants them",straight,...,english,single,2,0,catholicism,,taurus,,has cats,likes dogs
19,34,athletic,mostly anything,socially,never,white,72.0,science / tech / engineering,doesn&rsquo;t have kids,straight,...,english (fluently),single,1,3,catholicism,not too serious about it,pisces,and it's fun to think about,likes cats,likes dogs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59896,25,curvy,mostly vegetarian,often,sometimes,white,69.0,education / academia,doesn&rsquo;t have kids,straight,...,english,available,1,0,agnosticism,,sagittarius,but it doesn't matter,likes cats,likes dogs
59902,25,curvy,mostly other,rarely,never,"native american, hispanic / latin, white",67.0,student,"doesn&rsquo;t have kids, but might want them",straight,...,english,single,1,6,other,not too serious about it,scorpio,and it's fun to think about,has cats,has dogs
59913,29,full figured,mostly anything,socially,never,black,64.0,executive / management,doesn&rsquo;t have kids,straight,...,"english (fluently), spanish (poorly)",single,1,0,agnosticism,laughing about it,libra,but it doesn't matter,likes cats,likes dogs
59942,24,fit,mostly anything,often,sometimes,"white, other",72.0,entertainment / media,doesn&rsquo;t have kids,straight,...,english (fluently),single,2,0,agnosticism,,leo,but it doesn't matter,likes cats,likes dogs


# Religion

In [133]:
# Extract all offspring categories
# todo: automate

# Extract all religion categories
# Get all distinct values for the religion  col
religion = df.religion.unique()

# Clean
religion = [r for r in religion if str(r) != 'nan'] # remove nan values

# Extract all religion types
religion_types = []
religion_modifiers = [] 
for r in religion:
    # extraxt first half: up to 'and' or 'but'
    if 'and' in r:
        religion_extracted = r.split('and')[0]
    elif 'but' in r:
        religion_extracted = r.split('but')[0]
    else:
        religion_extracted = r
    religion_types.append(religion_extracted)
   
for r in religion:
    # extraxt first half: up to 'and' or 'but'
    if 'and' in r:
        religion_modifier_extracted = r.split('and')[1]
    elif 'but' in r:
        religion_modifier_extracted = r.split('but')[1]
    
    religion_modifiers.append(religion_modifier_extracted)


religion_types = [*set(religion_types)] # create list of "base" religions


religion_modifiers = [*set(religion_modifiers)] # create list of religion modifiers


RELIGION_TYPES = religion_types


RELIGION_MODIFIERS = religion_modifiers

print(RELIGION_TYPES)
print(RELIGION_MODIFIERS)

['judaism ', 'atheism ', 'buddhism', 'buddhism ', 'christianity', 'atheism', 'christianity ', 'agnosticism', 'islam', 'catholicism', 'agnosticism ', 'islam ', 'hinduism', 'hinduism ', 'other', 'judaism', 'catholicism ', 'other ']
[' laughing about it', ' somewhat serious about it', ' not too serious about it', ' very serious about it']


In [134]:
# Remove nan's
df.dropna(inplace=True, subset=['religion'])

relgion_encoded_header = ['religion_type', 'religion_modifier']

# Add col header
for rel_col in relgion_encoded_header:
    df[rel_col] = np.nan

# Filer
def filter_religion_type(row_religion):    
    # compare all extracted to current row in df
    for type in RELIGION_TYPES:
        if type in row_religion:
            # match
            return type
    # no match
    return np.nan

# Filter
def filter_religion_modifier(row_religion):    
    # compare all extracted to current row in df
    for relmodifier in RELIGION_MODIFIERS:
        if relmodifier in row_religion:
            # match
            return relmodifier
    # no match
    return np.nan

# Hot encoding for both offspring cols
df['religion_type'] = df.apply(lambda x: filter_religion_type(x['religion']), axis=1)
df['religion_modifier'] = df.apply(lambda x: filter_religion_modifier(x['religion']), axis=1)

################## COMMENT OUT FOR FRONTEND
df.dropna(inplace=True, subset=['religion_type'])
df.dropna(inplace=True, subset=['religion_modifier'])


# Encode religion_type
religion_type_encoder = LabelEncoder()
religion_type_encoder.fit(df['religion_type'])
encoded_col_religion_type = religion_type_encoder.transform(df['religion_type'])
df['religion_type'] = encoded_col_religion_type

# Encode religion_modifier
religion_modifier_encoder = LabelEncoder()
religion_modifier_encoder.fit(df['religion_modifier'])
encoded_col_religion_modifier = religion_modifier_encoder.transform(df['religion_modifier'])
df['religion_modifier'] = encoded_col_religion_modifier


# Drop reduandant cols
df = df.drop('religion', axis=1)

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,job,offspring,orientation,pets,sex,sign,smokes,speaks,status,education_status_extracted,education_institution_extracted,religion_type,religion_modifier
0,22,a little extra,strictly anything,socially,never,"asian, white",75.0,transportation,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,m,gemini,sometimes,english,single,2,0,agnosticism,very serious about it
1,36,average,mostly other,often,sometimes,white,70.0,hospitality / travel,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,m,cancer,no,"english (fluently), spanish (poorly), french (...",single,2,6,agnosticism,not too serious about it
7,30,average,mostly anything,socially,never,white,65.0,artistic / musical / writer,"doesn&rsquo;t have kids, but wants them",straight,likes dogs and likes cats,f,sagittarius,no,"english, spanish (okay)",single,1,0,christianity,
14,29,thin,mostly anything,socially,never,"hispanic / latin, white",62.0,other,"doesn&rsquo;t have kids, but wants them",straight,likes dogs and has cats,f,taurus,no,english,single,2,0,catholicism,
19,34,athletic,mostly anything,socially,never,white,72.0,science / tech / engineering,doesn&rsquo;t have kids,straight,likes dogs and likes cats,m,pisces and it&rsquo;s fun to think about,no,english (fluently),single,1,3,catholicism,not too serious about it
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59904,32,a little extra,mostly other,not at all,never,white,66.0,clerical / administrative,has kids,straight,likes cats,f,cancer,no,english (fluently),single,1,7,other,very serious about it
59907,25,skinny,anything,socially,sometimes,white,72.0,artistic / musical / writer,doesn&rsquo;t have kids,straight,likes dogs,m,taurus and it&rsquo;s fun to think about,sometimes,"english (fluently), spanish (okay)",single,1,0,atheism,somewhat serious about it
59913,29,full figured,mostly anything,socially,never,black,64.0,executive / management,doesn&rsquo;t have kids,straight,likes dogs and likes cats,f,libra but it doesn&rsquo;t matter,no,"english (fluently), spanish (poorly)",single,1,0,agnosticism,laughing about it
59942,24,fit,mostly anything,often,sometimes,"white, other",72.0,entertainment / media,doesn&rsquo;t have kids,straight,likes dogs and likes cats,m,leo but it doesn&rsquo;t matter,no,english (fluently),single,2,0,agnosticism,


# Sex

In [103]:
# Remove nan's
df.dropna(inplace=True, subset=['sex'])

# Encode drugs modifier
sex_encoder = LabelEncoder()
sex_encoder.fit(df['sex'])
encoded_col_sex = sex_encoder.transform(df['sex'])
df['sex'] = encoded_col_sex
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,orientation,sex,sign,...,ethnicities_black,ethnicities_other,ethnicities_pacific_islander,ethnicities_middle_eastern,offspring_status,offspring_future,pets_cats,pets_dogs,religion_type,religion_modifier
0,-1.089092,0,0,4,0,1.690080,19,2,1,gemini,...,0,0,0,0,0,1,1,1,0,3
1,0.386157,2,3,2,2,0.423755,8,2,1,cancer,...,0,0,0,0,0,1,1,1,0,1
22,-0.140718,4,0,4,0,0.170490,7,2,1,sagittarius but it doesn&rsquo;t matter,...,0,0,0,0,0,1,1,1,0,2
36,0.070032,1,0,4,0,0.170490,17,2,1,cancer and it&rsquo;s fun to think about,...,0,0,0,0,0,2,1,1,2,1
72,0.175407,1,0,4,2,1.183550,0,2,1,pisces and it&rsquo;s fun to think about,...,0,0,0,0,0,1,1,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59824,-0.667592,10,0,4,0,-0.336040,17,2,0,aquarius but it doesn&rsquo;t matter,...,0,0,0,0,0,1,1,1,4,3
59838,-0.035343,4,0,4,2,-0.082775,10,2,1,taurus and it&rsquo;s fun to think about,...,0,1,0,0,0,1,1,1,7,0
59852,-0.456842,3,0,4,0,-0.589305,2,0,0,sagittarius and it&rsquo;s fun to think about,...,0,0,0,0,0,1,1,1,0,1
59902,-0.772967,3,3,3,0,-0.336040,18,2,0,scorpio and it&rsquo;s fun to think about,...,0,0,0,0,0,1,0,0,8,1


# Sign

In [136]:
# Remove nan's
df.dropna(inplace=True, subset=['sign'])


# Extract only sign
df['sign_extracted'] = df['sign'].str.split(' ').str[0]

# Extract sign modifier
df['sign_modifier_extracted'] = df['sign'].str.split(' ').str[1:]
df['sign_modifier_extracted'] = df['sign_modifier_extracted'].apply(lambda y: '' if len(y)==0 else y) # replace empty lists with ''
df['sign_modifier_extracted'] = df['sign_modifier_extracted'].apply(lambda y: ' '.join(y) if len(y)!=0 else y) # join list of strings together
df['sign_modifier_extracted'] = df['sign_modifier_extracted'].str.replace(ZODIAC_STRING_REPLACMENT,'\'')  # replace 

################## COMMENT OUT FOR FRONTEND
# Encode sign
sign_encoder = LabelEncoder()
sign_encoder.fit(df['sign_extracted'])
encoded_col_sign = sign_encoder.transform(df['sign_extracted'])
df['sign_extracted'] = encoded_col_sign

# Encode sign modifier
sign_modifier_encoder = LabelEncoder()
sign_modifier_encoder.fit(df['sign_modifier_extracted'])
encoded_col_sign_modifier = sign_modifier_encoder.transform(df['sign_modifier_extracted'])
df['sign_modifier_extracted'] = encoded_col_sign_modifier

# Drop reduandant cols
df = df.drop('sign', axis=1)
df

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,job,offspring,orientation,...,sex,smokes,speaks,status,education_status_extracted,education_institution_extracted,religion_type,religion_modifier,sign_extracted,sign_modifier_extracted
0,22,a little extra,strictly anything,socially,never,"asian, white",75.0,transportation,"doesn&rsquo;t have kids, but might want them",straight,...,m,sometimes,english,single,2,0,agnosticism,very serious about it,gemini,
1,36,average,mostly other,often,sometimes,white,70.0,hospitality / travel,"doesn&rsquo;t have kids, but might want them",straight,...,m,no,"english (fluently), spanish (poorly), french (...",single,2,6,agnosticism,not too serious about it,cancer,
7,30,average,mostly anything,socially,never,white,65.0,artistic / musical / writer,"doesn&rsquo;t have kids, but wants them",straight,...,f,no,"english, spanish (okay)",single,1,0,christianity,,sagittarius,
14,29,thin,mostly anything,socially,never,"hispanic / latin, white",62.0,other,"doesn&rsquo;t have kids, but wants them",straight,...,f,no,english,single,2,0,catholicism,,taurus,
19,34,athletic,mostly anything,socially,never,white,72.0,science / tech / engineering,doesn&rsquo;t have kids,straight,...,m,no,english (fluently),single,1,3,catholicism,not too serious about it,pisces,and it's fun to think about
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59904,32,a little extra,mostly other,not at all,never,white,66.0,clerical / administrative,has kids,straight,...,f,no,english (fluently),single,1,7,other,very serious about it,cancer,
59907,25,skinny,anything,socially,sometimes,white,72.0,artistic / musical / writer,doesn&rsquo;t have kids,straight,...,m,sometimes,"english (fluently), spanish (okay)",single,1,0,atheism,somewhat serious about it,taurus,and it's fun to think about
59913,29,full figured,mostly anything,socially,never,black,64.0,executive / management,doesn&rsquo;t have kids,straight,...,f,no,"english (fluently), spanish (poorly)",single,1,0,agnosticism,laughing about it,libra,but it doesn't matter
59942,24,fit,mostly anything,often,sometimes,"white, other",72.0,entertainment / media,doesn&rsquo;t have kids,straight,...,m,no,english (fluently),single,2,0,agnosticism,,leo,but it doesn't matter


# Smokes

In [105]:
# Remove nan's
df.dropna(inplace=True, subset=['smokes'])

# Encode smokes modifier
smokes_encoder = LabelEncoder()
smokes_encoder.fit(df['smokes'])
encoded_col_smokes = smokes_encoder.transform(df['smokes'])
df['smokes'] = encoded_col_smokes
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,orientation,sex,smokes,...,ethnicities_pacific_islander,ethnicities_middle_eastern,offspring_status,offspring_future,pets_cats,pets_dogs,religion_type,religion_modifier,sign_extracted,sign_modifier_extracted
0,-1.089092,0,0,4,0,1.690080,19,2,1,1,...,0,0,0,1,1,1,0,3,4,0
1,0.386157,2,3,2,2,0.423755,8,2,1,0,...,0,0,0,1,1,1,0,1,2,0
22,-0.140718,4,0,4,0,0.170490,7,2,1,0,...,0,0,0,1,1,1,0,2,8,3
36,0.070032,1,0,4,0,0.170490,17,2,1,3,...,0,0,0,2,1,1,2,1,2,2
72,0.175407,1,0,4,2,1.183550,0,2,1,0,...,0,0,0,1,1,1,0,3,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59824,-0.667592,10,0,4,0,-0.336040,17,2,0,0,...,0,0,0,1,1,1,4,3,0,3
59838,-0.035343,4,0,4,2,-0.082775,10,2,1,0,...,0,0,0,1,1,1,7,0,10,2
59852,-0.456842,3,0,4,0,-0.589305,2,0,0,0,...,0,0,0,1,1,1,0,1,8,2
59902,-0.772967,3,3,3,0,-0.336040,18,2,0,4,...,0,0,0,1,0,0,8,1,9,2


# Speaks

In [131]:
# Remove nan's
df.dropna(inplace=True, subset=['speaks'])

languages = df.speaks.unique()

language = []
language_level = []

for l in languages:
    entries = l.split(', ')
    for e in entries:

        # at least on entry that has a modifier
        if e.find('(') != -1:
            # extract modifier
            res = e[e.find('(')+1:e.find(')')]
            
            # check if modifier can be appended
            if res not in language_level:
                language_level.append(res)
            
            # check if language can be appended
            if e[:e.find(' ')]:
                if e[:e.find(' ')] not in language:
                    language.append(e[:e.find(' ')])
        
        # no modifier
        else:
            # check if language can be appended
            if e not in language:
                language.append(e)



SPEAKS_LANGUAGE = language

SPEAKS_LANGUAGE_LEVEL = language_level
print(SPEAKS_LANGUAGE)
print(SPEAKS_LANGUAGE_LEVEL)

['english', 'spanish', 'french', 'portuguese', 'hebrew', 'yiddish', 'tagalog', 'russian', 'croatian', 'bengali', 'italian', 'vietnamese', 'c++', 'german', 'japanese', 'norwegian', 'swedish', 'latin', 'chinese', 'other', 'esperanto', 'sign', 'hawaiian', 'lisp', 'korean', 'ukrainian', 'sanskrit', 'thai', 'hindi', 'breton', 'ancient', 'greek', 'sign language', 'arabic', 'farsi', 'urdu', 'tamil', 'romanian', 'finnish', 'bulgarian', 'khmer', 'turkish', 'dutch', 'indonesian', 'danish', 'estonian', 'polish', 'catalan', 'gujarati', 'hungarian', 'czech', 'icelandic', 'malay', 'serbian', 'afrikaans', 'swahili', 'mongolian', 'maori', 'latvian', 'irish', 'rotuman', 'ilongo', 'persian', 'tibetan', 'cebuano', 'frisian', 'occitan', 'belarusan', 'armenian', 'lithuanian', 'slovak', 'georgian', 'ancient greek', 'slovenian', 'welsh', 'albanian', 'basque', 'chechen']
['fluently', 'poorly', 'okay']


In [107]:
speaks_encoded_header = [l.replace(' ', '_') for l in SPEAKS_LANGUAGE]

# Add col header
for speaks_col in speaks_encoded_header:
    df['speaks_'+speaks_col] = np.nan

speaks_encoded_header = ['speaks_'+l for l in speaks_encoded_header]
speaks_encoded_header = [l.replace(' ', '_') for l in speaks_encoded_header]


# Filter
def filter_speaks(s, row_speaks):    
    # compare all extracted to current row in df

    # split string into list of multiple langues + modifier
    rs = row_speaks.split(', ')

    # check if language s (current col) is in this list
    res = [i for i in rs if s in i]
    if len(res) != 0:
        # modifier:
        if '(fluently)' in res[0]:
            return 4
        if '(ok)' in res[0]:
            return 3
        if '(poorly)' in res[0]:
            return 1
        else:
            return 2
    else:
        return 0 # maybe change to np.nan


# Hot encoding for all speaks cols
for (speaks_encoded_header_col, s) in zip(speaks_encoded_header, SPEAKS_LANGUAGE):
    df[speaks_encoded_header_col] = df.apply(lambda x: filter_speaks(s, x['speaks']), axis=1)


# Drop reduandant cols
df = df.drop('speaks', axis=1)
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,orientation,sex,smokes,...,speaks_welsh,speaks_khmer,speaks_ilongo,speaks_cebuano,speaks_malay,speaks_albanian,speaks_bulgarian,speaks_basque,speaks_occitan,speaks_finnish
0,-1.089092,0,0,4,0,1.690080,19,2,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0.386157,2,3,2,2,0.423755,8,2,1,0,...,0,0,0,0,0,0,0,0,0,0
22,-0.140718,4,0,4,0,0.170490,7,2,1,0,...,0,0,0,0,0,0,0,0,0,0
36,0.070032,1,0,4,0,0.170490,17,2,1,3,...,0,0,0,0,0,0,0,0,0,0
72,0.175407,1,0,4,2,1.183550,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59824,-0.667592,10,0,4,0,-0.336040,17,2,0,0,...,0,0,0,0,0,0,0,0,0,0
59838,-0.035343,4,0,4,2,-0.082775,10,2,1,0,...,0,0,0,0,0,0,0,0,0,0
59852,-0.456842,3,0,4,0,-0.589305,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59902,-0.772967,3,3,3,0,-0.336040,18,2,0,4,...,0,0,0,0,0,0,0,0,0,0


# Status

In [108]:
# Remove nan's
df.dropna(inplace=True, subset=['status'])

# Encode drugs modifier
status_encoder = LabelEncoder()
status_encoder.fit(df['status'])
encoded_col_status = status_encoder.transform(df['status'])
df['status'] = encoded_col_status
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,orientation,sex,smokes,...,speaks_welsh,speaks_khmer,speaks_ilongo,speaks_cebuano,speaks_malay,speaks_albanian,speaks_bulgarian,speaks_basque,speaks_occitan,speaks_finnish
0,-1.089092,0,0,4,0,1.690080,19,2,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0.386157,2,3,2,2,0.423755,8,2,1,0,...,0,0,0,0,0,0,0,0,0,0
22,-0.140718,4,0,4,0,0.170490,7,2,1,0,...,0,0,0,0,0,0,0,0,0,0
36,0.070032,1,0,4,0,0.170490,17,2,1,3,...,0,0,0,0,0,0,0,0,0,0
72,0.175407,1,0,4,2,1.183550,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59824,-0.667592,10,0,4,0,-0.336040,17,2,0,0,...,0,0,0,0,0,0,0,0,0,0
59838,-0.035343,4,0,4,2,-0.082775,10,2,1,0,...,0,0,0,0,0,0,0,0,0,0
59852,-0.456842,3,0,4,0,-0.589305,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59902,-0.772967,3,3,3,0,-0.336040,18,2,0,4,...,0,0,0,0,0,0,0,0,0,0


# Export

In [109]:
### Desired format
# {name: "age", sets: [19,26,34...]},
# {name: "job", sets: ['B', 'C', 'D']},
# {name: "sign", sets: ["ariel", "batman", "idc"...]},

In [110]:
df.T.to_json('df_cleansed_removed_income_split.json', orient='split')
# df.T.to_json('df_cleansed_removed_income_values.json', orient='values')

In [111]:
f = open('df_cleansed_removed_income_split.json')
data = json.load(f)

master_data = []
for index, attribute in enumerate(data['index']):
    dict = {}
    dict["name"] = attribute
    dict["sets"] = data['data'][index]
    master_data.append(dict)
master_data

[{'name': 'age',
  'sets': [-1.0890915457,
   0.3861565234,
   -0.140717787,
   0.0700319372,
   0.1754067992,
   -0.7729669595,
   -0.5622172353,
   -0.4568423732,
   0.3861565234,
   0.0700319372,
   -0.140717787,
   -0.3514675111,
   0.1754067992,
   0.7022811096,
   1.5452800062,
   3.1259029374,
   2.7044034891,
   1.0184056959,
   1.6506548683,
   -0.9837166836,
   2.8097783512,
   -1.4052161319,
   1.8614045925,
   1.5452800062,
   -0.7729669595,
   -0.2460926491,
   -1.4052161319,
   -0.6675920974,
   -0.0353429249,
   1.22915542,
   0.4915313855,
   -0.140717787,
   1.5452800062,
   -0.2460926491,
   -0.5622172353,
   -0.7729669595,
   -1.2998412698,
   -0.140717787,
   -1.1944664078,
   -0.5622172353,
   -0.8783418215,
   0.3861565234,
   0.7022811096,
   0.9130308338,
   -1.0890915457,
   -0.4568423732,
   -0.5622172353,
   0.3861565234,
   -0.140717787,
   -0.5622172353,
   -0.2460926491,
   -0.9837166836,
   -0.7729669595,
   -1.4052161319,
   -0.8783418215,
   0.596906247

In [112]:
with open('data.txt', 'w') as f:
    for line in master_data:
        f.write(f"{line},\n")

In [113]:
df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,job,orientation,sex,smokes,...,speaks_welsh,speaks_khmer,speaks_ilongo,speaks_cebuano,speaks_malay,speaks_albanian,speaks_bulgarian,speaks_basque,speaks_occitan,speaks_finnish
0,-1.089092,0,0,4,0,1.690080,19,2,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0.386157,2,3,2,2,0.423755,8,2,1,0,...,0,0,0,0,0,0,0,0,0,0
22,-0.140718,4,0,4,0,0.170490,7,2,1,0,...,0,0,0,0,0,0,0,0,0,0
36,0.070032,1,0,4,0,0.170490,17,2,1,3,...,0,0,0,0,0,0,0,0,0,0
72,0.175407,1,0,4,2,1.183550,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59824,-0.667592,10,0,4,0,-0.336040,17,2,0,0,...,0,0,0,0,0,0,0,0,0,0
59838,-0.035343,4,0,4,2,-0.082775,10,2,1,0,...,0,0,0,0,0,0,0,0,0,0
59852,-0.456842,3,0,4,0,-0.589305,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59902,-0.772967,3,3,3,0,-0.336040,18,2,0,4,...,0,0,0,0,0,0,0,0,0,0


In [175]:
df_someAttempt = df.copy()

In [176]:
df_someAttempt =df_someAttempt[['age', 'height', 'body_type', 'diet', 'drinks', 'drugs', 'orientation', 'sex', 'smokes', 'status', 'religion_type', 'job', 'sign_extracted', 'pets_cats', 'pets_dogs', 'offspring_status', 'offspring_future'  ]] #offspring / job /sign

In [128]:
df = df.drop(['income'], axis=1).dropna()

In [174]:
df.dropna()

Unnamed: 0,age,body_type,diet,drinks,drugs,ethnicity,height,job,orientation,sex,...,education_status_extracted,education_institution_extracted,religion_type,religion_modifier,sign_extracted,sign_modifier_extracted,pets_cats,pets_dogs,offspring_status,offspring_future
0,22,a little extra,strictly anything,socially,never,"asian, white",75.0,transportation,straight,m,...,2,0,agnosticism,very serious about it,gemini,,likes cats,likes dogs,doesn't have kids,might want
1,36,average,mostly other,often,sometimes,white,70.0,hospitality / travel,straight,m,...,2,6,agnosticism,not too serious about it,cancer,,likes cats,likes dogs,doesn't have kids,might want
22,31,fit,mostly anything,socially,never,white,69.0,executive / management,straight,m,...,1,0,agnosticism,somewhat serious about it,sagittarius,but it doesn't matter,likes cats,likes dogs,doesn't have kids,might want
36,33,athletic,mostly anything,socially,never,white,69.0,science / tech / engineering,straight,m,...,1,0,buddhism,not too serious about it,cancer,and it's fun to think about,likes cats,likes dogs,doesn't have kids,wants
72,34,athletic,mostly anything,socially,sometimes,"native american, white",73.0,artistic / musical / writer,straight,m,...,1,6,agnosticism,very serious about it,pisces,and it's fun to think about,likes cats,likes dogs,doesn't have kids,might want
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59824,26,thin,mostly anything,socially,never,white,67.0,science / tech / engineering,straight,f,...,1,0,christianity,very serious about it,aquarius,but it doesn't matter,likes cats,likes dogs,doesn't have kids,might want
59838,32,fit,mostly anything,socially,sometimes,"white, other",68.0,medicine / health,straight,m,...,1,5,judaism,laughing about it,taurus,and it's fun to think about,likes cats,likes dogs,doesn't have kids,might want
59852,28,curvy,anything,socially,never,white,66.0,clerical / administrative,bisexual,f,...,2,3,agnosticism,not too serious about it,sagittarius,and it's fun to think about,likes cats,likes dogs,doesn't have kids,might want
59902,25,curvy,mostly other,rarely,never,"native american, hispanic / latin, white",67.0,student,straight,f,...,1,6,other,not too serious about it,scorpio,and it's fun to think about,has cats,has dogs,doesn't have kids,might want


In [177]:
df_someAttempt = df_someAttempt.dropna().reset_index().drop(['index'], axis=1)

In [151]:
df_someAttempt.to_json("test2.json", orient="split")

In [152]:
df_someAttempt.to_json("test3.json", orient="records")

In [190]:
compString = ''
for index, attribute in df_someAttempt.iterrows():
    dataString = f'{{ "name" : "Person{index}", "age": {attribute.age}, "sex": "{attribute.sex}", "height": {attribute.height}, "sets": ['
    for i, a in enumerate(attribute):
        if (i != 0 and i != 1 and i != 7):
            dataString += f'"{attribute.index[i]} - {a}"'
        if i != len(attribute):
            dataString += ','
    dataString += ']},'
    compString += dataString
    with open('data_file.json', 'a') as f:
        f.write(dataString + "\n")


In [None]:


with open('data.txt', 'w') as f:
    for line in master_data:
        f.write(f"{compString},\n")