In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

In [91]:
# use PCA to reduce dimensionality of religion/ pets/ etc?
# if I do this, can I map for prediction
# create tone of essays and see if they correlate (PCA here?)
# simplify the over-expressed variables.

In [92]:
df = pd.read_csv('profiles.csv')
df1 = df

In [93]:
df1.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')

In [94]:
# onehotencode m/f

# break off column to OHE - this is done to avoid renaming all of the passthrough columns
temp_df = df1[['sex']]

# OHE encode 
transformer = make_column_transformer((OneHotEncoder(), ['sex']), remainder = 'passthrough', sparse_threshold=0)
transformed = transformer.fit_transform(temp_df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())

# rename the hot mess of column names that come from onehotencoder
col_names = {'onehotencoder__sex_f': 'f', 'onehotencoder__sex_m': 'm'}
transformed_df.rename(columns = col_names, inplace = True)

# reassemble and reorder the df
df1 = pd.concat([df1, transformed_df], axis=1)
df1 = df1[['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'f', 'm', 'sign', 'smokes', 'speaks', 'status']]

In [95]:
# onehotencode body_type
# I would love to encode the nan's as 'rather not say' but I better not as they could arise from other issues (web/ UI errors)

# isolate column for ohe
body_temp_df = df1[['body_type']]

# ohe now treats nan's as their own category https://stackoverflow.com/a/72379323

# OHE encode 
b_transformer = make_column_transformer((OneHotEncoder(), ['body_type']), remainder = 'passthrough', sparse_threshold = 0)
b_transformed = b_transformer.fit_transform(body_temp_df)
b_transformed_df = pd.DataFrame(b_transformed, columns=b_transformer.get_feature_names_out())
# rename the hot mess of column names that come from onehotencoder
b_transformed_df = b_transformed_df.rename(columns=lambda x: re.sub('onehotencoder__','',x))

# reassemble and reorder the df
df1 = pd.concat([df1, b_transformed_df], axis=1)

In [96]:
# create diet dummies

# isolate column for ohe
diet_df = df1[['diet']]

# OHE encode
diet_transformer = make_column_transformer((OneHotEncoder(), ['diet']), remainder = 'passthrough', sparse_threshold = 0)
diet_transformed = diet_transformer.fit_transform(diet_df)
diet_transformed_df = pd.DataFrame(diet_transformed, columns = diet_transformer.get_feature_names_out())
                                           
# rename
diet_transformed_df = diet_transformed_df.rename(columns = lambda x: re.sub('onehotencoder__', '', x))

# combine dfs
df1 = pd.concat([df1, diet_transformed_df], axis = 1)
               
# review
# df1.to_csv('test.csv')

In [97]:
# create diet strictness measure

# build decision function for apply
def diet_to_cat(x):
    if 'anything' in x: return 0
    elif 'strictly' in x: return 3
    elif 'mostly' in x: return 1
    else: return 2

# get rid of nan's to avoid float error
df1['diet'].fillna('', inplace=True)

# cast as string to avoid object
df['diet'] = df['diet'].astype('str')

# create numeric variable
df1['diet_strictness'] = df1['diet'].apply(diet_to_cat)

# define as categorical, ordered data
diet_temp = pd.Categorical(df1['diet_strictness'], ordered=True, categories=[0, 1, 2, 3])

# replace data in df
df1['diet_strictness'] = diet_temp

In [107]:
# would be interesting to view the percentages of male and female at the ends of this distribution (desperately/ not at all)
sns.barplot(x = df1['drinks'], data = df1, hue = 'sex')

TypeError: Horizontal orientation requires numeric `x` variable.

In [101]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 66 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   age                       59946 non-null  int64   
 1   body_type                 54650 non-null  object  
 2   diet                      59946 non-null  object  
 3   drinks                    56961 non-null  object  
 4   drugs                     45866 non-null  object  
 5   education                 53318 non-null  object  
 6   essay0                    54458 non-null  object  
 7   essay1                    52374 non-null  object  
 8   essay2                    50308 non-null  object  
 9   essay3                    48470 non-null  object  
 10  essay4                    49409 non-null  object  
 11  essay5                    49096 non-null  object  
 12  essay6                    46175 non-null  object  
 13  essay7                    47495 non-null  obje

In [100]:
# df.replace(r"<[^<]+?>", " ", regex=True, inplace=True)
# df.replace(r"\n", " ", regex=True, inplace=True)
# df.replace(r"&amp;", "&", regex=True, inplace=True)
# df.replace(r"&rsquo;", "'", regex=True, inplace=True)