In [1]:
import random

import numpy as np
import pandas as pd
from num2words import num2words
from scipy.stats import truncnorm
from sklearn.impute import SimpleImputer

from transformers_ import HeightWeightHandler, AgeExtractor
from utils import *

# Notebook-wide configuration.
pd.set_option('display.max_rows', 500)  # allow up to 500 rows in rendered output

# The saved `df.head(10)` previews in this notebook show different rows, i.e.
# the synthetic data changes from run to run. Seed the global RNGs once, before
# any column is generated, so "Restart & Run All" reproduces the same frame.
# NOTE(review): the generators live in `utils` and are not visible here;
# presumably they draw from `random` and/or NumPy's global RNG (which scipy's
# `truncnorm.rvs` uses when no `random_state` is passed) — confirm, and pass a
# seed explicitly if they create their own generator.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Empty frame that the following cell fills one column at a time.
df = pd.DataFrame()

In [2]:
# Populate the frame with synthetic columns. The generators come from `utils`
# (star import), so their exact behaviour is not visible in this notebook.
transport_modes = ['Automobile', 'Public_Transportation', 'Motorbike', 'Walking', 'Bike']

# Ordered mapping of column name -> zero-argument factory. Insertion order is
# both the order in which the generators are called and the column order.
column_generators = {
    'age': generate_age_column,
    'height_weight': generate_height_weight_column,
    'transportation': lambda: generate_categorical_column(transport_modes),
    'recorded': generate_yes_no_column,
    'price': generate_missing_numeric_column,
    # NOTE(review): (1, 10) and (1, 3) look like value bounds — confirm in utils.
    'walked': lambda: generate_missing_numeric_column(1, 10),
    'eaten': lambda: generate_missing_numeric_column(1, 3),
}

for column_name, make_column in column_generators.items():
    df[column_name] = make_column()

In [3]:
# Preview the first ten generated rows before any cleaning is applied.
df.head(n=10)

Unnamed: 0,age,height_weight,transportation,recorded,price,walked,eaten
0,4,1.8836004374130257_eighty,Automobile,yes,9482.0,7.0,2.0
1,twenty-eight,2.021116488237775;79,Walking,no,5485.0,8.0,
2,fifty-seven,"1.4602621272368825,129",Motorbike,no,,,
3,88,1.7875329886147826_forty-seven,Automobile,no,3786.0,2.0,
4,one,1.8561218603022664_119,Walking,yes,8224.0,,
5,thirty-eight,1.806542615566776;one hundred and forty-one,Walking,yes,,4.0,3.0
6,75,1.5125575028980736-one hundred and six,Motorbike,no,6609.0,2.0,
7,forty-one,1.8124652741654181-one hundred and seventeen,Automobile,no,,,
8,56,"1.7601574187688565,115",Motorbike,no,7025.0,,
9,49,1.9320717051853578-145,Walking,no,5260.0,,


In [4]:
# Count the missing values in each column.
df.isna().sum()

age                0
height_weight      0
transportation     0
recorded           0
price             50
walked            56
eaten             58
dtype: int64

In [4]:
# Impute the numeric columns that contain missing values.
mean_col = ['price', 'walked']  # gaps are replaced by the column mean
const_col = ['eaten']           # gaps are replaced by the constant 0

mean_imputer = SimpleImputer(strategy='mean')
const_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Each imputer is fitted on, and applied to, its own columns of the full frame.
for imputer, columns in ((mean_imputer, mean_col), (const_imputer, const_col)):
    df[columns] = imputer.fit_transform(df[columns])

In [5]:
# Derive model-ready features from the raw columns. The two transformers come
# from the local `transformers_` module and are not visible here.
# NOTE(review): judging by the final preview, AgeExtractor yields a numeric
# `age` and HeightWeightHandler yields `height` / `weight` columns — confirm.
age_extracted = AgeExtractor().fit_transform(df['age'])
heights_weights = HeightWeightHandler('height_weight').fit_transform(df)

# One indicator column per transportation mode.
transport_dummy = pd.get_dummies(df['transportation'])

# Encode yes/no as 1/0. A dict lookup with pass-through is used instead of
# `Series.replace`, whose silent object -> int downcasting is deprecated in
# pandas 2.2+ (it warns now and would later leave an object-dtype column).
# Values missing from the mapping pass through unchanged, as `replace` did, so
# re-running the cell on already-encoded 0/1 values is harmless. The explicit
# cast guarantees an integer column and fails loudly on an unexpected label.
recorded_codes = {'no': 0, 'yes': 1}
df['recorded'] = (
    df['recorded']
    .map(lambda label: recorded_codes.get(label, label))
    .astype('int64')
)

In [6]:
# Remove the raw columns whose processed versions were computed in the previous
# cell. `df` is rebound to the result instead of using `inplace=True`, so the
# frame is not mutated behind any other reference to it, and `columns=` states
# the intent without a positional label list plus `axis=1`.
df = df.drop(columns=['age', 'height_weight', 'transportation'])

In [7]:
# Assemble the final frame column-wise: extracted age, height/weight,
# transportation indicators, then the columns still left in `df`.
feature_frames = [age_extracted, heights_weights, transport_dummy, df]
df = pd.concat(feature_frames, axis='columns')

In [8]:
# Preview the first ten rows of the assembled, cleaned frame.
df.head(n=10)

Unnamed: 0,age,height,weight,Automobile,Bike,Motorbike,Public_Transportation,Walking,recorded,price,walked,eaten
0,82,1.471906,103.0,False,False,False,False,True,1,8451.0,6.26087,2.0
1,94,1.505534,90.0,False,True,False,False,False,1,5659.0,6.26087,0.0
2,20,1.611056,85.0,False,False,False,False,True,1,2747.0,9.0,2.0
3,64,1.546454,80.0,False,False,False,True,False,0,5659.0,6.26087,0.0
4,29,1.458686,109.0,False,True,False,False,False,1,3851.0,6.26087,2.0
5,15,1.433727,42.0,False,False,True,False,False,0,9223.0,4.0,2.0
6,32,1.340496,131.0,True,False,False,False,False,1,4956.0,6.26087,0.0
7,73,1.338687,128.0,False,True,False,False,False,0,5659.0,4.0,0.0
8,100,2.014705,104.0,False,True,False,False,False,0,5659.0,6.26087,0.0
9,73,1.922467,50.0,False,False,False,True,False,1,5659.0,3.0,0.0
