In [1]:
import pandas as pd
import random
from num2words import num2words
pd.set_option('display.max_rows', 500)

def generate_age():
    choice = random.choice(['integer', 'word'])
    if choice == 'integer':
        return random.randint(1, 100)  # Generate random integer age
    elif choice == 'word':
        return num2words(random.randint(1, 100))

data = {'age': [generate_age() for _ in range(100)]}
df = pd.DataFrame(data)

df.head(10)

Unnamed: 0,age
0,seventy
1,37
2,thirty-five
3,83
4,fifty
5,13
6,forty-one
7,55
8,10
9,twenty-five


In [2]:
# df['column_name'] = df['column_name'].replace({'no': 0, 'yes': 1})

In [3]:
from scipy.stats import truncnorm

a = (1.3 - 1.7) / 0.2
b = (2.3 - 1.7) / 0.2 

def generate_height_weight():
    choice = random.choice([';', ',', '-', '_'])
    words = random.choice([True, False])
    height = truncnorm.rvs(a, b, loc=1.7, scale=0.2, size=1)[0]
    weight = random.randrange(40, 150)
    if words:
        weight = num2words(weight)
    return str(height)+choice+str(weight)

data = {'height_weight': [generate_height_weight() for _ in range(100)]}
df['height_weight'] = pd.DataFrame(data)

df.head(10)

Unnamed: 0,age,height_weight
0,seventy,"1.7113774807242084,forty-four"
1,37,2.0413220086751016_one hundred and forty-three
2,thirty-five,1.7115635001684846-51
3,83,1.6303960896024217;90
4,fifty,1.6828359774992987;one hundred and fifteen
5,13,1.7889365772941719;52
6,forty-one,1.8323095413930819_119
7,55,1.7888698372126295_ninety-two
8,10,1.5528475431277362_51
9,twenty-five,1.7671906461054805;116


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        return X[self.feature_names]
    
class AgeExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.word_to_num = {num2words(i): i for i in range(150)}
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        def convert_to_int(x):
            try:
                return int(x)
            except ValueError:
                return self.word_to_num[x.lower()]
        return X.apply(lambda x: convert_to_int(x))
    
ages = FeatureSelector('age').fit_transform(df)
ages_extracted = AgeExtractor().fit_transform(ages)

In [5]:
class HeightWeightHandler(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.possible_splits = [',', '_', '-', ';']
        self.word_to_num = {num2words(i) : i for i in range(300)}
        self.column_name = column_name
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        def splitter(x):
            for char in x:
                if char in self.possible_splits:
                    height, weight = x.split(char, 1)
                    return height, weight
            return None
        
        def convert_to_int(x): 
            try:
                return int(x)
            except ValueError:
                return self.word_to_num[x.lower()]
        
        def handler(x):
            height, weight = splitter(x)
            weight = convert_to_int(weight)
            height = float(height)
            return pd.Series({'height': height, 'weight': weight})
        
        return X[self.column_name].apply(handler)

heights_weights = HeightWeightHandler('height_weight').fit_transform(df)

In [6]:
pd.concat([ages_extracted, heights_weights], axis=1).head(10)

Unnamed: 0,age,height,weight
0,70,1.711377,44.0
1,37,2.041322,143.0
2,35,1.711564,51.0
3,83,1.630396,90.0
4,50,1.682836,115.0
5,13,1.788937,52.0
6,41,1.83231,119.0
7,55,1.78887,92.0
8,10,1.552848,51.0
9,25,1.767191,116.0


In [7]:
def generate_transp_mode():
    categories = ['Automobile', 'Public_Transportation', 'Motorbike', 'Walking', 'Bike']
    category = random.choice(categories)
    return category

data = {'transportation': [generate_transp_mode() for _ in range(100)]}
df['transportation'] = pd.DataFrame(data)
df.head(5)

Unnamed: 0,age,height_weight,transportation
0,seventy,"1.7113774807242084,forty-four",Bike
1,37,2.0413220086751016_one hundred and forty-three,Motorbike
2,thirty-five,1.7115635001684846-51,Public_Transportation
3,83,1.6303960896024217;90,Walking
4,fifty,1.6828359774992987;one hundred and fifteen,Motorbike


In [8]:
categories = ['Automobile', 'Public_Transportation', 'Motorbike', 'Walking', 'Bike']

In [12]:
t = pd.get_dummies(df['transportation'])

In [13]:
df.drop(['transportation'], axis=1, inplace=True)

In [15]:
df = pd.concat([df, t], axis=1)

In [16]:
df

Unnamed: 0,age,height_weight,Automobile,Bike,Motorbike,Public_Transportation,Walking
0,seventy,"1.7113774807242084,forty-four",False,True,False,False,False
1,37,2.0413220086751016_one hundred and forty-three,False,False,True,False,False
2,thirty-five,1.7115635001684846-51,False,False,False,True,False
3,83,1.6303960896024217;90,False,False,False,False,True
4,fifty,1.6828359774992987;one hundred and fifteen,False,False,True,False,False
5,13,1.7889365772941719;52,True,False,False,False,False
6,forty-one,1.8323095413930819_119,False,False,True,False,False
7,55,1.7888698372126295_ninety-two,False,False,True,False,False
8,10,1.5528475431277362_51,False,False,True,False,False
9,twenty-five,1.7671906461054805;116,False,False,False,True,False
