In [7]:
import os
import pandas as pd
from scipy.stats import zscore
import numpy as np
# Load the auto-mpg data set ('NA' and '?' are read as missing values) and
# replace the mpg column with its z-score in the same expression.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
).assign(mpg=lambda frame: zscore(frame['mpg']))

# Preview the first five rows.
display(df.head(5))

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,-0.706439,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,-1.090751,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,-0.706439,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,-0.962647,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,-0.834543,8,302.0,140.0,3449,10.5,70,1,ford torino


In [4]:
# Dummy (one-hot) encoding of the categorical 'area' column.
# 'NA' and '?' in the CSV are read as missing values.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA', '?'])

display(df.head(5))

# Distinct area labels, in order of first appearance.
areas = df['area'].unique().tolist()
print(f'Number of areas: {len(areas)}')
print(f'Areas: {areas}')

# One indicator column per area, named 'area_<label>'.
# dtype=int pins the indicators to 0/1 integers: without it pandas >= 2.0
# returns bool columns (printed as True/False), while older releases
# returned uint8 0/1 values.
dummies = pd.get_dummies(df['area'], prefix='area', dtype=int)
print(dummies[0:10]) # Just show the first 10

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


Number of areas: 4
Areas: ['c', 'd', 'a', 'b']
   area_a  area_b  area_c  area_d
0       0       0       1       0
1       0       0       1       0
2       0       0       1       0
3       0       0       1       0
4       0       0       0       1
5       0       0       1       0
6       0       0       0       1
7       1       0       0       0
8       0       0       1       0
9       1       0       0       0


In [5]:
# Swap the categorical 'area' column for its one-hot indicator columns.
# This is written as a single expression (no inplace drop) so that `df` is
# only rebound once the whole operation has succeeded: re-running this cell
# by accident raises KeyError on the already-removed 'area' column and leaves
# `df` untouched, instead of first appending a second copy of the dummies.
df = pd.concat([df.drop(columns='area'), dummies], axis=1)

# Preview the first ten rows of a few original columns plus the indicators.
display(df[['id', 'job', 'income',
            'area_a', 'area_b', 'area_c', 'area_d']].head(10))

Unnamed: 0,id,job,income,area_a,area_b,area_c,area_d
0,1,vv,50876.0,0,0,1,0
1,2,kd,60369.0,0,0,1,0
2,3,pe,55126.0,0,0,1,0
3,4,11,51690.0,0,0,1,0
4,5,kl,28347.0,0,0,0,1
5,6,e2,70854.0,0,0,1,0
6,7,kl,38726.0,0,0,0,1
7,8,nb,55162.0,1,0,0,0
8,9,al,67311.0,0,0,1,0
9,10,pe,63344.0,1,0,0,0


In [11]:
# Target encoding: build a ten-row example frame with one random numeric
# column (cont_9), two categorical columns (x_0, x_1) and a binary target (y).
np.random.seed(43)  # fixed seed so the cont_9 values are reproducible
df = pd.DataFrame(dict(
    cont_9=np.random.rand(10) * 100,
    x_0=list('aaaaabbbbb'),
    x_1=list('ccccccccc') + ['d'],
    y=[1, 0, 1, 1, 1, 1, 0, 0, 0, 0],
))

# Render the example frame in full (it has only ten rows).
display(df)


Unnamed: 0,cont_9,x_0,x_1,y
0,11.505457,a,c,1
1,60.906654,a,c,0
2,13.339096,a,c,1
3,24.058962,a,c,1
4,32.713906,a,c,1
5,85.913749,b,c,1
6,66.609021,b,c,0
7,54.116221,b,c,0
8,2.901382,b,c,0
9,73.37483,b,d,0


In [12]:
#https://maxhalford.github.io/blog/target-encoding/
def calc_smooth_mean(df, by, on, m):
    """Return each row's smoothed group mean of ``on``, grouped by ``by``.

    For every distinct value of ``df[by]`` the encoded value is

        (group_sum + m * global_mean) / (group_count + m)

    where ``group_sum`` and ``group_count`` are taken over the non-missing
    ``df[on]`` values of that group. The larger ``m`` is, the more a small
    group is pulled towards the global mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    by : column label
        Column whose values define the groups.
    on : column label
        Numeric column to average (the target).
    m : number
        Weight given to the global mean.

    Returns
    -------
    pandas.Series
        One value per row of ``df`` (same index): the smoothed mean of the
        group that row belongs to. Rows whose ``by`` value is missing belong
        to no group and map to NaN.

    Notes
    -----
    The statistics are computed from the same frame they are mapped onto,
    so each row's own ``on`` value contributes to its encoding.
    """
    # Global mean of the target; acts as the prior every group shrinks towards.
    prior = df[on].mean()

    # Per-group count and sum of the non-missing target values.
    # The sum is used rather than count * mean because a group whose targets
    # are all missing has count 0 and mean NaN, and 0 * NaN would give NaN;
    # its sum is 0, so such a group correctly falls back to the prior.
    stats = df.groupby(by)[on].agg(['count', 'sum'])

    # Smoothed mean per group.
    smooth = (stats['sum'] + m * prior) / (stats['count'] + m)

    # Replace each group label by that group's smoothed mean.
    return df[by].map(smooth)
# Try it on the example frame above with a weight of 10: both categorical
# columns are replaced by the smoothed mean of y within each category.
df = df.assign(
    x_0=calc_smooth_mean(df, by='x_0', on='y', m=10),
    x_1=calc_smooth_mean(df, by='x_1', on='y', m=10),
)

In [13]:
# Show the frame with x_0 and x_1 replaced by their smoothed target means.
df

Unnamed: 0,cont_9,x_0,x_1,y
0,11.505457,0.6,0.526316,1
1,60.906654,0.6,0.526316,0
2,13.339096,0.6,0.526316,1
3,24.058962,0.6,0.526316,1
4,32.713906,0.6,0.526316,1
5,85.913749,0.4,0.526316,1
6,66.609021,0.4,0.526316,0
7,54.116221,0.4,0.526316,0
8,2.901382,0.4,0.526316,0
9,73.37483,0.4,0.454545,0
