In [1]:
import pandas as pd
import numpy as np

# Load the auto-mpg sheet and give its columns readable names.
# NOTE(review): the describe() output recorded for this cell reports 391 rows. If the
# sheet has no header row, read_excel's default header=0 is consuming the first record
# as column labels — confirm against the file and pass header=None if so.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# Missing horsepower values are stored as the string '?'.
# The cleaned column is assigned back explicitly: calling
# replace(..., inplace=True) on the object returned by df['horsepower'] is a
# chained in-place update, which is deprecated and leaves df untouched when
# pandas Copy-on-Write is active (the later float cast would then fail on '?').
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])          # drop the rows that had '?'
    .astype({'horsepower': 'float'})        # remaining values are numeric
)

print(df['horsepower'].describe())

count    391.000000
mean     104.404092
std       38.518732
min       46.000000
25%       75.000000
50%       93.000000
75%      125.000000
max      230.000000
Name: horsepower, dtype: float64


In [2]:
# Max-abs scaling: divide every value by the largest absolute value in the column.
# Taking abs() first keeps the normalization valid when negative values are present.
max_abs_hp = df['horsepower'].abs().max()
df['horsepower'] = df['horsepower'] / max_abs_hp
df['horsepower'].head()

0    0.717391
1    0.652174
2    0.652174
3    0.608696
4    0.860870
Name: horsepower, dtype: float64

In [3]:
# Summary statistics of the horsepower column in its current state.
df['horsepower'].describe()

count    391.000000
mean       0.453931
std        0.167473
min        0.200000
25%        0.326087
50%        0.404348
75%        0.543478
max        1.000000
Name: horsepower, dtype: float64

In [4]:
# Min-max scaling
# (each value - minimum) / (maximum - minimum)

import pandas as pd
import numpy as np

# Reload the raw sheet so this section starts from unscaled values.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# '?' marks a missing horsepower value. Assign the replaced column back instead
# of calling replace(..., inplace=True) on df['horsepower']: that chained
# in-place form is deprecated and does not modify df under pandas Copy-on-Write.
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])
    .astype({'horsepower': 'float'})
)

print(df['horsepower'].describe())

count    391.000000
mean     104.404092
std       38.518732
min       46.000000
25%       75.000000
50%       93.000000
75%      125.000000
max      230.000000
Name: horsepower, dtype: float64


In [5]:
# Manual min-max scaling: shift the column so its minimum is 0, then divide by
# the range so its maximum becomes 1.
hp_min = df['horsepower'].min()
hp_max = df['horsepower'].max()

min_x = df['horsepower'] - hp_min   # distance of each value from the minimum
min_max = hp_max - hp_min           # full range of the column
df['horsepower'] = min_x / min_max

df['horsepower'].describe()

count    391.000000
mean       0.317414
std        0.209341
min        0.000000
25%        0.157609
50%        0.255435
75%        0.429348
max        1.000000
Name: horsepower, dtype: float64

In [6]:
# First rows of the horsepower column in its current state.
df['horsepower'].head()

0    0.646739
1    0.565217
2    0.565217
3    0.510870
4    0.826087
Name: horsepower, dtype: float64

In [8]:
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import numpy as np

# Reload the raw sheet so the scaler is fitted on unscaled values.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# '?' marks a missing horsepower value. Assign the replaced column back instead
# of calling replace(..., inplace=True) on df['horsepower']: that chained
# in-place form is deprecated and does not modify df under pandas Copy-on-Write.
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])
    .astype({'horsepower': 'float'})
)

# Same min-max transformation, delegated to scikit-learn.
# fit_transform expects a 2-D input (hence the double brackets) and returns an
# (n, 1) array, which is flattened before being stored back as a column.
scaler = MinMaxScaler()
df['horsepower'] = scaler.fit_transform(df[['horsepower']]).ravel()
df['horsepower'].head()

0    0.646739
1    0.565217
2    0.565217
3    0.510870
4    0.826087
Name: horsepower, dtype: float64

In [10]:
# Z-score scaling

import pandas as pd
import numpy as np

# Reload the raw sheet so this section starts from unscaled values.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# '?' marks a missing horsepower value. Assign the replaced column back instead
# of calling replace(..., inplace=True) on df['horsepower']: that chained
# in-place form is deprecated and does not modify df under pandas Copy-on-Write.
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])
    .astype({'horsepower': 'float'})
)

# Mean of the unscaled column; kept in `mean` for the standardization step.
mean = np.mean(df['horsepower'], axis=0)
print(mean)
print()
print(df['horsepower'].describe())

104.40409207161126

count    391.000000
mean     104.404092
std       38.518732
min       46.000000
25%       75.000000
50%       93.000000
75%      125.000000
max      230.000000
Name: horsepower, dtype: float64


In [12]:
# Standard deviation, computed two ways.
# np.std defaults to the population formula (ddof=0), while Series.std defaults
# to the sample formula (ddof=1); the defaults are spelled out here, which is
# why the two printed values differ slightly.
std1 = np.std(df['horsepower'], ddof=0)
std2 = df['horsepower'].std(ddof=1)
for value in (std1, std2):
    print(value)

38.46944359711783
38.51873182181278


In [16]:
# Manual z-score: centre on the mean, then divide by the sample standard deviation.
# `mean` and `std2` come from earlier cells and were computed on the unscaled
# column, so this cell should be run once per reload of df.
mean_x = df['horsepower'].sub(mean)
df['horsepower'] = mean_x.div(std2)
print(df['horsepower'].head())

0    1.573154
1    1.183733
2    1.183733
3    0.924119
4    2.429880
Name: horsepower, dtype: float64


In [17]:
# Summary statistics of the horsepower column in its current state.
print(df['horsepower'].describe())

count    3.910000e+02
mean    -1.453796e-16
std      1.000000e+00
min     -1.516252e+00
25%     -7.633712e-01
50%     -2.960661e-01
75%      5.346985e-01
max      3.260645e+00
Name: horsepower, dtype: float64


In [18]:
import pandas as pd
import numpy as np

# Reload the raw sheet so the scaler is fitted on unscaled values.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# '?' marks a missing horsepower value. Assign the replaced column back instead
# of calling replace(..., inplace=True) on df['horsepower']: that chained
# in-place form is deprecated and does not modify df under pandas Copy-on-Write.
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])
    .astype({'horsepower': 'float'})
)

from sklearn.preprocessing import StandardScaler

# Z-score scaling delegated to scikit-learn. StandardScaler divides by the
# population standard deviation (ddof=0), so its results differ slightly from a
# manual computation that uses the pandas sample standard deviation (ddof=1).
# fit_transform returns an (n, 1) array, flattened before column assignment.
scaler = StandardScaler()
df['horsepower'] = scaler.fit_transform(df[['horsepower']]).ravel()
print(df['horsepower'].head())

0    1.575170
1    1.185250
2    1.185250
3    0.925303
4    2.432994
Name: horsepower, dtype: float64


In [19]:
# Summary statistics of the horsepower column in its current state.
df['horsepower'].describe()

count    3.910000e+02
mean    -2.180694e-16
std      1.001281e+00
min     -1.518194e+00
25%     -7.643493e-01
50%     -2.964455e-01
75%      5.353836e-01
max      3.264823e+00
Name: horsepower, dtype: float64

In [21]:
# Robust scaling
# Uses the median (50th percentile) and the IQR

import pandas as pd
import numpy as np

# Reload the raw sheet so this section starts from unscaled values.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# '?' marks a missing horsepower value. Assign the replaced column back instead
# of calling replace(..., inplace=True) on df['horsepower']: that chained
# in-place form is deprecated and does not modify df under pandas Copy-on-Write.
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])
    .astype({'horsepower': 'float'})
)

# Median of the unscaled column; kept in `median` for the centring step.
median = df['horsepower'].median()
print(median)
print()
print(df['horsepower'].describe())

93.0

count    391.000000
mean     104.404092
std       38.518732
min       46.000000
25%       75.000000
50%       93.000000
75%      125.000000
max      230.000000
Name: horsepower, dtype: float64


In [22]:
# Centre the column on its median; the median of the centred values is printed
# as a check.
median_x = df['horsepower'].sub(median)
print(median_x.median())

0.0


In [23]:
# Interquartile range: distance between the 25th and 75th percentiles.
iqr_25, iqr_75 = np.percentile(df['horsepower'], [25, 75])
iqr = iqr_75 - iqr_25
iqr

50.0

In [24]:
# Manual robust scaling: median-centred values divided by the IQR.
# `median_x` and `iqr` come from the preceding cells.
df['horsepower'] = median_x.div(iqr)
print(df['horsepower'].head())

0    1.44
1    1.14
2    1.14
3    0.94
4    2.10
Name: horsepower, dtype: float64


In [25]:
import pandas as pd
import numpy as np

# Reload the raw sheet so the scaler is fitted on unscaled values.
df = pd.read_excel('/content/auto-mpg.xlsx')

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'model_year', 'origin', 'name']

# '?' marks a missing horsepower value. Assign the replaced column back instead
# of calling replace(..., inplace=True) on df['horsepower']: that chained
# in-place form is deprecated and does not modify df under pandas Copy-on-Write.
df = (
    df.assign(horsepower=df['horsepower'].replace('?', np.nan))
    .dropna(subset=['horsepower'])
    .astype({'horsepower': 'float'})
)

from sklearn.preprocessing import RobustScaler

# Robust scaling delegated to scikit-learn: with its default settings the
# scaler centres on the median and divides by the 25th-75th percentile range.
# fit_transform returns an (n, 1) array, flattened before column assignment.
scaler = RobustScaler()
df['horsepower'] = scaler.fit_transform(df[['horsepower']]).ravel()
print(df['horsepower'].head())

0    1.44
1    1.14
2    1.14
3    0.94
4    2.10
Name: horsepower, dtype: float64
