In [27]:
import pandas as pd
import numpy as np

In [21]:
# Load the dataset from the CSV file next to the notebook into `df`.
# The path is relative, so the kernel's working directory must contain mpg.csv.
df = pd.read_csv('./mpg.csv')

In [22]:
# Preview the first five values of the mpg column.
df['mpg'].head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [23]:
# Factor for converting miles-per-gallon into kilometres-per-litre:
# (kilometres in one mile) / (litres in one US gallon).
KM_PER_MILE = 1.609344
LITERS_PER_GALLON = 3.78541178

mpg_to_kpl = KM_PER_MILE / LITERS_PER_GALLON
print(mpg_to_kpl)

0.4251437078795164


In [24]:
# Derive fuel economy in km/l from mpg, rounded to one decimal place.
df['kpl'] = df['mpg'].mul(mpg_to_kpl).round(1)

In [25]:
# Show the first five rows to confirm the new kpl column was added.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl
0,18.0,8,307.0,130,3504,12.0,70,usa,chevrolet chevelle malibu,7.7
1,15.0,8,350.0,a,3693,11.5,70,usa,buick skylark 320,6.4
2,18.0,8,318.0,150,3436,11.0,70,usa,plymouth satellite,7.7
3,16.0,8,304.0,150,3433,12.0,70,usa,amc rebel sst,6.8
4,17.0,8,302.0,140,3449,10.5,70,usa,ford torino,7.2


In [26]:
# Inspect column dtypes; an `object` dtype on a column expected to be numeric
# indicates non-numeric strings in the data.
print(df.dtypes)

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
kpl             float64
dtype: object


In [33]:
# Treat the placeholder string 'a' in horsepower as missing and drop those rows.
#
# The replacement is assigned back to the column instead of calling
# `df['horsepower'].replace(..., inplace=True)`: an in-place call on a column
# selection operates on an intermediate object, which raises a FutureWarning on
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is active.
df['horsepower'] = df['horsepower'].replace('a', np.nan)

# dropna is called directly on `df` (not on a selection), so in-place is safe here.
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl
0,18.0,8,307.0,130,3504,12.0,70,usa,chevrolet chevelle malibu,7.7
2,18.0,8,318.0,150,3436,11.0,70,usa,plymouth satellite,7.7
3,16.0,8,304.0,150,3433,12.0,70,usa,amc rebel sst,6.8
4,17.0,8,302.0,140,3449,10.5,70,usa,ford torino,7.2
5,15.0,8,429.0,198,4341,10.0,70,usa,ford galaxie 500,6.4


In [34]:
# Convert horsepower from object (strings) to float.
# astype raises if any value cannot be parsed as a number.
df['horsepower'] = df['horsepower'].astype('float')

In [35]:
# Re-check dtypes after the horsepower conversion.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
kpl             float64
dtype: object

In [36]:
# List the distinct values of the origin column.
df['origin'].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [38]:
# Recode the origin names as integer codes (usa=1, japan=2, europe=3).
#
# The result is assigned back rather than using `inplace=True` on the column
# selection, which warns on pandas 2.x and has no effect on `df` under
# Copy-on-Write. `infer_objects()` makes the object -> int64 conversion explicit,
# because the implicit downcast inside `replace` is deprecated.
df['origin'] = (
    df['origin']
    .replace({'usa': 1, 'japan': 2, 'europe': 3})
    .infer_objects()
)

In [40]:
# Sanity-check the recoded column: distinct values, dtype, and the first rows,
# each printed on its own line(s).
print(df['origin'].unique(), df['origin'].dtypes, df['origin'].head(), sep='\n')

[1 2 3]
int64
0    1
2    1
3    1
4    1
5    1
Name: origin, dtype: int64


In [46]:
# Store origin with the pandas categorical dtype.
df['origin'] = df['origin'].astype('category')

In [47]:
# Confirm that origin is now reported as `category`.
df.dtypes

mpg              float64
cylinders          int64
displacement     float64
horsepower       float64
weight             int64
acceleration     float64
model_year         int64
origin          category
name              object
kpl              float64
dtype: object

In [48]:
# List the distinct model_year values before changing the dtype.
df['model_year'].unique()

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82], dtype=int64)

In [51]:
# Store model_year with the pandas categorical dtype.
df['model_year'] = df['model_year'].astype('category')

In [52]:
# After the conversion, unique() returns a Categorical that also lists the categories.
df['model_year'].unique()

[70, 71, 72, 73, 74, ..., 78, 79, 80, 81, 82]
Length: 13
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]

In [54]:
# Split the horsepower range into 3 equal-width bins.
# `count` holds the number of rows per bin; `bin_number` holds the 4 bin edges
# (min, two interior boundaries, max) that are reused later as cut points.
count, bin_number = np.histogram(df['horsepower'],bins=3)
print(count, bin_number)

[257 102  32] [ 46.         107.33333333 168.66666667 230.        ]


In [58]:
# Bucket horsepower into three labelled bins using the histogram edges.
labels = ['low', 'normal', 'high']

# pd.cut builds right-closed intervals, so by default the first interval excludes
# its left edge. That edge is the column minimum (it comes from np.histogram),
# so without include_lowest=True the row(s) at the minimum horsepower get NaN
# instead of 'low'.
df['hp_bin'] = pd.cut(
    x=df['horsepower'],
    bins=bin_number,
    labels=labels,
    include_lowest=True,
)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl,hp_bin
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,7.7,normal
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,7.7,normal
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,6.8,normal
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,7.2,normal
5,15.0,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500,6.4,high


In [60]:
# One-hot encode the horsepower bins: one indicator column per bin label.
horsepower_dummies = pd.get_dummies(df['hp_bin'])

# The original printed `horsepower_bin.dtype`, but no variable of that name is
# defined in this notebook, so the cell fails with NameError on a fresh kernel.
# The binned column itself is what is being inspected here.
print(df['hp_bin'].dtype)
print(horsepower_dummies)

category
     low  normal  high
0      0       1     0
2      0       1     0
3      0       1     0
4      0       1     0
5      0       0     1
..   ...     ...   ...
393    1       0     0
394    1       0     0
395    1       0     0
396    1       0     0
397    1       0     0

[391 rows x 3 columns]


In [62]:
from sklearn import preprocessing

In [65]:
# Create the scikit-learn encoders used by the following cells.
#
# Scratch example of the LabelEncoder API, kept commented out. It can only run
# after `label_encoder` has been created below:
# label_encoder.fit([1,2,2,6])
# print(label_encoder.transform([1,1,2,6]))
# print(label_encoder.inverse_transform([0,0,1,2]))
label_encoder = preprocessing.LabelEncoder()
one_hot_encoder = preprocessing.OneHotEncoder()

In [66]:
# Show the current state of the frame (including hp_bin) before label encoding.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl,hp_bin
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,7.7,normal
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,7.7,normal
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,6.8,normal
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,7.2,normal
5,15.0,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500,6.4,high


In [78]:
# Fit the encoder on the three bin names and, in the same expression, encode the
# hp_bin values of the first 15 rows into integer codes (a NumPy array).
encodered_label = label_encoder.fit(['low','normal','high']).transform(
    df['hp_bin'].head(15)
)
print(encodered_label)
print(type(encodered_label))

[2 2 2 2 0 0 0 0 0 0 2 2 0 1 1]
<class 'numpy.ndarray'>


In [79]:
# Turn the 1-D code vector into a single-column 2-D array (one row per sample);
# -1 lets NumPy infer the row count from the array length.
onehot_label = encodered_label.reshape(-1, 1)
print(onehot_label)
print(type(onehot_label))

[[2]
 [2]
 [2]
 [2]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [2]
 [2]
 [0]
 [1]
 [1]]
<class 'numpy.ndarray'>


In [80]:
# 희소 행렬
one_hot_sparse = one_hot_encoder.fit_transform(onehot_label)
print(one_hot_sparse)
print(type(one_hot_sparse))

  (0, 2)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 2)	1.0
  (4, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 2)	1.0
  (11, 2)	1.0
  (12, 0)	1.0
  (13, 1)	1.0
  (14, 1)	1.0
<class 'scipy.sparse.csr.csr_matrix'>
