In [676]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## One-Hot Encoding

In [677]:
# Read the diamonds CSV, expand the three categorical columns into one-hot
# indicator columns, then remove the depth/table/x/y/z columns.
# The bare expression on the last line renders the resulting frame.
dataset = (
    pd.read_csv('data/diamonds01.csv')
    .pipe(pd.get_dummies, columns=['cut', 'color', 'clarity'])
    .drop(columns=['depth', 'table', 'x', 'y', 'z'])
)
dataset

Unnamed: 0,carat,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.38,1433,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,1.56,10210,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1.01,4099,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,0.34,490,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0.40,622,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.38,865,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4996,1.70,17492,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4997,0.25,575,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4998,0.32,505,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## X,y 데이터 분할

In [678]:
# Build the feature matrix and the target by column NAME instead of by
# position, so the split cannot silently pick the wrong target (or leak the
# target into X) if the column order of the input file ever changes.
X1 = dataset[['carat']]                        # numeric feature, kept 2-D
X2 = dataset.drop(columns=['carat', 'price'])  # every remaining (one-hot) column
X = np.concatenate((X1, X2), axis=1)           # ndarray: carat first, then indicators
y = dataset['price']                           # regression target

# Every row needs both a feature vector and a target value.
assert X.shape[0] == y.shape[0], "X and y must have the same number of rows"

print(X)
print(X.shape, y.shape)

[[0.38 0.   0.   ... 0.   0.   0.  ]
 [1.56 0.   0.   ... 0.   0.   0.  ]
 [1.01 0.   0.   ... 0.   0.   0.  ]
 ...
 [0.25 0.   0.   ... 0.   0.   1.  ]
 [0.32 0.   0.   ... 0.   0.   0.  ]
 [1.31 0.   0.   ... 0.   0.   0.  ]]
(5000, 21) (5000,)


## Training,Test 데이터 분리

In [679]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [680]:
# Shape check: print the dimensions of each split, one per line, in the
# order X_train, y_train, X_test, y_test.
print(
    *(part.shape for part in (X_train, y_train, X_test, y_test)),
    sep="\n",
)

(4000, 21)
(4000,)
(1000, 21)
(1000,)


## 1. Linear Regression model

In [681]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings, fitted on the training split.
# fit() hands back the estimator itself, so LR is bound to the fitted model;
# the trailing bare expression echoes it as the cell output.
LR = LinearRegression().fit(X_train, y_train)
LR

LinearRegression()

## 2. RandomForest Regression model

In [682]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 9 bootstrapped trees. Each split considers half of the
# feature columns (rounded down) and a node needs at least 5 samples to be
# split further. random_state fixes the randomness for repeatable results.
RF = RandomForestRegressor(
    n_estimators=9,
    min_samples_split=5,
    max_features=X_train.shape[1] // 2,
    bootstrap=True,
    random_state=1,
).fit(X_train, y_train)
RF

RandomForestRegressor(max_features=10, min_samples_split=5, n_estimators=9,
                      random_state=1)

## 3. DecisionTree Regression model

In [683]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree; a node is only split when it holds at least 4
# samples. random_state fixes tie-breaking so the fitted tree is repeatable.
DT = DecisionTreeRegressor(
    min_samples_split=4,
    random_state=1,
).fit(X_train, y_train)
DT

DecisionTreeRegressor(min_samples_split=4, random_state=1)

## predict

In [684]:
# Predict on the held-out features with each fitted model, in the order
# linear regression, random forest, decision tree.
LR_y_pred, RF_y_pred, DT_y_pred = (
    model.predict(X_test) for model in (LR, RF, DT)
)

## 모델 평가 (R² score)

In [685]:
from sklearn.metrics import r2_score

# r2_score is the coefficient of determination (R^2) of a regression, not a
# classification accuracy: its best value is 1.0 and it can be negative, so
# it is reported as a plain R^2 value instead of an "accuracy" percentage.
LR_score = r2_score(y_test, LR_y_pred)  # linear regression
RF_score = r2_score(y_test, RF_y_pred)  # random forest
DT_score = r2_score(y_test, DT_y_pred)  # decision tree

# Fixed four-decimal formatting keeps the three lines aligned and avoids
# round() dropping trailing zeros.
print(f"Linear Regression R^2 = {LR_score:.4f}")
print(f"RandomForest Regression R^2 = {RF_score:.4f}")
print(f"DecisionTree Regression R^2 = {DT_score:.4f}")

Linear Regression Accuracy = 89.85%
RandomForest Regression Accuracy = 96.38%
DecisionTree Regression Accuracy = 96.1%
