# 다이아몬드 품질 예측

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## 데이터 탐색

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [4]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [5]:
df.describe(include='category')

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


## 데이터 전처리

1. 범주형 데이터 label encoding으로 변환, 기존의 범주형 변수들 제거
2. 연속형 데이터 standardscaler, minmaxscaler, 단 cut 범주형으로 존재
3. train, test datasets로 나누기
4. DecisionTree model 사용
5. accuracy_score 모델 검증
6. cross_validation 적용
7. accuracy_score 모델 2차 검증

### 범주형 데이터 label encoding으로 변환, 기존의 범주형 변수들 제거

In [6]:
from sklearn.preprocessing import LabelEncoder

L_encoder = LabelEncoder()
for i in df[['cut', 'color', 'clarity']]:
    if i == 'cut':
        L_encoder.fit(df[i])
        label_cut = L_encoder.transform(df[i])
        
    elif i == 'color':
        L_encoder.fit(df[i])
        label_color = L_encoder.transform(df[i])
        
    else:
        L_encoder.fit(df[i])
        label_clarity = L_encoder.transform(df[i])
        
print(label_cut)
print(label_color)
print(label_clarity)



[2 3 1 ... 4 3 2]
[1 1 1 ... 0 4 0]
[3 2 4 ... 2 3 3]


In [7]:
d={'e_color':label_color, 'e_clarity': label_clarity,'e_cut': label_cut,}
encoded_df = pd.DataFrame(data=d)

df = df.drop(columns=['cut', 'color', 'clarity'], axis=1)

df = pd.concat([df,encoded_df], axis=1)
df

Unnamed: 0,carat,depth,table,price,x,y,z,e_color,e_clarity,e_cut
0,0.23,61.5,55.0,326,3.95,3.98,2.43,1,3,2
1,0.21,59.8,61.0,326,3.89,3.84,2.31,1,2,3
2,0.23,56.9,65.0,327,4.05,4.07,2.31,1,4,1
3,0.29,62.4,58.0,334,4.20,4.23,2.63,5,5,3
4,0.31,63.3,58.0,335,4.34,4.35,2.75,6,3,1
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,0,2,2
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,0,2,1
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,0,2,4
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,4,3,3


### 연속형 데이터 standardscaler, minmaxscaler, 단 cut 범주형으로 존재

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

X = df.iloc[:, :9]

y = df.iloc[:,[9]]

s_scaler = StandardScaler()
s_scaler = s_scaler.fit(X)
ss_df = s_scaler.transform(X)


m_scaler = MinMaxScaler()
m_scaler = m_scaler.fit(ss_df)
sm_df = m_scaler.transform(ss_df)
X = pd.DataFrame(data=sm_df, columns=['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'e_color', 'e_clarity'])

X

Unnamed: 0,carat,depth,table,price,x,y,z,e_color,e_clarity
0,0.006237,0.513889,0.230769,0.000000,0.367784,0.067572,0.076415,0.166667,0.428571
1,0.002079,0.466667,0.346154,0.000000,0.362197,0.065195,0.072642,0.166667,0.285714
2,0.006237,0.386111,0.423077,0.000054,0.377095,0.069100,0.072642,0.166667,0.571429
3,0.018711,0.538889,0.288462,0.000433,0.391061,0.071817,0.082704,0.833333,0.714286
4,0.022869,0.563889,0.288462,0.000487,0.404097,0.073854,0.086478,1.000000,0.428571
...,...,...,...,...,...,...,...,...,...
53935,0.108108,0.494444,0.269231,0.131427,0.535382,0.097793,0.110063,0.000000,0.285714
53936,0.108108,0.558333,0.230769,0.131427,0.529795,0.097623,0.113522,0.000000,0.285714
53937,0.103950,0.550000,0.326923,0.131427,0.527002,0.096435,0.111950,0.000000,0.285714
53938,0.137214,0.500000,0.288462,0.131427,0.572626,0.103905,0.117610,0.666667,0.428571


## train_test_split만 쓸때의 모델 성능

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=500)

ds_model = DecisionTreeClassifier()
ds_model = ds_model.fit(X_train, y_train)
yhat = ds_model.predict(X_test)
acc = accuracy_score(yhat,y_test)

print('모델 적확도',np.round(acc,4))

모델 적확도 0.7109


## cross_validation만 쓸때의 모델 성능

In [10]:
from sklearn.model_selection import cross_val_score

ds_model = DecisionTreeClassifier(random_state=500)

scores = cross_val_score(ds_model, X,y, scoring='accuracy', cv=5)

print('교차 검증별 정확도: ', np.round(scores, 4))
print('평균 교차 검증별 정확도: ', np.round(np.mean(scores),4))

교차 검증별 정확도:  [0.4056 0.2138 0.1961 0.3651 0.5991]
평균 교차 검증별 정확도:  0.356


* standardscaler, minmaxscaler했을때의 정확도
* 모델 적확도 0.7082
* 평균 교차 검증별 정확도:  0.356

## standardscaler, minmaxscaler적용 안하고 모델 성능 2차 검증

In [11]:
df = sns.load_dataset('diamonds')
df.head()

L_encoder = LabelEncoder()
for i in df[['cut', 'color', 'clarity']]:
    if i == 'cut':
        L_encoder.fit(df[i])
        label_cut = L_encoder.transform(df[i])
        
    elif i == 'color':
        L_encoder.fit(df[i])
        label_color = L_encoder.transform(df[i])
        
    else:
        L_encoder.fit(df[i])
        label_clarity = L_encoder.transform(df[i])
        
d={'e_color':label_color, 'e_clarity': label_clarity,'e_cut': label_cut,}
encoded_df = pd.DataFrame(data=d)

df = df.drop(columns=['cut', 'color', 'clarity'], axis=1)

df = pd.concat([df,encoded_df], axis=1)

X = df.iloc[:, :9]

y = df.iloc[:,[9]]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

ds_model = DecisionTreeClassifier()
ds_model = ds_model.fit(X_train, y_train)
yhat = ds_model.predict(X_test)
acc = accuracy_score(yhat,y_test)

print('모델 적확도',np.round(acc,4))

모델 적확도 0.7147


In [12]:
ds_model = DecisionTreeClassifier()

scores = cross_val_score(ds_model, X,y, scoring='accuracy', cv=5)

print('교차 검증별 정확도: ', np.round(scores, 4))
print('평균 교차 검증별 정확도: ', np.round(np.mean(scores),4))

교차 검증별 정확도:  [0.4233 0.2175 0.2015 0.3688 0.6012]
평균 교차 검증별 정확도:  0.3625


## label_encoding 0에서 1로 바꾼 3차 검증

In [13]:
from sklearn.preprocessing import LabelEncoder
df = sns.load_dataset('diamonds')

L_encoder = LabelEncoder()
for i in df[['cut', 'color', 'clarity']]:
    if i == 'cut':
        L_encoder.fit(df[i])
        label_cut = L_encoder.transform(df[i])
        
    elif i == 'color':
        L_encoder.fit(df[i])
        label_color = L_encoder.transform(df[i])
        
    else:
        L_encoder.fit(df[i])
        label_clarity = L_encoder.transform(df[i])
        
print(label_cut)
print(label_color)
print(label_clarity)

d={'e_color':label_color, 'e_clarity': label_clarity,'e_cut': label_cut,}
encoded_df = pd.DataFrame(data=d)
encoded_df

[2 3 1 ... 4 3 2]
[1 1 1 ... 0 4 0]
[3 2 4 ... 2 3 3]


Unnamed: 0,e_color,e_clarity,e_cut
0,1,3,2
1,1,2,3
2,1,4,1
3,5,5,3
4,6,3,1
...,...,...,...
53935,0,2,2
53936,0,2,1
53937,0,2,4
53938,4,3,3


In [14]:
encoded_df = encoded_df+1

In [15]:
df = df.drop(columns=['cut', 'color', 'clarity'], axis=1)

df = pd.concat([df,encoded_df], axis=1)

X = df.iloc[:, :9]

y = df.iloc[:,[9]]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

ds_model = DecisionTreeClassifier()
ds_model = ds_model.fit(X_train, y_train)
yhat = ds_model.predict(X_test)
acc = accuracy_score(yhat,y_test)

print('모델 적확도',np.round(acc,4))

모델 적확도 0.715


In [16]:
ds_model = DecisionTreeClassifier()

scores = cross_val_score(ds_model, X,y, scoring='accuracy', cv=5)

print('교차 검증별 정확도: ', np.round(scores, 4))
print('평균 교차 검증별 정확도: ', np.round(np.mean(scores),4))

교차 검증별 정확도:  [0.4229 0.2158 0.2062 0.3697 0.6115]
평균 교차 검증별 정확도:  0.3652


## GridSearchCV 4차 검증

In [17]:
from sklearn.model_selection import GridSearchCV

df = sns.load_dataset('diamonds')
df.head()

L_encoder = LabelEncoder()
for i in df[['cut', 'color', 'clarity']]:
    if i == 'cut':
        L_encoder.fit(df[i])
        label_cut = L_encoder.transform(df[i])
        
    elif i == 'color':
        L_encoder.fit(df[i])
        label_color = L_encoder.transform(df[i])
        
    else:
        L_encoder.fit(df[i])
        label_clarity = L_encoder.transform(df[i])
        
d={'e_color':label_color, 'e_clarity': label_clarity,'e_cut': label_cut,}
encoded_df = pd.DataFrame(data=d)

df = df.drop(columns=['cut', 'color', 'clarity'], axis=1)

df = pd.concat([df,encoded_df], axis=1)

X = df.iloc[:, :9]

y = df.iloc[:,[9]]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

ds_model = DecisionTreeClassifier()

parameters = {'max_depth': [1,2,3], 'min_samples_split':[2,3]}

grid_ds_model = GridSearchCV(ds_model, param_grid= parameters, cv=3, refit=True)

grid_ds_model.fit(X_train,y_train)

scores_df = pd.DataFrame(grid_ds_model.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.601015,5,0.599555,0.601988,0.601502
1,"{'max_depth': 1, 'min_samples_split': 3}",0.601015,5,0.599555,0.601988,0.601502
2,"{'max_depth': 2, 'min_samples_split': 2}",0.657073,3,0.657675,0.658023,0.65552
3,"{'max_depth': 2, 'min_samples_split': 3}",0.657073,3,0.657675,0.658023,0.65552
4,"{'max_depth': 3, 'min_samples_split': 2}",0.686643,1,0.688821,0.687083,0.684024
5,"{'max_depth': 3, 'min_samples_split': 3}",0.686643,1,0.688821,0.687083,0.684024


In [18]:
print('GridSearchCV 최적의 파라미터: ', grid_ds_model.best_params_)
print('GridSearchCV 최적의 정확도: {0:.4f}' .format(grid_ds_model.best_score_))

GridSearchCV 최적의 파라미터:  {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최적의 정확도: 0.6866


In [19]:
estimator = grid_ds_model.best_estimator_
yhat = estimator.predict(X_test)
print('GridSearchCV 최적의 정확도: {0:.4f}' .format(accuracy_score(y_test,yhat)))

GridSearchCV 최적의 정확도: 0.6890


## Random Forest classifier model

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
for i in range(2,11):
     c_score = cross_val_score(rf_model, X_train, y_train, 
                               scoring=['accuracy', 'precision','recall','f1','roc_auc'],
                               cv=i)
     print(f'CV={i}, accuracy={np.mean(accuracy)}, precision={np.mean(precision)}, recall={np.mean(recall)}, f1={np.mean(f1)}, roc_auc={np.mean(roc_auc)}')


InvalidParameterError: The 'scoring' parameter of cross_val_score must be a str among {'neg_negative_likelihood_ratio', 'f1_micro', 'roc_auc_ovo_weighted', 'adjusted_rand_score', 'f1_macro', 'homogeneity_score', 'neg_mean_absolute_percentage_error', 'recall_macro', 'roc_auc', 'precision_weighted', 'recall', 'explained_variance', 'jaccard_samples', 'neg_mean_poisson_deviance', 'neg_root_mean_squared_error', 'rand_score', 'roc_auc_ovr_weighted', 'neg_mean_gamma_deviance', 'recall_samples', 'neg_mean_absolute_error', 'roc_auc_ovo', 'adjusted_mutual_info_score', 'balanced_accuracy', 'jaccard', 'neg_log_loss', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'recall_micro', 'average_precision', 'f1', 'r2', 'precision_samples', 'jaccard_macro', 'precision_macro', 'jaccard_weighted', 'roc_auc_ovr', 'precision', 'accuracy', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'f1_samples', 'neg_brier_score', 'precision_micro', 'completeness_score', 'f1_weighted', 'positive_likelihood_ratio', 'top_k_accuracy', 'jaccard_micro', 'matthews_corrcoef', 'fowlkes_mallows_score', 'recall_weighted', 'v_measure_score', 'neg_root_mean_squared_log_error', 'max_error', 'mutual_info_score'}, a callable or None. Got ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'] instead.

## 모델 평가