In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# All competition files live in one Drive folder; keep that location in a single
# constant so the notebook can be pointed at another copy by editing one line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/ML/playground_s3e8'

train = pd.read_csv(f'{DATA_DIR}/train.csv')                    # labelled rows (features + price)
test = pd.read_csv(f'{DATA_DIR}/test.csv')                      # rows to predict (no price column)
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')   # submission template (id, price)

In [None]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

((193573, 11), (129050, 10))

### 정보
### carat: 캐럿 / cut: 컷 품질(Ideal, Premium, Very Good, Good, Fair) / color: 색(D -> best / J -> worst) / clarity: 명료함

In [None]:
# Peek at the first five training rows.
train.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [None]:
# Frequency of each cut grade in the training set.
train.loc[:, 'cut'].value_counts()

Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: cut, dtype: int64

In [None]:
# Frequency of each distinct depth value in the training set.
train.loc[:, 'depth'].value_counts()

61.9    10781
62.0    10150
61.8     9270
62.1     8866
61.6     8534
        ...  
69.4        1
68.1        1
52.1        1
55.8        1
55.0        1
Name: depth, Length: 153, dtype: int64

In [None]:
# The test frame has no price column.
test.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,193573,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54,2.82
1,193574,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87,3.68
2,193575,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74,3.55
3,193576,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42,2.73
4,193577,1.2,Very Good,I,VS2,62.7,56.0,6.75,6.79,4.24


In [None]:
# Goal: predict the price (sample submission layout shown below).
submission.head(n=5)

Unnamed: 0,id,price
0,193573,3969.155
1,193574,3969.155
2,193575,3969.155
3,193576,3969.155
4,193577,3969.155


In [None]:
# Summary statistics of the numeric columns, one row per column.
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,193573.0,96786.0,55879.856166,0.0,48393.0,96786.0,145179.0,193572.0
carat,193573.0,0.790688,0.462688,0.2,0.4,0.7,1.03,3.5
depth,193573.0,61.820574,1.081704,52.1,61.3,61.9,62.4,71.6
table,193573.0,57.227675,1.918844,49.0,56.0,57.0,58.0,79.0
x,193573.0,5.715312,1.109422,0.0,4.7,5.7,6.51,9.65
y,193573.0,5.720094,1.102333,0.0,4.71,5.72,6.51,10.01
z,193573.0,3.534246,0.688922,0.0,2.9,3.53,4.03,31.3
price,193573.0,3969.155414,4034.374138,326.0,951.0,2401.0,5408.0,18818.0


In [None]:
# Confirm there are no missing values (check the non-null count of every column).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float64
 9   z        193573 non-null  float64
 10  price    193573 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 16.2+ MB


In [None]:
# Features: every column except the row identifier (id) and the target (price).
X_train = train.drop(columns=['id', 'price'])
# Target: the price column.
y_train = train['price']

In [None]:
# Display the target series (price).
y_train

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [None]:
# Ordinal-encode the cut grade, from Ideal -> 5 down to Fair -> 1.
# The same mapping is applied to the training features and the test frame.
cut_rank = {'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1}
for frame in (X_train, test):
    frame['cut'] = frame['cut'].replace(cut_rank)

In [None]:
# Ordinal-encode the color grade: D -> 7, E -> 6, ..., J -> 1.
# The same mapping is applied to the training features and the test frame.
color_rank = dict(zip('DEFGHIJ', range(7, 0, -1)))
for frame in (X_train, test):
    frame['color'] = frame['color'].replace(color_rank)

In [None]:
# Share of each clarity grade among the training rows.
X_train.loc[:, 'clarity'].value_counts(normalize=True)

SI1     0.275204
VS2     0.248108
VS1     0.158436
SI2     0.157481
VVS2    0.081427
VVS1    0.054904
IF      0.021795
I1      0.002645
Name: clarity, dtype: float64

In [None]:
# Ordinal-encode the clarity grade. Grades are listed from rank 1 (I3) up to
# rank 11 (FL); the rank is the 1-based position in this list.
clarity_grades = ['I3', 'I2', 'I1', 'SI2', 'SI1', 'VS2',
                  'VS1', 'VVS2', 'VVS1', 'IF', 'FL']
clarity_rank = {grade: rank for rank, grade in enumerate(clarity_grades, start=1)}
for frame in (X_train, test):
    frame['clarity'] = frame['clarity'].replace(clarity_rank)

In [None]:
# Display the feature frame after cut / color / clarity were replaced by integer ranks.
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,4,5,6,62.2,58.0,7.27,7.33,4.55
1,2.03,3,1,4,62.0,58.0,8.06,8.12,5.05
2,0.70,5,4,7,61.2,57.0,5.69,5.73,3.50
3,0.32,5,4,7,61.6,56.0,4.38,4.41,2.71
4,1.70,4,4,6,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,5,7,8,61.1,56.0,4.35,4.39,2.67
193569,0.70,4,4,8,60.3,58.0,5.75,5.77,3.47
193570,0.73,3,5,5,63.1,57.0,5.72,5.75,3.62
193571,0.34,3,7,5,62.9,55.0,4.45,4.49,2.81


In [None]:
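# Hold-out split (currently disabled). As written it would overwrite
# X_train / y_train in place with the 80% training portion.
# X_train, X_test, y_train, y_test = train_test_split(X_train,
#                                                     y_train,
#                                                     test_size=0.2,
#                                                     random_state=20)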

In [None]:
# Cross-validated hyper-parameter search
def RandTree_grid_search(X, y, kfolds=5):
  """Grid-search random-forest hyper-parameters with k-fold cross-validation.

  The target used in this notebook (price) is a continuous quantity, so a
  RandomForestRegressor is tuned. A classifier would treat every distinct
  price as its own class.

  Args:
    X: feature matrix, one row per sample.
    y: target values, one per row of X.
    kfolds: number of cross-validation folds passed to GridSearchCV.

  Returns:
    A (best_params, best_score) tuple: the winning parameter combination and
    its mean cross-validated score (the regressor's default scorer, R^2).
  """
  rf_params = {
      'n_estimators': [50, 80, 100],
      'max_depth': [3, 5, 7, 9],
      'min_samples_leaf': [1, 3, 5, 8],
      # 'auto' was removed in scikit-learn 1.3; 1.0 (use all features) is what
      # 'auto' meant for regressors.
      'max_features': [1.0]
  }
  # Fixed seed so repeated runs pick the same best parameters.
  rf_model = RandomForestRegressor(random_state=20)

  # Exhaustive search over rf_params, each candidate scored with k-fold CV.
  rf_gscv = GridSearchCV(rf_model, rf_params, cv=kfolds)
  rf_gscv.fit(X, y)
  return rf_gscv.best_params_, rf_gscv.best_score_

In [None]:
# Tune on the training features and the price target. The unlabeled `test`
# frame is not a target vector; passing it as y is what made fit() raise ValueError.
bparam = RandTree_grid_search(X_train, y_train, 3)

ValueError: ignored

In [None]:
# Unpack the (best parameters, best score) pair returned by the grid search and report it.
best_params, best_score = bparam
print('최적화: ', best_params)
print('예측 성능: ', best_score)

In [None]:
# xgboost
import xgboost as xgb

In [None]:
# Hold out 20% of the labelled rows for evaluation. X_test / y_test were never
# defined in this notebook (the split cell is commented out), and the `test`
# frame has no price, so it cannot serve as a labelled evaluation set.
# X_train / y_train are left untouched so this cell can be re-run safely.
X_fit, X_test, y_fit, y_test = train_test_split(X_train,
                                                y_train,
                                                test_size=0.2,
                                                random_state=20)

dtrain = xgb.DMatrix(X_fit, label=y_fit)    # rows the booster is fitted on
dtest = xgb.DMatrix(X_test, label=y_test)   # held-out rows used for evaluation

In [None]:
# Booster parameters for a regression on price. The previous multi-class
# settings (multi:softmax / mlogloss / num_class=3) do not fit a continuous
# target. 'early_stoppings' is not an XGBoost parameter; early stopping is
# requested through xgb.train(early_stopping_rounds=...), not through this dict.
params = {
    'max_depth': 3,                      # maximum tree depth
    'eta': 0.1,                          # learning rate
    'objective': 'reg:squarederror',     # objective: squared-error regression
    'eval_metric': 'rmse',               # evaluation (loss) metric reported on the watchlist
}

In [None]:
# Watchlist of (DMatrix, label) pairs, passed to xgb.train as `evals`.
wlist = list(zip((dtrain, dtest), ('train', 'eval')))


In [None]:
# Train for up to 1000 boosting rounds, reporting metrics on every watchlist entry.
# Early stopping is an argument of xgb.train (it is not read from `params`):
# training stops once the metric on the last watchlist entry ('eval') has not
# improved for 100 consecutive rounds.
xgb_model = xgb.train(params=params,
                      dtrain=dtrain,
                      num_boost_round=1000,
                      evals=wlist,
                      early_stopping_rounds=100)