In [23]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

## 1. Data 가져오기

In [5]:
# Load the three competition CSV files from the current working directory:
# the labelled training rows, the unlabelled test rows, and the sample
# submission file.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("Train.csv", "Test.csv", "SampleSubmission.csv")
)

## 2. Data 전처리

In [11]:
# Separate the model inputs from the prediction target.
train_inputs = train_df.drop(columns=['target'])
train_targets = train_df['target']
test_inputs = test_df.copy()

# Feature selection: these columns are not used as model inputs.
# Reassigning (rather than inplace=True) keeps the cell idempotent on re-run.
dropped_columns = ['device', 'id']
train_inputs = train_inputs.drop(columns=dropped_columns)
test_inputs = test_inputs.drop(columns=dropped_columns)

# Encode the categorical feature 'area' as numbers (one code per distinct
# value seen in the training data). The encoder is fitted on the training
# rows only and then re-used for the test rows.
# handle_unknown/unknown_value: an 'area' value that appears only in the test
# set is encoded as -1 instead of making transform() raise a ValueError
# (the default handle_unknown='error' behaviour).
categorical_features = ['area']
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_inputs[categorical_features] = oe.fit_transform(train_inputs[categorical_features])
test_inputs[categorical_features] = oe.transform(test_inputs[categorical_features])

# Replace every remaining missing value with 0.
train_inputs = train_inputs.fillna(0)
test_inputs = test_inputs.fillna(0)

## 3. 예측 알고리즘(RandomForestRegressor)

In [17]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_inputs,
    train_targets,
    test_size=0.2,
    random_state=0,
)

# Base estimator: a seeded random forest that may use every available core.
rf = RandomForestRegressor(n_jobs=-1, random_state=0)

# Candidate values per hyperparameter; the grid search evaluates every
# combination (2 values for each of 4 parameters -> 16 candidates).
params = dict(
    n_estimators=(20, 40),
    max_depth=(5, 10),
    min_samples_leaf=(5, 18),
    min_samples_split=(5, 16),
)

# Exhaustive search with 2-fold cross-validation on the training split.
grid_cv = GridSearchCV(estimator=rf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
# No `scoring` is passed, so the score is the estimator's default metric
# (R^2 for a regressor), even though the printed label says "accuracy".
print('최적 하이퍼파라미터 :', grid_cv.best_params_)
print('최적 예측 정확도 : {0:.4f}'.format(grid_cv.best_score_))



최적 하이퍼파라미터 : {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 40}
최적 예측 정확도 : 0.9285


In [19]:
# Train the final random forest on the training split, taking the
# hyperparameters directly from the grid search instead of re-typing them by
# hand: hand-copied values silently go stale whenever the grid or the data
# changes.
rf_run = RandomForestRegressor(random_state=0, **grid_cv.best_params_)
rf_run.fit(X_train, y_train)

# Predictions on the held-out validation split (scored in the RMSE cell).
val_predictions = rf_run.predict(X_test)

# Predictions on the unlabelled test set (written to the submission file).
test_predictions = rf_run.predict(test_inputs)

## 4. 오차 평가(RMSE)

In [20]:
# RMSE on the validation split. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse = mean_squared_error(y_test, val_predictions) ** 0.5
# Divide by 1e6 for display; presumably the target is in bit/s, which would
# make this Mbit/s -- TODO confirm against the dataset description.
print(f"Root Mean Squared Error = {rmse / 1e6:.3} Mbit/s")

Root Mean Squared Error = 10.8 Mbit/s


## 5. 결과 저장

In [21]:
# Build the submission table (one row per test id with its predicted
# target), write it to CSV without the row index, and preview the first rows.
predictions_df = test_df[['id']].assign(target=test_predictions)
predictions_df.to_csv("BenchmarkSubmission.csv", index=False)
predictions_df.head()

Unnamed: 0,id,target
0,Id_ln0e0hfrgx,23728140.0
1,Id_svf7nz9fxv,68713900.0
2,Id_ww2mh07gwj,81260490.0
3,Id_v88r4y03ww,21441020.0
4,Id_2u4y4kzglh,3686830.0


## 3. 예측 알고리즘(GradientBoostingRegressor)

In [24]:
# Hold out 20% of the labelled rows for validation. The seed is the same as
# in the random-forest section, so both models are scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    train_inputs, train_targets, test_size=0.2, random_state=0
)

# Build the model. random_state is pinned so that repeated runs give the same
# result, consistent with the seeded split and the seeded random forest.
# NOTE(review): only 3 boosting stages are used, far below the library
# default of 100, so the model likely underfits -- confirm this is intended.
GB = GradientBoostingRegressor(n_estimators=3, random_state=0)

# Fit on the training split.
GB.fit(X_train, y_train)

# Predictions on the held-out validation split (scored in the RMSE cell).
val_predictions = GB.predict(X_test)

# Predictions on the unlabelled test set (written to the submission file).
test_predictions = GB.predict(test_inputs)

## 4. 오차 평가(RMSE)

In [25]:
# RMSE on the validation split. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse = mean_squared_error(y_test, val_predictions) ** 0.5
# Divide by 1e6 for display; presumably the target is in bit/s, which would
# make this Mbit/s -- TODO confirm against the dataset description.
print(f"Root Mean Squared Error = {rmse / 1e6:.3} Mbit/s")

Root Mean Squared Error = 34.9 Mbit/s


## 5. 결과 저장

In [26]:
# Pair each test id with its gradient-boosting prediction and save it.
# The file name is specific to this model: the random-forest section already
# writes "BenchmarkSubmission.csv", and re-using that name here would
# overwrite that submission with this one when the notebook is run top to
# bottom.
predictions_df = pd.DataFrame({'id': test_df.id, 'target': test_predictions})
predictions_df.to_csv("GradientBoostingSubmission.csv", index=False)
predictions_df.head()

Unnamed: 0,id,target
0,Id_ln0e0hfrgx,49919690.0
1,Id_svf7nz9fxv,61865750.0
2,Id_ww2mh07gwj,61865750.0
3,Id_v88r4y03ww,46353430.0
4,Id_2u4y4kzglh,46353430.0
