In [3]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sampleSubmission.csv')

import calendar
from datetime import datetime

def clear_data(data, train_tf) :
    '''datetiem column'''
    # Convert the 'datetime' column to datetime format
    data['datetime'] = pd.to_datetime(data['datetime'])

    # Create new columns for year, month, day, hour, minute, and second
    # 분/초는 모두 0이므로 제외
    data['year'] = data['datetime'].dt.year
    data['month'] = data['datetime'].dt.month
    data['day'] = data['datetime'].dt.day
    data['hour'] = data['datetime'].dt.hour


    # Create a new 'date' column that combines 'year', 'month', and 'day'
    data['date'] = pd.to_datetime(data[['year', 'month', 'day']])

    # Create a new 'weekday' column
    data['weekday'] = data['date'].dt.dayofweek.apply(lambda x: calendar.day_name[x])

    # Drop the 'date' column as it's no longer needed
    data = data.drop('date', axis=1)

    # Check the first few rows of the dataframe to confirm the changes
    # data.head()


    # Drop the 'datetime' column
    data = data.drop('datetime', axis=1)

    # Check the first few rows of the dataframe to confirm the changes
    data.head()


    '''weekday'''
    mapping_data = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    data['weekday'] = data['weekday'].map(mapping_data)

    '''weather'''
    if train_tf == 1:   # train 데이터일 경우에만 적용
        # 3번과 4번 합치기
        # data.loc[data['weather'] == 4, 'weather'] = 3

        # 4번 삭제
        data = data.drop(index=data[data['weather']==4].index)


    return data

train = clear_data(train, 1)
test = clear_data(test, 0)

### model

In [4]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [5]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from catboost import Pool
from catboost import CatBoostRegressor
import numpy as np

In [6]:
# 데이터 분할
X_train = train.drop(['day', 'windspeed', 'casual', 'registered', 'count'], axis=1)
y_train = train['count']

X_test = test.drop(['day', 'windspeed'], axis=1)

In [7]:
np.random.seed(42)

In [9]:
def rmsle(y_true, y_pred, convertExp=True):
    # 지수변환
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # 로그변환 후 결측값을 0으로 변환
    log_true = np.nan_to_num(np.log(y_true+1))
    log_pred = np.nan_to_num(np.log(y_pred+1))

    # RMSLE 계산
    output = np.sqrt(np.mean((log_true - log_pred)**2))
    return output

### 랜덤포레스트

In [10]:
RF = RandomForestRegressor()

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [6, 8, 10, 12],
}

grid_search = GridSearchCV(RF, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': 12, 'n_estimators': 100}
Best Score: -5447.43203345083


In [11]:
RF = RandomForestRegressor(max_depth=12, n_estimators=200)
log_y = np.log(y_train)  # 타깃값 로그변환
RF.fit(X_train, log_y) # 모델 훈련
preds = RF.predict(X_test)

In [12]:
submission['count'] = np.exp(preds)    # 지수변환
submission.to_csv('RF_day_ver.csv', index=False) # 파일로 저장

### catboost (가장 좋은 결과)

In [13]:
train_data = Pool(X_train, label=np.log(y_train))

In [14]:
model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6)

In [15]:
model.fit(train_data)

0:	learn: 1.3831587	total: 57ms	remaining: 57s
1:	learn: 1.2920222	total: 61.3ms	remaining: 30.6s
2:	learn: 1.2144572	total: 69.8ms	remaining: 23.2s
3:	learn: 1.1436357	total: 75.8ms	remaining: 18.9s
4:	learn: 1.0730003	total: 82ms	remaining: 16.3s
5:	learn: 1.0063227	total: 86.4ms	remaining: 14.3s
6:	learn: 0.9482292	total: 93ms	remaining: 13.2s
7:	learn: 0.8976789	total: 99.8ms	remaining: 12.4s
8:	learn: 0.8510357	total: 106ms	remaining: 11.7s
9:	learn: 0.8102401	total: 114ms	remaining: 11.2s
10:	learn: 0.7771554	total: 121ms	remaining: 10.9s
11:	learn: 0.7473562	total: 127ms	remaining: 10.5s
12:	learn: 0.7244830	total: 135ms	remaining: 10.2s
13:	learn: 0.7012077	total: 138ms	remaining: 9.74s
14:	learn: 0.6833339	total: 145ms	remaining: 9.53s
15:	learn: 0.6535153	total: 151ms	remaining: 9.29s
16:	learn: 0.6360915	total: 157ms	remaining: 9.09s
17:	learn: 0.6122674	total: 163ms	remaining: 8.9s
18:	learn: 0.5856030	total: 172ms	remaining: 8.86s
19:	learn: 0.5733066	total: 178ms	remainin

<catboost.core.CatBoostRegressor at 0x7f3bc48f2590>

In [16]:
y_pred = model.predict(X_test)

In [17]:
submission['count'] = np.exp(y_pred)    # 지수변환
submission.to_csv('catboost_ver_2.csv', index=False) # 파일로 저장