## 캐글 자전거 수요 예측 : https://www.kaggle.com/competitions/bike-sharing-demand/overview
### 목표 : 전처리 방법 변경 및 모델을 Tensorflow 딥러닝 모델로 변경하여 제출 후 스코어 0.8 이하 도달하기

### baseline

In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# datetime 컬럼을 datetime 타입으로 변환
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

# datetime에서 유용한 특성 추출
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['hour'] = train['datetime'].dt.hour
train['dayofweek'] = train['datetime'].dt.dayofweek

test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour
test['dayofweek'] = test['datetime'].dt.dayofweek

# 사용할 특성 선택
features = ['season', 'holiday', 'workingday', 'weather', 'temp',
           'atemp', 'humidity', 'windspeed', 'year', 'month',
           'day', 'hour', 'dayofweek']

X = train[features]
y = train['count']

# 학습/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 랜덤 포레스트 모델 학습
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 검증 데이터로 성능 평가
val_pred = rf_model.predict(X_val)
print('검증 데이터 RMSE:', np.sqrt(mean_squared_error(y_val, val_pred)))
print('검증 데이터 MAE:', mean_absolute_error(y_val, val_pred))

# 테스트 데이터 예측
X_test = test[features]
test_pred = rf_model.predict(X_test)

# 제출 파일 생성
submission = pd.DataFrame({
    'datetime': test['datetime'],
    'count': test_pred
})
submission.to_csv('submission.csv', index=False)

print('제출 파일이 생성되었습니다.')


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 데이터 불러오기
train = pd.read_csv('/content/drive/MyDrive/Bike prediction/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Bike prediction/test.csv')

# datetime 컬럼을 datetime 타입으로 변환
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

# datetime에서 유용한 특성 추출
for df in [train, test]:
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour
    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['is_rush_hour'] = df['hour'].isin([7, 8, 17, 18]).astype(int)

# 원-핫 인코딩
categorical_features = ['season', 'weather']
train = pd.get_dummies(train, columns=categorical_features)
test = pd.get_dummies(test, columns=categorical_features)

# 사용할 특성 선택 (원-핫 인코딩 후 업데이트된 컬럼 이름 사용)
features = ['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed',
            'year', 'month', 'day', 'hour', 'dayofweek', 'is_weekend', 'is_rush_hour']
features += [col for col in train.columns if col.startswith(('season_', 'weather_'))]

# 특성과 타겟 분리
X = train[features]
y = train['count']

# 학습/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 특성 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# TensorFlow 모델 구축
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)
])

# 모델 컴파일
model.compile(optimizer='adam', loss='mse')

# 조기 종료 설정
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True)

# 모델 학습
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=200, batch_size=32, verbose=1,
                    callbacks=[early_stopping])

# 검증 데이터로 성능 평가
val_pred = model.predict(X_val_scaled).flatten()
print('검증 데이터 RMSE:', np.sqrt(mean_squared_error(y_val, val_pred)))
print('검증 데이터 MAE:', mean_absolute_error(y_val, val_pred))

# 테스트 데이터 예측
X_test = test[features]
X_test_scaled = scaler.transform(X_test)
test_pred = model.predict(X_test_scaled).flatten()

# 제출 파일 생성
submission = pd.DataFrame({
    'datetime': test['datetime'],
    'count': test_pred
})
submission.to_csv('submission3.csv', index=False)

print('제출 파일이 생성되었습니다.')

Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 53705.0156 - val_loss: 13176.6562
Epoch 2/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 12557.9229 - val_loss: 11581.4072
Epoch 3/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 11268.7168 - val_loss: 10432.1201
Epoch 4/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9857.2334 - val_loss: 9836.0020
Epoch 5/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9792.7256 - val_loss: 9359.7754
Epoch 6/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9082.3047 - val_loss: 8942.8564
Epoch 7/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8584.5469 - val_loss: 8659.2051
Epoch 8/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8150.5645 - val_loss: 

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 데이터 불러오기
train = pd.read_csv('/content/drive/MyDrive/Bike prediction/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Bike prediction/test.csv')

# datetime 컬럼을 datetime 타입으로 변환
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

# datetime에서 유용한 특성 추출
for df in [train, test]:
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour
    df['dayofweek'] = df['datetime'].dt.dayofweek

    # 시간대 특성 추가
    df['is_rush_hour'] = ((df['hour'] >= 7) & (df['hour'] <= 9)) | ((df['hour'] >= 16) & (df['hour'] <= 19))

    # 계절 특성을 원-핫 인코딩
    df = pd.get_dummies(df, columns=['season'], prefix='season')

    # 날씨 특성을 원-핫 인코딩
    df = pd.get_dummies(df, columns=['weather'], prefix='weather')

# 사용할 특성 선택
features = ['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed',
            'year', 'month', 'day', 'hour', 'dayofweek', 'is_rush_hour'] + \
           [col for col in train.columns if col.startswith('season_') or col.startswith('weather_')]

X = train[features]
y = train['count']

# 학습/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 특성 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 신경망 모델 구축
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)
])

# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mean_squared_error')

# 조기 종료 설정
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# 모델 학습
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=100,
                    batch_size=32,
                    callbacks=[early_stopping],
                    verbose=1)

# 검증 데이터로 성능 평가
val_pred = model.predict(X_val_scaled)
print('검증 데이터 RMSE:', np.sqrt(mean_squared_error(y_val, val_pred)))
print('검증 데이터 MAE:', mean_absolute_error(y_val, val_pred))

# 테스트 데이터 예측
X_test = test[features]
X_test_scaled = scaler.transform(X_test)
test_pred = model.predict(X_test_scaled)

# 제출 파일 생성
submission = pd.DataFrame({
    'datetime': test['datetime'],
    'count': test_pred.flatten()
})
submission.to_csv('submission4.csv', index=False)

print('제출 파일이 생성되었습니다.')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 67787.7734 - val_loss: 49768.4531
Epoch 2/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 36842.0273 - val_loss: 12036.9912
Epoch 3/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 12814.5020 - val_loss: 7458.8481
Epoch 4/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 10104.4883 - val_loss: 6767.0146
Epoch 5/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9922.2969 - val_loss: 6426.6045
Epoch 6/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8813.7852 - val_loss: 6096.2109
Epoch 7/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9315.8291 - val_loss: 5752.5986
Epoch 8/100
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8377.5479 

In [None]:
# 데이터 불러오기
train = pd.read_csv('/content/drive/MyDrive/Bike prediction/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Bike prediction/test.csv')