In [1]:
import pandas as pd
import numpy as np 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns 
import os 
from scipy import stats 

plt.style.use('seaborn') 

import warnings 
warnings.filterwarnings("ignore") 

mpl.rcParams['axes.unicode_minus'] = False 

%matplotlib inline

In [2]:
df_train = pd.read_csv("train.csv", parse_dates = ["datetime"]) 
df_test = pd.read_csv("test.csv", parse_dates = ["datetime"])

In [3]:
df_train["year"] = df_train["datetime"].dt.year
df_train["month"] = df_train["datetime"].dt.month
df_train["day"] = df_train["datetime"].dt.day
df_train["hour"] = df_train["datetime"].dt.hour

df_test["year"] = df_test["datetime"].dt.year 
df_test["month"] = df_test["datetime"].dt.month 
df_test["day"] = df_test["datetime"].dt.day
df_test["hour"] = df_test["datetime"].dt.hour

df_train["dayofweek"] = df_train["datetime"].dt.dayofweek
df_test["dayofweek"] = df_test["datetime"].dt.dayofweek
#요일 변수 추가생성

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [5]:
df_train.loc[df_train["windspeed"] == 0, "windspeed"] = df_train["windspeed"].mean()
df_test.loc[df_train["windspeed"] == 0, "windspeed"] = df_train["windspeed"].mean()
#windspeed 0의 값을 평균값으로 넣어줌

In [6]:
df_train[(df_train['humidity']==0)|(df_train['humidity']==100)] = df_train['humidity'].mean()
# humidity 0과 100 값 대신 평균값을 넣어줌

In [7]:
feature_names = ["season", "weather", 'temp', 'atemp', 'humidity', 'windspeed',
                 "year", 'hour', "dayofweek", "holiday", "workingday"]

feature_names

['season',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'year',
 'hour',
 'dayofweek',
 'holiday',
 'workingday']

In [8]:
X_train = df_train[feature_names]

print(X_train.shape)
X_train.head()

(10886, 11)


Unnamed: 0,season,weather,temp,atemp,humidity,windspeed,year,hour,dayofweek,holiday,workingday
0,1.0,1.0,9.84,14.395,81.0,12.799395,2011.0,0.0,5.0,0.0,0.0
1,1.0,1.0,9.02,13.635,80.0,12.799395,2011.0,1.0,5.0,0.0,0.0
2,1.0,1.0,9.02,13.635,80.0,12.799395,2011.0,2.0,5.0,0.0,0.0
3,1.0,1.0,9.84,14.395,75.0,12.799395,2011.0,3.0,5.0,0.0,0.0
4,1.0,1.0,9.84,14.395,75.0,12.799395,2011.0,4.0,5.0,0.0,0.0


In [9]:
X_test = df_test[feature_names]

print(X_test.shape)
X_test.head()

(6493, 11)


Unnamed: 0,season,weather,temp,atemp,humidity,windspeed,year,hour,dayofweek,holiday,workingday
0,1,1,10.66,11.365,56,26.0027,2011,0,3,0,1
1,1,1,10.66,13.635,56,0.0,2011,1,3,0,1
2,1,1,10.66,13.635,56,0.0,2011,2,3,0,1
3,1,1,10.66,12.88,56,11.0014,2011,3,3,0,1
4,1,1,10.66,12.88,56,11.0014,2011,4,3,0,1


In [10]:
y_train = df_train['count']

print(y_train.shape)
y_train.head()

(10886,)


0    16.0
1    40.0
2    32.0
3    13.0
4     1.0
Name: count, dtype: float64

In [11]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
prediction_drop_humidity = model.predict(X_test)

print(prediction_drop_humidity.shape)
prediction_drop_humidity[0:20]

(6493,)


array([ 11.67,   6.09,   4.71,   3.3 ,   3.01,   7.81,  37.74, 104.79,
       239.15, 137.47,  64.53,  63.8 ,  77.69,  75.63,  83.82,  87.31,
        97.55, 207.64, 179.71, 101.57])

In [12]:
submission = pd.read_csv("sampleSubmission.csv")
submission

submission["count"] = prediction_drop_humidity

print(submission.shape)
submission.head()
submission.to_csv('sampleSubmission.csv', index=False)

(6493, 2)


In [13]:
submission.head()
#score 0.423 상위 15%

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,11.67
1,2011-01-20 01:00:00,6.09
2,2011-01-20 02:00:00,4.71
3,2011-01-20 03:00:00,3.3
4,2011-01-20 04:00:00,3.01
