In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training set, the test set, and the sample submission template.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sampleSubmission.csv")
)

## 피처 엔지니어링

### 이상치 제거

In [3]:
# Keep only the training rows whose weather code is not 4.
is_not_weather_4 = train['weather'].ne(4)
train = train[is_not_weather_4]

### 데이터 합치기

훈련 데이터와 테스트 데이터에 같은 피처 엔지니어링을 적용하기 위해 두 데이터를 합쳐서 작업

In [4]:
# Stack train on top of test so both receive the same feature engineering;
# ignore_index=True renumbers the combined rows from 0.
frames = [train, test]
all_data = pd.concat(frames, ignore_index=True)

In [5]:
# Preview the first rows of the combined frame (these come from the training data).
all_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0.0,1.0,1.0


In [6]:
# Preview the last rows of the combined frame (these come from the test data).
all_data.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,
17377,2012-12-31 23:00:00,1,0,1,1,10.66,13.635,65,8.9981,,,


ignore_index를 True로 하면 원래 데이터의 인덱스를 무시하고 0부터 새로 번호를 매겨 이어붙임  
False이면 훈련 데이터의 인덱스(0-10885)가 끝난 뒤 테스트 데이터의 인덱스(0-6492)가 다시 0부터 시작함

### EDA에서 한 피처 엔지니어링을 함

In [7]:
from datetime import datetime

In [8]:
# Derive calendar features from the "YYYY-MM-DD HH:MM:SS" strings in `datetime`.
# The string is split once with vectorized accessors instead of being re-split
# row by row in several Python-level lambdas.
date_and_time = all_data['datetime'].str.split(expand=True)  # column 0: date, column 1: time
date_parts = date_and_time[0].str.split('-', expand=True)    # column 0: year, column 1: month

all_data['date'] = date_and_time[0]
# year / month / hour are kept as the original substrings (e.g. '2011', '01', '00').
all_data['year'] = date_parts[0]
all_data['month'] = date_parts[1]
all_data['hour'] = date_and_time[1].str.split(':').str[0]
# Day of week as an integer, Monday=0 ... Sunday=6 (same convention as datetime.weekday()).
all_data['weekday'] = pd.to_datetime(all_data['date'], format="%Y-%m-%d").dt.weekday

### 필요 없는 피처 제거

In [9]:
# Columns to exclude from the model inputs.
drop_feature = [
    'casual',
    'registered',
    'datetime',
    'date',
    'windspeed',
    'month',
]

In [10]:
# Remove every column listed in drop_feature from the combined frame.
all_data = all_data.drop(columns=drop_feature)

## 데이터 나누기

In [11]:
# Rows with a known count go to the training inputs; rows whose count is
# missing go to the test inputs.
has_count = all_data['count'].notna()
x_train = all_data[has_count]
x_test = all_data[~has_count]

In [12]:
# The target column must not be part of either feature matrix.
target_column = ['count']
x_train = x_train.drop(columns=target_column)
x_test = x_test.drop(columns=target_column)

In [13]:
# Target values, taken from the already-filtered training frame.
# NOTE(review): y keeps train's own index labels while x_train uses the renumbered
# index from the concat; the rows should match by position — confirm before any
# index-aligned pandas operation between the two.
y=train['count']

# 평가지표 함수(RMSLE)

In [14]:
def rmsle(y_true,y_pred,converEXP=True):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values.
    converEXP : bool, default True
        When True, both inputs are first passed through ``np.expm1``
        (use this when they are already log1p-transformed).

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2))``, where non-finite
        log values are first replaced by ``np.nan_to_num``.
    """
    actual, predicted = y_true, y_pred
    if converEXP:
        # Undo the log1p transform so both series are back on the original scale.
        actual, predicted = np.expm1(actual), np.expm1(predicted)

    # nan_to_num is applied to each side separately, before taking the difference.
    log_gap = np.nan_to_num(np.log1p(actual)) - np.nan_to_num(np.log1p(predicted))
    return np.sqrt(np.mean(log_gap ** 2))

$\sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(\log(y_i+1)-\log(\hat{y}_i+1)\right)^2}$

# 모델 훈련&성능 검증

In [15]:
from sklearn.linear_model import LinearRegression
# Linear regression model with default settings.
linear=LinearRegression()

In [16]:
# Train against log(1 + count); predictions are mapped back with expm1 before submission.
log_y=np.log1p(y)

In [17]:
linear.fit(x_train,log_y)  # train the model on the log-transformed target

LinearRegression()

In [18]:
preds=linear.predict(x_train)  # predictions for the training rows (in log1p space, like log_y)

In [19]:
# Both arguments are in log1p space, so rmsle is asked to undo the transform first.
train_rmsle = rmsle(log_y, preds, True)
print(f'선형 회귀의 RMSLE 값:{train_rmsle:.4f}')

선형 회귀의 RMSLE 값:1.0183


지금은 과정을 보여주려고 검증할 때 훈련데이터를 사용했는데 원래는 검증 데이터를 사용해야 함

# 예측 및 결과 제출

In [20]:
# The model was fitted on log1p(count), so its outputs are in log space.
linear_preds = linear.predict(x_test)
# expm1 maps the predictions back to counts. A negative log-space prediction
# would become a negative count, which is not a valid rental count, so the
# values are floored at zero before being written out.
submission['count'] = np.clip(np.expm1(linear_preds), 0, None)
submission.to_csv('linear.csv', index=False)