In [1]:
import pandas as pd

# Load the training set, the test set and the sample submission, in that order.
train, test, submission = (
    pd.read_csv(path) for path in ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

### 이상치 제거
이상치(outlier)란 일반적인 값의 범위에서 크게 벗어난, 아주 작거나 아주 큰 값을 의미한다.

In [2]:
# Keep only the training rows whose weather code is not 4.
train = train.loc[train['weather'].ne(4)]

### 데이터 합치기

## 10885(weather=4 제거 후 train) + 6493(test) -> 17378 rows -> 단, ignore_index를 지정하지 않으면 합칠 때 기존 인덱스가 그대로 유지되어 인덱스가 중복됨!

In [3]:
# Stack train on top of test; without ignore_index each frame keeps its own row labels.
all_data_temp = pd.concat(objs=[train, test], axis=0)
all_data_temp

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


# 합치기 수정 (ignore_index=True) -> 기존 인덱스를 무시하고 0부터 새로 번호를 매겨 뒤에 바로 이어 붙임

In [4]:
# Stack train on top of test and renumber the rows 0..n-1 so labels are unique.
all_data = pd.concat(objs=[train, test], axis=0, ignore_index=True)
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


## 파생 변수(피처) 추가

In [5]:
from datetime import datetime

In [6]:
# Parse the timestamp column once and derive every calendar feature with the
# vectorized .dt accessor. This avoids re-splitting each string row by row and
# stores year/month/hour as integers instead of strings.
parsed_datetime = pd.to_datetime(all_data['datetime'])

all_data['date'] = parsed_datetime.dt.strftime('%Y-%m-%d')  # kept as a 'YYYY-MM-DD' string
all_data['year'] = parsed_datetime.dt.year
all_data['month'] = parsed_datetime.dt.month
all_data['hour'] = parsed_datetime.dt.hour
all_data['weekday'] = parsed_datetime.dt.weekday  # Monday=0 ... Sunday=6, same as datetime.weekday()


### 다른 방법
all_data['datetime'] = pd.to_datetime(all_data['datetime'])

all_data['year'] = all_data['datetime'].dt.year # 연도
all_data['month'] = all_data['datetime'].dt.month # 월
all_data['hour'] = all_data['datetime'].dt.hour # 시간
all_data['weekday'] = all_data['datetime'].dt.weekday #요일

### 불필요한 피처 제거

In [7]:
# Columns removed before modelling (reasons as given by the author):
#   'casual', 'registered' -> not available in the test data
#   'datetime', 'date'     -> already represented by the derived date/time features
#   'month'                -> overlaps with the season feature
#   'windspeed'            -> weakly correlated with the target
drop_features = ['casual', 'registered', 'datetime', 'date', 'month', 'windspeed']

all_data = all_data.drop(columns=drop_features)

# 피처 선택!
## 모델링 시 데이터의 특징을 잘  나타내는 주요 피처만 선택하는 작업(feature selection)
## 타깃값 예측과 관련 없는 피처가 많다면 오히려 예측 성능이 떨어짐. 많다고 좋은게 아님.
## 타깃값과 관련 있는 피처만 골라서 사용해야 함!

### 데이터 나누기

In [8]:
# Split the combined frame back into train and test.
# Rows with a 'count' value are training rows; rows where it is missing are test rows.
X_train = all_data.loc[all_data['count'].notna()]
X_test = all_data.loc[all_data['count'].isna()]

In [9]:
# Remove the target column 'count' from both feature matrices.
X_train = X_train.drop(columns='count')
X_test = X_test.drop(columns='count')

### 타깃값

In [10]:
# Keep the target values in their own variable.
y = train.loc[:, 'count']

In [11]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,2011,0,5
1,1,0,0,1,9.02,13.635,80,2011,1,5
2,1,0,0,1,9.02,13.635,80,2011,2,5
3,1,0,0,1,9.84,14.395,75,2011,3,5
4,1,0,0,1,9.84,14.395,75,2011,4,5


## 평가지표 함수

In [12]:
import numpy as np


In [13]:
def rmsle(y_true, y_pred, convertExp=True):
    """Return the root mean squared logarithmic error between two value sets.

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.
    convertExp : bool, default True
        When True, both inputs are treated as log-scale values and are mapped
        back with ``np.exp`` before the error is computed.

    Returns
    -------
    float
        ``sqrt(mean((log(y_true + 1) - log(y_pred + 1)) ** 2))``.
    """
    # Coerce to float arrays so plain lists/tuples work on every code path
    # (a list would otherwise fail on ``y + 1`` when convertExp is False).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if convertExp:
        # Undo the natural-log transform: e ** x.
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log(1 + x); NaNs produced by the log (e.g. for x < -1) are replaced with 0.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    # Root of the mean squared difference of the log values.
    return np.sqrt(np.mean((log_true - log_pred) ** 2))
    

# 모델 훈련

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# Instantiate scikit-learn's LinearRegression with its default parameters.
linear_reg_model = LinearRegression()

In [16]:
# Log-transform the target values (natural log).
log_y = np.log(y)
# Train the model on the log-scale target.
linear_reg_model.fit(X=X_train, y=log_y)

LinearRegression()

# 모델 성능 검증

In [17]:
# Predict on the training features; the model was fit on log_y, so these
# predictions are on the log scale.
preds = linear_reg_model.predict(X_train)

In [18]:
# Report RMSLE on the training data. Both arguments are log-scale, so
# convertExp=True maps them back with exp before the error is computed.
print(f'선형회귀의 RMSLE 값 : {rmsle(log_y, preds, True):.4f}')

선형회귀의 RMSLE 값 : 1.0205


In [19]:
# Predict on the test features (log scale, because the model was fit on log_y).
linerregpreds = linear_reg_model.predict(X=X_test)
# Map the predictions back to the original scale with exp and store them.
submission['count'] = np.exp(linerregpreds)


In [20]:
# 파일 저장
# submission.to_csv('submission.csv',index=False)