In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [3]:
print('Read data')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Read data


In [4]:
# datetime 형태로 변형 => datetype을 변경하면 시간대별 추출이 쉬움
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

In [5]:
# store and fwd flag 컬럼 데이터를 label encoder 변환을 통해 수치형 데이터로
le = LabelEncoder()
le.fit(train['store_and_fwd_flag'])
train['store_and_fwd_flag'] = le.transform(train['store_and_fwd_flag'])
test['store_and_fwd_flag'] = le.transform(test['store_and_fwd_flag'])

In [6]:
# 새로운 피처 생성 1. datetime 데이터에서 월, 일, 요일, 시간, 분을 추출
print('Create features')
train['month'] = train['pickup_datetime'].dt.month
train['day'] = train['pickup_datetime'].dt.day
train['weekday'] = train['pickup_datetime'].dt.weekday
train['hour'] = train['pickup_datetime'].dt.hour
train['minute'] = train['pickup_datetime'].dt.minute

test['month'] = test['pickup_datetime'].dt.month
test['day'] = test['pickup_datetime'].dt.day
test['weekday'] = test['pickup_datetime'].dt.weekday
test['hour'] = test['pickup_datetime'].dt.hour
test['minute'] = test['pickup_datetime'].dt.minute

Create features


In [7]:
# 새로운 피처 생성 2. 픽업, 드랍 거리 측정
# longitude distance
train['dist_long'] = train['pickup_longitude'] - train['dropoff_longitude']
test['dist_long'] = test['pickup_longitude'] - test['dropoff_longitude']

# latitude distance
train['dist_lat'] = train['pickup_latitude'] - train['pickup_latitude']
test['dist_lat'] = test['pickup_latitude'] - test['pickup_latitude']

# distance : 피타고라스 정리를 이용하여 거리 측정
train['dist'] = np.sqrt(np.square(train['dist_long']) + np.square(train['dist_lat']))
test['dist'] = np.sqrt(np.square(test['dist_long']) + np.square(test['dist_lat']))

In [8]:
# spatial features : count and speed
# 픽업 드랍 위치를 소수점 셋째자리까지
train['pickup_longitude_bin'] = np.round(train['pickup_longitude'],2)
train['pickup_latitude_bin'] = np.round(train['pickup_latitude'],2)
train['dropoff_longitude_bin'] = np.round(train['dropoff_longitude'],2)
train['dropoff_latitude_bin'] = np.round(train['dropoff_latitude'],2)

test['pickup_longitude_bin'] = np.round(test['pickup_longitude'],2)
test['pickup_latitude_bin'] = np.round(test['pickup_latitude'],2)
test['dropoff_longitude_bin'] = np.round(test['dropoff_longitude'],2)
test['dropoff_latitude_bin'] = np.round(test['dropoff_latitude'],2)

# count features
# train test을 합치고 pickup 위도와 경도로 묶어줌 -> size 함수로 각 위치에 데이터가 몇개 있는지 a, b 객체에 저장
a = pd.concat([train, test]).groupby(['pickup_longitude_bin', 'pickup_latitude_bin']).size().reset_index()
b = pd.concat([train, test]).groupby(['dropoff_longitude_bin', 'dropoff_latitude_bin']).size().reset_index()

# train data와 각 위치별 몇개의 데이터가 있는지 저장된 객체 a를 left join해줌
train = pd.merge(train, a, on=['pickup_longitude_bin', 'pickup_latitude_bin'], how='left')
test = pd.merge(test, a, on=['pickup_longitude_bin', 'pickup_latitude_bin'], how='left')

train = pd.merge(train, b, on=['dropoff_longitude_bin', 'dropoff_latitude_bin'], how='left')
test = pd.merge(test, b, on=['dropoff_longitude_bin', 'dropoff_latitude_bin'], how='left')


In [9]:
#speed features : 100000 * 거리 / 시간 : 거리단위가 위도, 경도 단위인데 speed 단위가 가능한가?
train['speed'] = 100000 * train['dist'] / train['trip_duration']

In [12]:
# train data에서 픽업 위도 경도 별로 그룹핑 한 후 속도의 평균을 구한 a 객체 생성
a = train[['speed', 'pickup_longitude_bin', 'pickup_latitude_bin']].groupby(['pickup_longitude_bin', 'pickup_latitude_bin']).mean().reset_index()
a = a.rename(columns = {'speed' : 'ave_speed'})
b = train[['speed', 'dropoff_longitude_bin', 'dropoff_latitude_bin']].groupby(['dropoff_longitude_bin', 'dropoff_latitude_bin']).mean().reset_index()
b = b.rename(columns = {'speed' : 'ave_speed'})

In [17]:
# 원래 train, test data에 위치별 평균 스피드 추가하기
train = pd.merge(train, a, on = ['pickup_longitude_bin', 'pickup_latitude_bin'], how = 'left')
test = pd.merge(test, a, on = ['pickup_longitude_bin', 'pickup_latitude_bin'], how = 'left')

train = pd.merge(train, b, on = ['dropoff_longitude_bin', 'dropoff_latitude_bin'], how = 'left')
test = pd.merge(test, b, on = ['dropoff_longitude_bin', 'dropoff_latitude_bin'], how = 'left')


  train = pd.merge(train, a, on = ['pickup_longitude_bin', 'pickup_latitude_bin'], how = 'left')
  test = pd.merge(test, b, on = ['dropoff_longitude_bin', 'dropoff_latitude_bin'], how = 'left')


In [19]:
# drop bins
train = train.drop(['speed', 'pickup_longitude_bin', 'pickup_latitude_bin', 'dropoff_longitude_bin', 'dropoff_latitude_bin'], axis = 1)
test = test.drop(['speed', 'pickup_longitude_bin', 'pickup_latitude_bin', 'dropoff_longitude_bin', 'dropoff_latitude_bin'], axis = 1)



KeyError: "['speed' 'pickup_longitude_bin' 'pickup_latitude_bin'\n 'dropoff_longitude_bin' 'dropoff_latitude_bin'] not found in axis"

In [20]:
# train.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'month', 'day', 'weekday', 'hour', 'minute',
       'dist_long', 'dist_lat', 'dist', '0_x', '0_y', 'ave_speed_x',
       'ave_speed_y', 'ave_speed_x', 'ave_speed_y', 'ave_speed_x',
       'ave_speed_y', 'ave_speed'],
      dtype='object')

In [24]:
## weather data 불러오기
weather = pd.read_csv('KNYC_Metars.csv')
#weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8787 entries, 0 to 8786
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Time        8787 non-null   object 
 1   Temp.       8787 non-null   float64
 2   Windchill   2295 non-null   float64
 3   Heat Index  815 non-null    float64
 4   Humidity    8787 non-null   float64
 5   Pressure    8556 non-null   float64
 6   Dew Point   8787 non-null   float64
 7   Visibility  8550 non-null   float64
 8   Wind Dir    8787 non-null   object 
 9   Wind Speed  8787 non-null   float64
 10  Gust Speed  8787 non-null   float64
 11  Precip      8787 non-null   float64
 12  Events      8787 non-null   object 
 13  Conditions  8787 non-null   object 
dtypes: float64(10), object(4)
memory usage: 961.2+ KB


In [25]:
weather['Time']

0       2015-12-31 02:00:00
1       2015-12-31 03:00:00
2       2015-12-31 04:00:00
3       2015-12-31 05:00:00
4       2015-12-31 06:00:00
               ...         
8782    2017-01-01 21:00:00
8783    2017-01-01 22:00:00
8784    2017-01-01 23:00:00
8785    2017-01-02 00:00:00
8786    2017-01-02 01:00:00
Name: Time, Length: 8787, dtype: object

In [26]:
weather['Time'] = pd.to_datetime(weather['Time'])
weather['year'] = weather['Time'].dt.year
weather['month'] = weather['Time'].dt.month
weather['day'] = weather['Time'].dt.day
weather['hour'] = weather['Time'].dt.hour
weather = weather[weather['year'] == 2016]

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1458644 entries, 0 to 1458643
Data columns (total 28 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   id                  1458644 non-null  object        
 1   vendor_id           1458644 non-null  int64         
 2   pickup_datetime     1458644 non-null  datetime64[ns]
 3   dropoff_datetime    1458644 non-null  object        
 4   passenger_count     1458644 non-null  int64         
 5   pickup_longitude    1458644 non-null  float64       
 6   pickup_latitude     1458644 non-null  float64       
 7   dropoff_longitude   1458644 non-null  float64       
 8   dropoff_latitude    1458644 non-null  float64       
 9   store_and_fwd_flag  1458644 non-null  int64         
 10  trip_duration       1458644 non-null  int64         
 11  month               1458644 non-null  int64         
 12  day                 1458644 non-null  int64         
 13  weekday     

In [29]:
# train, test data 와 날씨 데이터 결합
train = pd.merge(train, weather[['Temp.', 'month', 'day', 'hour']], on = ['month', 'day', 'hour'], how = 'left')
test = pd.merge(test, weather[['Temp.', 'month', 'day', 'hour']], on = ['month', 'day', 'hour'], how = 'left')


In [30]:
## train / test features, y, id
xtrain = train.drop(['id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration'], axis = 1).values
xtest = test.drop(['id', 'pickup_datetime',], axis =1).values

ytrain = train['trip_duration'].values

id_train = train['id'].values
id_test = test['id'].values



In [31]:
del(train, test)

In [44]:
### xgb parameter
params = {
    'booster' : 'gbtree',
    'obgective' : 'reg:squarederror',
    'learning_rate' : 0.1,
    'max_depth' : 14,
    'subsample' : 0.8,
    'colsample_bytree': 0.7,
    'colsample_bylevel' : 0.7,
    'verbosity' : 1
}

In [48]:
## number of rounds
nrounds = 20

In [42]:
np.log(ytrain+1)

array([6.12249281, 6.49828215, 7.66152708, ..., 6.63987583, 5.9242558 ,
       5.29330482])

In [None]:
## train model
print('Train model...')
# xgb를 사용하기 위해 훈려 ㄴ데이터를 dmatrix형태로 변환
# ytrain 데이터를 np.log(ytrain+1)을 하는 이유??? 아마 정규화와 관련있지 않을까,,
dtrain = xgb.DMatrix(xtrain, np.log(ytrain+1))
gbm = xgb.train(params,
               dtrain,
               num_boost_round = nrounds)

Train model...
Parameters: { "obgective" } are not used.



In [46]:
## test predictions
pred_test = np.exp(gbm.predict(xgb.DMatrix(xtest))) -1

NameError: name 'gbm' is not defined

In [47]:
## create submisson
df = pd.DataFrame({'id' : id_test, 'trip_duration' : pred_test})
df = df.set_index('id')
df.to_csv('sub_bench.csv', index=True)

NameError: name 'pred_test' is not defined