# This notebook shows how to train complete models with several popular tools: random forest regressor, XGBoost and LightGBM

    Instructor: Yimin Nie
    Email: ymnie888@gmail.com
    
    In this notebook, I walk through the entire pipeline using the taxi trip data set, and show how to put all the working code into 
    a Python project so you can run it

## 1. Import useful libs 

In [None]:
import datetime
import gc
import math
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

## 2. Load data

In [None]:
# Directory that holds the taxi-trip data files; the string already ends with a path separator.
# NOTE(review): hard-coded, machine-specific Windows path - adjust before running on another machine.
data_path = "C:\\Users\\enxxmnx\\OneDrive - Ericsson AB\\workspace\\training\\data\\taxi_trip\\"

In [None]:
# Only these columns are read from the training archive; everything else in the CSV is skipped.
usecols = [
    'pickup_datetime',
    'dropoff_datetime',
    'store_and_fwd_flag',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'trip_duration',
]
# Load the zipped training CSV straight into a DataFrame.
df = pd.read_csv(
    data_path + '\\train.zip',
    compression='zip',
    usecols=usecols,
)

## 3. Process the data and extract features

In [None]:
def distance(lon1, lat1, lon2, lat2):
    """Return the great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in degrees.

    Returns
    -------
    float
        Distance along the sphere of radius 6371 km.
    """
    radius = 6371  # km
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2
    a = (math.sin(half_dlat) ** 2
         + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2))
         * math.sin(half_dlon) ** 2)
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

In [None]:
def _haversine_km(lon1, lat1, lon2, lat2):
    """Vectorised haversine distance in km between arrays/Series of points given in degrees."""
    radius = 6371  # km
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    a = (np.sin(half_dlat) ** 2
         + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(half_dlon) ** 2)
    # Guard against rounding pushing `a` marginally outside [0, 1] (sqrt of a negative -> NaN).
    a = np.clip(a, 0.0, 1.0)
    return 2 * radius * np.arctan2(np.sqrt(a), np.sqrt(1 - a))


# --- Target: trip duration in seconds, recomputed from the two timestamps ---
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds()
df['date'] = df['pickup_datetime'].dt.date

# --- Drop rows with a zero coordinate or a non-positive duration ---
condition = (df['pickup_longitude'] != 0) & (df['pickup_latitude'] != 0) & (df['dropoff_longitude'] != 0) \
            & (df['dropoff_latitude'] != 0) & (df['trip_duration'] > 0)
# .copy() so the column assignments below write to an independent frame rather than
# to a slice of the unfiltered one (avoids SettingWithCopyWarning / lost writes).
df = df.loc[condition].copy()
df.sort_values('pickup_datetime', inplace=True, ascending=True)

# --- Calendar features derived from the pickup time ---
df['month'] = df['pickup_datetime'].dt.month
df['day'] = df['pickup_datetime'].dt.day
df['hour'] = df['pickup_datetime'].dt.hour
df['min'] = df['pickup_datetime'].dt.minute
df['dow'] = df['pickup_datetime'].dt.weekday
df['doy'] = df['pickup_datetime'].dt.dayofyear
df.drop(['pickup_datetime', 'dropoff_datetime'], axis=1, inplace=True)
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'N': 0, 'Y': 1})

# --- Pickup -> dropoff distance, computed on whole columns at once ---
# (replaces a row-wise apply that indexed each row positionally with x[0]..x[3])
df['dist'] = _haversine_km(df['pickup_longitude'], df['pickup_latitude'],
                           df['dropoff_longitude'], df['dropoff_latitude'])

# 'dist' is included so the distance computed above actually reaches the models;
# previously it was calculated and then left out of the feature list.
features = ['store_and_fwd_flag', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
            'dropoff_latitude', 'passenger_count', 'month', 'day', 'hour', 'min', 'dow', 'doy',
            'dist',
            ]

# --- Time-based split: trips from 2016-06-01 onwards are held out as the test set ---
test = df.loc[df.date >= datetime.date(2016, 6, 1)]
train = df.loc[df.date < datetime.date(2016, 6, 1)]
# The models are trained on log1p(duration), so an RMSE on this target is an RMSLE on seconds.
y_train = np.log1p(train['trip_duration']).values
# Reset the index so row labels coincide with positions 0..n-1.
X_train = train[features].reset_index(drop=True)
del train, df
gc.collect()

## 4. Build your models


    Before building your models, make sure you have decided on:
        (1) your target (regression or classification)
        (2) the evaluation metric appropriate for that target
        (3) how to train your model (here I use 5-fold cross-validation)
        

### Define some models 

In [None]:
def model_lgb():
    """Return an unfitted ``lgb.LGBMRegressor`` configured with this notebook's fixed settings.

    The large ``n_estimators`` value is only an upper bound when the caller fits
    with early stopping.
    """
    return lgb.LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        learning_rate=0.02,
        feature_fraction=0.9,
        bagging_fraction=0.7,
        bagging_freq=10,
        verbose=-1,
        silent=-1,
        max_depth=10,
        num_leaves=128,
        max_bin=512,
        n_estimators=100000,
    )

def model_xgb():
    """Return an unfitted ``xgb.XGBRegressor`` (scikit-learn API) with fixed hyper-parameters."""
    hyper_params = {
        'colsample_bytree': 0.4,
        'gamma': 0,
        'learning_rate': 0.07,
        'max_depth': 3,
        'min_child_weight': 1.5,
        'n_estimators': 10000,
        'reg_alpha': 0.75,
        'reg_lambda': 0.45,
        'subsample': 0.6,
        'seed': 42,
    }
    return xgb.XGBRegressor(**hyper_params)

def model_rf():
    """Return an unfitted ``RandomForestRegressor`` whose settings come from ``config``.

    Reads ``n_estimator``, ``max_depth``, ``seed`` and ``n_jobs`` from a
    module-level ``config`` object.
    NOTE(review): ``config`` is not defined in the visible part of this file -
    it must be supplied before this function is called.
    """
    rf_kwargs = {
        'n_estimators': config.n_estimator,
        'max_depth': config.max_depth,
        'random_state': config.seed,
        'n_jobs': config.n_jobs,
    }
    return RandomForestRegressor(**rf_kwargs)


### Use k-fold CV

In [None]:
# 5-fold cross-validation: fit one model per fold, record its validation RMSE
# and pickle it as '<model_name>_model_<fold>.pkl'.
# NOTE(review): KFold is used without shuffling, so folds are contiguous row ranges
# of X_train - confirm that is the intended validation scheme.
kf = KFold(5)
cv_scores = []  # validation RMSE (on the y_train scale) of each fold
model_name = 'xgb'  # one of 'lgb', 'rf', 'xgb'
for i, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
    print('FOLD {} \n'.format(i))
    # KFold yields positional indices, so select rows by position.
    X_tr, y_tr = X_train.iloc[tr_idx], y_train[tr_idx]
    X_vl, y_vl = X_train.iloc[vl_idx], y_train[vl_idx]

    if model_name == 'lgb':
        model = model_lgb()
        # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments belong to the
        # older LightGBM API; newer releases expect lgb.early_stopping / lgb.log_evaluation
        # callbacks instead - confirm against the installed version.
        model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_vl, y_vl)],
                  eval_metric='rmse', verbose=200, early_stopping_rounds=500)
        vl_pred = model.predict(X_vl)

    elif model_name == 'rf':
        model = model_rf()
        model.fit(X_tr, y_tr)
        vl_pred = model.predict(X_vl)

    elif model_name == 'xgb':
        # The native training API is used here, so no XGBRegressor wrapper is built.
        train_data = xgb.DMatrix(X_tr, label=y_tr)
        valid_data = xgb.DMatrix(X_vl, label=y_vl)
        # Early stopping monitors the last entry of this list (the validation set).
        evallist = [(train_data, 'train'), (valid_data, 'valid')]
        parms = {
            'max_depth': 15,               # maximum depth of a tree
            'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
            'eta': 0.05,                   # learning rate
            'subsample': 0.9,              # fraction of rows sampled per tree
            'lambda': 3,                   # L2 regularisation; larger is more conservative
            'colsample_bytree': 0.6,       # fraction of columns sampled per tree
            'colsample_bylevel': 0.7,      # fraction of columns sampled per tree level
            'min_child_weight': 0.5,
            # 'nthread' is left unset so XGBoost picks its default thread count
            'eval_metric': 'rmse',
        }
        # Up to 2000 boosting rounds, stopping once validation RMSE has not
        # improved for 50 rounds.
        model = xgb.train(parms, train_data, num_boost_round=2000, evals=evallist,
                          early_stopping_rounds=50, maximize=False,
                          verbose_eval=100)
        # Scored with default predict settings, i.e. the way the pickled booster
        # will predict once it is loaded again.
        vl_pred = model.predict(valid_data)
        del train_data, valid_data

    else:
        raise ValueError('unknown model_name: {!r}'.format(model_name))

    fold_rmse = float(np.sqrt(np.mean((vl_pred - y_vl) ** 2)))
    cv_scores.append(fold_rmse)
    print('FOLD {} validation RMSE: {:.5f}'.format(i, fold_rmse))

    # File name is derived from model_name so each model type gets its own files.
    with open('{}_model_{}.pkl'.format(model_name, i), 'wb') as handle:
        pickle.dump(model, handle)

    # Free the fold's objects before the next fold allocates its own.
    del model, X_tr, X_vl, vl_pred
    gc.collect()

print('CV RMSE: mean {:.5f}, std {:.5f}'.format(np.mean(cv_scores), np.std(cv_scores)))