In [4]:
%load_ext autotime
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import datetime as datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from azure.datalake.store import core, lib, multithread
from sklearn.externals import joblib

from datetime import date
from getpass import getpass

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2 ms


In [None]:
def pickle_read(path):
    """Load and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, 'rb') as source:
        return pickle.load(source)

def pickle_write(data,path):
    """Pickle ``data`` into the file at ``path``.

    The file is opened in binary write mode (an existing file is
    overwritten) and closed before returning. Returns ``None``.
    """
    with open(path, mode="wb") as sink:
        pickle.dump(data, sink)


In [None]:
## formula to calculate r^2

def r_square(df,pred,actual):
    """Return R^2 of the predictions in ``df[pred]`` against ``df[actual]``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of
    squared differences between prediction and actual, and ``SS_tot`` is the
    sum of squared differences between actual and its mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    pred : str
        Name of the prediction column.
    actual : str
        Name of the column with the observed values.

    Returns
    -------
    float
        The R^2 value. A NaN in either sum propagates to the result.
    """
    observed = df[actual]
    # Work on temporaries instead of writing scratch columns into ``df``:
    # the caller's frame is never touched and an existing 'y_pred_diff' /
    # 'y_mean_diff' column cannot be overwritten and dropped by accident.
    ss_res = np.sum(np.asarray((df[pred] - observed) ** 2))
    ss_tot = np.sum(np.asarray((observed - observed.mean()) ** 2))
    return 1 - (ss_res / ss_tot)
## Define a function that log-transforms the given columns to normalize them.

def logFunc(df, columns):
    """Add a rounded natural-log version of each requested column to ``df``.

    For every column ``c`` a new column named ``c + '1'`` is written into
    ``df`` (in place). Zeros map to 0, every other value maps to
    ``log(value)`` rounded to 2 decimals. Negative inputs yield NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    columns : str or iterable of str
        Column name(s) to transform. A single name may be passed as a
        plain string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    for col in list(columns):
        # np.log is evaluated on every element before np.where picks the
        # branch, so log(0) / log(negative) would raise RuntimeWarnings even
        # though those results are discarded or intentionally left as NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            logged = np.where(df[col] == 0, 0, np.log(df[col]))
        df[col + str(1)] = np.round(logged, 2)
    return df

In [None]:
def _get_fsa(b0):
    b0['FSA'] = b0['POSTAL_CODE'].str.extract(r'(\w{3})')

In [None]:
def rmse_func(df,pred,actual):
    """Return the root mean squared error between ``df[pred]`` and ``df[actual]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    pred, actual : str
        Names of the prediction and observed-value columns.

    Returns
    -------
    float
        ``sqrt(mean((pred - actual) ** 2))``; rows where the difference is
        NaN are skipped by the mean.
    """
    # Use a temporary Series rather than a scratch column so an existing
    # 'y_pred_diff' column in the caller's frame is not overwritten/dropped.
    squared_error = (df[pred] - df[actual]) ** 2
    return np.sqrt(squared_error.mean())

In [None]:
## formula to calculate mae
## mean absolute error
def mae_func(df,pred,actual):
    """Return the mean absolute error between ``df[pred]`` and ``df[actual]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    pred, actual : str
        Names of the prediction and observed-value columns.

    Returns
    -------
    float
        ``mean(|pred - actual|)``; NaN differences are skipped by the mean.
    """
    # Temporary Series instead of a scratch column: the caller's frame (and
    # any existing 'y_pred_diff' column in it) is left untouched.
    absolute_error = abs(df[pred] - df[actual])
    return absolute_error.mean()

In [None]:
def mape_func(df,pred,actual):
    """Return the mean absolute percentage error as a fraction (not x100).

    Computed as ``mean(|actual - pred| / |actual|)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    pred, actual : str
        Names of the prediction and observed-value columns.

    Returns
    -------
    float
        The MAPE. Rows where ``actual`` is 0 contribute ``inf`` (or NaN for
        0/0, which the mean skips), exactly as the division produces them.
    """
    # Temporary Series instead of a scratch column: an existing
    # 'y_pred_diff_percent' column in the caller's frame is preserved.
    relative_error = abs((df[actual] - df[pred]) / df[actual])
    return relative_error.mean()

In [None]:
def maape_func(df,pred,actual):
    """Return the mean arctangent absolute percentage error (MAAPE).

    Computed as ``mean(arctan(|actual - pred| / |actual|))``, in radians.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    pred, actual : str
        Names of the prediction and observed-value columns.

    Returns
    -------
    float
        The MAAPE. NaN rows (e.g. 0/0) are skipped by the mean.
    """
    # Temporary Series instead of a scratch column: an existing
    # 'y_pred_diff_percent' column in the caller's frame is preserved.
    arctan_error = np.arctan(abs((df[actual] - df[pred]) / df[actual]))
    return arctan_error.mean()

In [None]:
def vwmape_func(df,pred,actual):
    """Return the volume-weighted MAPE: ``sum(|actual - pred|) / sum(actual)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns. It is not modified.
    pred, actual : str
        Names of the prediction and observed-value columns.

    Returns
    -------
    float
        Total absolute error divided by the total of the actual values.
    """
    # Temporary Series instead of a scratch column: an existing
    # 'y_pred_diff_percent' column in the caller's frame is preserved.
    absolute_error = abs(df[actual] - df[pred])
    return absolute_error.sum() / df[actual].sum()

In [None]:
def Light_GBM(x_train,y_train,x_test,y_test,cat_vars,early_stopping_rounds,num_boost_round,verbose_eval):
    """Train a LightGBM regression booster with a fixed baseline configuration.

    Parameters
    ----------
    x_train, y_train
        Training features and target, wrapped in an ``lgb.Dataset``.
    x_test, y_test
        Validation features and target, used as the only validation set.
    cat_vars
        Forwarded to ``lgb.train`` as ``categorical_feature``.
    early_stopping_rounds, num_boost_round, verbose_eval
        Forwarded unchanged to ``lgb.train``.

    Returns
    -------
    The object returned by ``lgb.train``.

    Notes
    -----
    NOTE(review): the booster type is 'dart'; LightGBM's documentation says
    early stopping is not available in dart mode -- confirm whether
    ``early_stopping_rounds`` has any effect here.
    """
    print()
    print(format('start train light gbm model with basic setup','*^82'))

    # Wrap the raw data; the validation set re-uses the training set's binning.
    train_set = lgb.Dataset(x_train, y_train)
    valid_set = lgb.Dataset(x_test, y_test, reference=train_set)

    # Fixed baseline hyper-parameters.
    baseline_params = dict(
        boosting_type='dart',
        objective='regression',
        metric={'l2', 'l1'},
        num_leaves=1000,
        learning_rate=0.3,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=0,
    )

    # Filled by lgb.train with per-iteration metrics; not returned.
    history = {}

    booster = lgb.train(
        baseline_params,
        train_set,
        num_boost_round=num_boost_round,
        valid_sets=valid_set,
        early_stopping_rounds=early_stopping_rounds,
        categorical_feature=cat_vars,
        evals_result=history,
        verbose_eval=verbose_eval,
    )

    print(format('basic model training is done','*^82'))
    return booster

In [None]:
## Refit on progressively smaller feature sets (dropping the least important feature first) and report metrics, to help select the top features.

def LightGBM_features_refined(model,x_train,y_train,x_test,y_test,target):
    """Print hold-out metrics for nested feature subsets ranked by gain importance.

    Features of ``model`` are sorted by ascending gain importance. Iteration
    ``i`` drops the ``i`` least important features, fits a default
    ``lgb.LGBMRegressor`` on the remaining ones, predicts on the test data and
    prints R2, MAE and ``1 - vwMAPE`` for that subset. Nothing is returned;
    the printed table is meant to guide a manual feature selection.

    Parameters
    ----------
    model
        Trained booster exposing ``feature_importance`` and ``feature_name``.
    x_train, y_train, x_test, y_test
        Train/test features and targets (pandas objects concatenated
        column-wise, so their indexes must line up).
    target
        Name of the target column inside ``y_train`` / ``y_test``.
    """
    print()
    print(format('start feature refined selection based on feature ranks','*^82'))
    features = pd.DataFrame(columns = ['feature'],data = model.feature_importance(importance_type='gain'))
    features['featureName'] = model.feature_name()
    # Least important first, so slicing from position i keeps the strongest features.
    features = features.sort_values('feature', ascending=True).reset_index()

    train = pd.concat([x_train,y_train],axis = 1)
    test  = pd.concat([x_test,y_test],axis = 1)

    # The targets do not depend on the feature subset: extract them once.
    y_train1 = train[target]
    y_test1 = test[target].reset_index(drop=True)

    print('running accumulative features with corresponding metrix:')
    for i in range(len(features)):
        selected = features['featureName'].iloc[i:].tolist()

        select_X_train = train[selected]
        selection_model = lgb.LGBMRegressor()
        selection_model.fit(select_X_train, y_train1)

        # Evaluate on the hold-out rows; a fresh 0..n-1 index lets the
        # prediction frame line up with features and target in the concat.
        select_X_test = test[selected].reset_index(drop=True)
        y_pred1 = selection_model.predict(select_X_test)
        y_pred_data1 = pd.DataFrame(columns = ['Pred'],data = y_pred1)

        test11 = pd.concat([select_X_test,y_pred_data1,y_test1],axis = 1)
        test11['Pred'] = round(test11['Pred'],2)

        r2_v1 = r_square(test11,'Pred',target)
        mae = mean_absolute_error(y_test1, y_pred1)
        accuruacy_adj = vwmape_func(test11,'Pred',target)

        print("---%s, Thresh=%d, n=%d, R2: %.2f, mae: %.2f, accuruacy_adj: %.3f---" % (
                features['featureName'][i], features['feature'][i], select_X_train.shape[1], r2_v1,mae ,1-accuruacy_adj))
    print('please select your refined features based on this analysis')

In [None]:
def ParameterOptimizer(model,parameters,x_train,y_train,cat_vars,early_stopping_rounds,eval_set): 
    """Grid-search ``parameters`` for ``model`` and print the winning configuration.

    Runs a 2-fold ``GridSearchCV`` on all cores. ``cat_vars``,
    ``early_stopping_rounds`` and ``eval_set`` are forwarded to the
    estimator's ``fit`` together with ``eval_metric='rmse'`` and
    ``verbose=0``.

    Side effect: installs a global "ignore" warnings filter that stays active
    after the function returns.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    import warnings

    print()
    print(format('To find optimal parameters for lightGBM using GridSearchCV for Regression','*^82'))    

    # Silences all warnings process-wide from here on.
    warnings.filterwarnings("ignore")

    # Extra keyword arguments handed through to the estimator's fit().
    fit_options = dict(
        categorical_feature=cat_vars,
        eval_metric='rmse',
        early_stopping_rounds=early_stopping_rounds,
        eval_set=eval_set,
        verbose=0,
    )

    search = GridSearchCV(estimator=model, param_grid=parameters, cv=2, n_jobs=-1)
    search.fit(x_train, y_train, **fit_options)

    # Report the outcome of the search.
    print("\n========================================================")
    print(" Results from Grid Search " )
    print("========================================================")    

    print("\n The best estimator across ALL searched params:\n", search.best_estimator_)
    print("\n The best score across ALL searched params:\n", search.best_score_)
    print("\n The best parameters across ALL searched params:\n", search.best_params_)

    print("\n ========================================================")
    return search


In [None]:
### generate prediction on holdout data

def predData(x_test,y_test,bestModel,target,log= True):
    """Predict on hold-out data and return it with a ``Pred`` column attached.

    Parameters
    ----------
    x_test, y_test
        Hold-out features and target (pandas objects sharing an index).
    bestModel
        Fitted search object; its ``best_estimator_`` makes the predictions.
    target
        Name of the target column, used for the R2 computation.
    log : bool, default True
        When ``False``, predictions ``<= 0`` are replaced by 0. Any other
        value leaves the (rounded) predictions unchanged.

    Returns
    -------
    pandas.DataFrame
        ``x_test`` and ``y_test`` side by side with the old index kept as a
        column (from ``reset_index``) plus ``Pred``, rounded to 2 decimals.
        R2 and RMSE on this frame are printed.
    """
    print(format("Start to predict on test/validation dataset",'*^82'))
    best_model= bestModel.best_estimator_
    y_pred = best_model.predict(x_test)

    ## create test dataset with prediction
    y_pred_data = pd.DataFrame(columns = ['Pred'],data = y_pred)

    # Reset the index so the positional prediction frame lines up row by row.
    test = pd.concat([x_test,y_test],axis = 1).reset_index()
    test1 = pd.concat([test,y_pred_data],axis = 1)

    test1['Pred'] = round(test1['Pred'],2)

    # Only an explicit False (or a value equal to it) turns on clipping.
    if log == False:
        test1['Pred'] = np.where(test1['Pred'] <= 0, 0, test1['Pred'])

    print('Calculate r2 for test/validation data')
    r2_v1 = r_square(test1,'Pred',target)

    print("r2 for test data: %.2f" % (r2_v1))
    # eval
    print('The rmse of prediction is:', mean_squared_error(y_test, test1['Pred']) ** 0.5)
    return test1

In [None]:
## define a function for plotting graph

def modelPlot(bestModel):
    """Show the feature-importance chart of a fitted search object's best estimator.

    At most 50 features are drawn via ``lgb.plot_importance``; the figure is
    displayed with ``plt.show()``. Nothing is returned.
    """
    estimator = bestModel.best_estimator_

    print('Plotting feature importances...')
    lgb.plot_importance(estimator, max_num_features=50)
    plt.show()

In [None]:
## plot by comparing prediction vs actuals
def predEval(test,prediction,actual):
    """Scatter actuals against predictions with a fitted trend and the ideal line.

    Predictions are on the horizontal axis and actuals on the vertical axis.
    Two dashed lines are added: a least-squares line of actuals as a function
    of predictions ('Actual_trend') and the identity line ('Ideal_trend').

    NOTE: this notebook defines ``predEval`` again in the following cell; the
    later definition is the one in effect after running all cells.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame holding both columns.
    prediction, actual : str
        Names of the prediction and observed-value columns.
    """
    actual_vals = test[actual]
    pred_vals = test[prediction]

    fig, ax = plt.subplots()

    # Scatter: prediction on x, actual on y (matches the axis labels below).
    ax.plot(pred_vals, actual_vals, 'or', label='_nolegend_')

    # Fit actual = f(prediction) so the trend line shares the scatter's axes.
    trend = np.poly1d(np.polyfit(pred_vals, actual_vals, 1))
    pred_span = np.array([np.min(pred_vals), np.max(pred_vals)])
    # Two end points are enough for a straight line and keep the dashes crisp.
    ax.plot(pred_span, trend(pred_span), '--', color='black', label='Actual_trend')

    # Identity line across the combined value range.
    low = min(np.min(pred_vals), np.min(actual_vals))
    high = max(np.max(pred_vals), np.max(actual_vals))
    ax.plot([low, high], [low, high], '--', color='green', label='Ideal_trend')

    ax.set_title("Prediction vs Actuals")
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Actuals')
    ax.legend(loc='upper left')

    plt.show()

In [None]:
## plot by comparing prediction vs actuals
def predEval(test,prediction,actual):
    """Scatter actuals against predictions with a fitted trend and the ideal line.

    Predictions are on the horizontal axis and actuals on the vertical axis.
    Two dashed lines are added: a least-squares line of actuals as a function
    of predictions ('Actual_trend') and the identity line ('Ideal_trend').

    NOTE: this is the second definition of ``predEval`` in the notebook (the
    previous cell defines it too); this one shadows the earlier one.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame holding both columns.
    prediction, actual : str
        Names of the prediction and observed-value columns.
    """
    actual_vals = test[actual]
    pred_vals = test[prediction]

    fig, ax = plt.subplots()

    # Scatter: prediction on x, actual on y (matches the axis labels below).
    ax.plot(pred_vals, actual_vals, 'or', label='_nolegend_')

    # Fit actual = f(prediction) so the trend line shares the scatter's axes.
    trend = np.poly1d(np.polyfit(pred_vals, actual_vals, 1))
    pred_span = np.array([np.min(pred_vals), np.max(pred_vals)])
    # Two end points are enough for a straight line and keep the dashes crisp.
    ax.plot(pred_span, trend(pred_span), '--', color='black', label='Actual_trend')

    # Identity line across the combined value range.
    low = min(np.min(pred_vals), np.min(actual_vals))
    high = max(np.max(pred_vals), np.max(actual_vals))
    ax.plot([low, high], [low, high], '--', color='green', label='Ideal_trend')

    ax.set_title("Prediction vs Actuals")
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Actuals')
    ax.legend(loc='upper left')

    plt.show()

In [None]:
def residulPlot1(test,prediction,actual,bins):
    """Show a histogram of the residuals ``test[prediction] - test[actual]``.

    ``bins`` is passed straight to ``Axes.hist``. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    observed = test[actual]
    residuals = test[prediction] - observed

    fig, ax = plt.subplots()
    ax.hist(residuals, bins=bins, color='blue', edgecolor='black')

    # Title and axis labels.
    ax.set_title("Residual Histogram")
    ax.set_xlabel('Residual')
    ax.set_ylabel('Freq')

    plt.show()

In [1]:
## This is a function that will be used by a hypothetical user.

def conf_int_user(df, target, level={'FSA': None, 'FULL_TERM_CLASS':None , 'DRIVER_SEX_CODE_D1':None}, pred = None, 
                  nb_samples=150, alpha = 0.1, adjust_int = True):
    """Empirical confidence interval around a prediction, from hold-out residuals.

    The hold-out rows are optionally narrowed to the requested ``level``
    values, then to the rows whose ``Pred`` lies closest to ``pred``; the
    ``alpha`` and ``1 - alpha`` quantiles of their residuals
    (``target - Pred``) are added to ``pred`` to form the interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Hold-out data containing the level columns, ``target`` and ``Pred``.
        It is copied, not modified.
    target : str
        Name of the column with the actual values.
    level : dict or None
        ``{column: value}`` filters. Entries whose value is ``None`` are
        ignored; a filter is applied only if at least ``nb_samples`` rows
        remain after it. The default dict is only read, never mutated.
    pred : float, optional
        Prediction to build the interval around. Defaults to the mean of
        ``Pred`` over the filtered rows.
    nb_samples : int
        Minimum number of rows wanted for the residual quantiles.
    alpha : float
        Tail probability; the nominal confidence level is ``100 * (1 - 2 * alpha)``.
    adjust_int : bool
        If True the interval is widened where necessary so it contains ``pred``.

    Returns
    -------
    tuple
        ``(q_mn, pred, q_mx, was_out, avg_truth)`` where ``q_mn`` / ``q_mx``
        are the lower / upper bounds, ``was_out`` is True when both residual
        quantiles have the same sign (``pred`` was outside the interval before
        any adjustment) and ``avg_truth`` is the mean of ``target`` over the
        rows used for the interval.
    """
    df1 = df.copy()
    if level is not None:
        for col, wanted in level.items():
            if wanted is None:
                continue
            subset = df1[df1[col] == wanted]
            # Keep the filter only when enough rows survive it.
            if subset.shape[0] >= nb_samples:
                df1 = subset
    if pred is None:
        pred = df1.Pred.mean()

    # Default to every remaining row. Previously ``df2`` was left undefined
    # (UnboundLocalError) whenever there were not more than nb_samples rows.
    df2 = df1

    # Rows a finite window around ``pred`` can ever capture (NaN / inf excluded).
    reachable = int(((df1.Pred > -np.inf) & (df1.Pred < np.inf)).sum())
    pred_is_finite = bool(-np.inf < pred < np.inf)

    # Grow a symmetric window around ``pred`` in steps of 5 until it holds
    # nb_samples rows. Skipped when that can never happen, which would
    # otherwise loop forever.
    if df1.shape[0] > nb_samples and reachable >= nb_samples and pred_is_finite:
        step = 1
        lent = 0
        while lent < nb_samples:
            df2 = df1[(df1.Pred < pred + 5 * step) & (df1.Pred > pred - 5 * step)]
            lent = df2.shape[0]
            step += 1

    avg_truth = df2[target].mean()
    res = df2[target] - df2.Pred
    q_mn , q_mx = res.quantile(alpha, interpolation='higher') , res.quantile(1-alpha, interpolation='lower')
    # Same-sign residual quantiles mean the interval lies entirely on one side of pred.
    was_out = q_mn*q_mx > 0
    q_mn = pred+q_mn
    q_mx = pred+q_mx
    if adjust_int:
        q_mn = np.amin([q_mn, pred])
        q_mx = np.amax([q_mx, pred])
    return (q_mn, pred, q_mx, was_out, avg_truth)

In [None]:
def preprocessing(silver_df, datalake=False):
    """Clean the raw policy frame and split it into model inputs and target.

    Steps: keep the modelling columns, coerce numeric columns, normalise
    ``POSTAL_CODE`` and ``MULTICAR_DISCOUNT_INDICATOR``, drop rows with
    missing mandatory fields or out-of-range values, then separate the target
    ``FULL_TERM_PREMIUM_COVERAGE`` from the remaining feature columns.

    Parameters
    ----------
    silver_df : pandas.DataFrame
        Raw data containing at least the columns listed in ``features``.
        It is not modified.
    datalake : bool, default False
        When ``True`` the two frames are pickled to the data lake and nothing
        is returned; otherwise they are returned.

    Returns
    -------
    (x_train, y_train) or None
        Feature frame (columns in their original order) and a one-column
        target frame, or ``None`` when ``datalake`` is True.

    Notes
    -----
    * Sets ``pd.options.display.float_format`` globally as a side effect.
    * The data-lake branch uses the names ``adl`` and ``today``, which are
      not defined in this function -- presumably notebook-level globals (a
      data-lake filesystem client and a date string); verify before use.
    * NOTE(review): DEDUCTIBLE_AMOUNT, LIMIT_1, VEHICLE_RATE_GROUP and FSA are
      referenced by the cleaning rules but are not in ``features``. Indexing
      them raised KeyError on every call, so those rules are now applied only
      to columns actually present. Confirm whether these columns should be
      added to ``features`` instead.
    """
    features = ['AGE_D1',
                'POSTAL_CODE',
                'COMPANY_CODE',
                'COVERAGE_CODE_AUTOMOBILE',
                'DISTANCE_DRIVEN_ANNUALLY',
                'VEHICLE_CODE1',
                'DRIVING_AGE',
                'VEHICLE_AGE',
                'MULTICAR_DISCOUNT_INDICATOR',
                'DRIVERS_NUM',
                'FULL_TERM_PREMIUM_COVERAGE',
                'DRIVER_SEX_CODE_D1',
                'AUTO_CLAIMS',
                'CONVICTION_NUM',
                'DRIVER_MARITAL_STATUS_CODE_D1',
                'VIN_SERIAL_NUMBER',
                'COVERAGE_CODE_AUTOMOBILE_CLASS'
                ]
    num_cols =['AGE_D1',
               'DEDUCTIBLE_AMOUNT',
               'DISTANCE_DRIVEN_ANNUALLY',
               'LIMIT_1',
               'VEHICLE_RATE_GROUP',
               'DRIVING_AGE',
               'VEHICLE_AGE',
               'DRIVERS_NUM',
               'AUTO_CLAIMS',
               'CONVICTION_NUM',
               'FULL_TERM_PREMIUM_COVERAGE'
                ]

    target = ['FULL_TERM_PREMIUM_COVERAGE']

    # Rows missing any of these are dropped.
    required_cols = ['DRIVING_AGE',
                     'VEHICLE_CODE1',
                     'VIN_SERIAL_NUMBER',
                     'DISTANCE_DRIVEN_ANNUALLY',
                     'CONVICTION_NUM',
                     'DRIVER_SEX_CODE_D1',
                     'DRIVER_MARITAL_STATUS_CODE_D1',
                     'FSA'
                     ]
    # Missing values in these become 0.
    zero_fill_cols = ['LIMIT_1','DEDUCTIBLE_AMOUNT','FULL_TERM_PREMIUM_COVERAGE']

    # Copy so the column assignments below never write into the caller's frame.
    train_df = silver_df[features].copy()

    def _present(cols):
        """Return the names in ``cols`` that are columns of ``train_df``."""
        return [c for c in cols if c in train_df.columns]

    # --- column-wise transforms (row-independent, so done before filtering) ---
    for col in _present(num_cols):
        train_df[col] = pd.to_numeric(train_df[col],errors='coerce')

    fill_cols = _present(zero_fill_cols)
    train_df[fill_cols] = train_df[fill_cols].fillna(0)

    # Keep only a letter-digit-word-digit-word-digit code; anything else becomes NaN.
    train_df['POSTAL_CODE'] = train_df['POSTAL_CODE'].str.extract(r'([A-Z]\d\w\d\w\d)',expand=False)
    train_df['MULTICAR_DISCOUNT_INDICATOR'] = train_df['MULTICAR_DISCOUNT_INDICATOR'].apply(lambda x: 1 if x == 'Y' else 0)

    # --- row filters ---
    premium = train_df['FULL_TERM_PREMIUM_COVERAGE']
    keep = (
        train_df[_present(required_cols) + ['POSTAL_CODE']].notnull().all(axis=1)
        # set range for age
        & (train_df['AGE_D1'] >= 15) & (train_df['AGE_D1'] <= 95)
        # set range for full_term_premium
        & (premium > -200) & (premium < 1000)
        # set range for DRIVING_AGE
        & (train_df['DRIVING_AGE'] >= 0) & (train_df['DRIVING_AGE'] <= 85)
    )
    train_df = train_df[keep]

    # Global display option, kept for notebook output formatting.
    pd.options.display.float_format = '{:,.2f}'.format

    # Preserve the original column order; a set difference would shuffle the
    # feature order from run to run.
    input_cols = [c for c in train_df.columns if c not in target]

    x_train = train_df[input_cols]
    y_train = train_df[target]

    if datalake ==True:
        with adl.open('/LeadGen/Raw/Silver/AUTO/Train_data/x_train_'+today+'.pkl', 'wb') as f:
            pickle.dump(x_train,f,pickle.HIGHEST_PROTOCOL)
        print('file has been created')
        with adl.open('/LeadGen/Raw/Silver/AUTO/Train_data/y_train_'+today+'.pkl', 'wb') as f:
            pickle.dump(y_train,f,pickle.HIGHEST_PROTOCOL)
        print('file has been created')
    else:

        return x_train,y_train