### Declare libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
# libraries for numerical computing and data handling
import numpy as np
import pandas as pd
# libraries for visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme(style="white")
import os
# library to save and load the models
import pickle

### Define helper functions and select the variables that I am going to use

In [2]:
# función creada por mi para traer los NA's
def get_na(df):
    qsna=df.shape[0]-df.isnull().sum(axis=0)
    qna=df.isnull().sum(axis=0)
    ppna=round(100*(df.isnull().sum(axis=0)/df.shape[0]),2)
    aux= {'datos sin NAs en q': qsna, 'Na en q': qna ,'Na en %': ppna}
    na=pd.DataFrame(data=aux)
    return na.sort_values(by='Na en %',ascending=False)

### Set the folder location of the data and load the data

In [3]:
# you need to change it if you are going to run it on your local machine, writing the folder where the raw data of the challenge is
os.chdir('/Users/iairlinker/Documents/repos/cornershop_test/data')
# Loading pre-processed dataset
model_data = pd.read_csv('interim/results.csv', sep=';')
model_data['is_weighted'] = model_data.KG.apply(lambda x: 1 if x>0 else 0)
model_data['on_demand'] = model_data.on_demand.apply(lambda x: 1 if x>0 else 0)
col_names = ['on_demand', 'is_weighted','UN','picking_speed', 
             'seniority','order_stores_distance',
             'total_minutes',
            ]                    
X = model_data[col_names]
y = X.total_minutes
X.drop('total_minutes', axis=1, inplace=True)
X = pd.get_dummies(X)
get_na(X)

Unnamed: 0,datos sin NAs en q,Na en q,Na en %
UN,1995,5,0.25
on_demand,2000,0,0.0
is_weighted,2000,0,0.0
picking_speed,2000,0,0.0
order_stores_distance,2000,0,0.0
seniority_ADVANCED,2000,0,0.0
seniority_BEGINNER,2000,0,0.0
seniority_INTERMEDIATE,2000,0,0.0
seniority_REVIEW,2000,0,0.0


### Load the models into objects and make the predictions, which follow these rules:

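* if a row contains any NA, the prediction comes from the model loaded from `processed/xgb_reg.pkl` (an XGBoost regressor, judging by its file name; originally described here as a random forest regression)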

* if a row contains no NA, the prediction comes from a gradient boosting model that uses the quantile loss function

In [4]:
file_name = "processed/gbq_reg.pkl"
# load the model
model_gbq_loaded = pickle.load(open(file_name, "rb"))

In [5]:
file_name = "processed/xgb_reg.pkl"
# load the model
model_xgb_loaded = pickle.load(open(file_name, "rb"))

In [6]:
# predict
model_data.loc[X.isna().any(axis=1), 'total_minutes'] = model_xgb_loaded.predict(X[X.isna().any(axis=1)])
model_data.loc[~X.isna().any(axis=1),'total_minutes'] = model_gbq_loaded.predict(X[~X.isna().any(axis=1)])
model_data.head()

Unnamed: 0,order_id,orders_lat,orders_lng,promised_time,on_demand,shopper_id,store_branch_id,total_minutes,promised_time_hours,promised_time_id,...,stores_lng,stores_location,store_district,seniority,found_rate,picking_speed,accepted_rate,rating,order_stores_distance,is_weighted
0,3a226ea48debc0a7ae9950d5540f2f34,-32.987022,-71.544842,2019-10-19 14:54:00+00:00,1,a5b9ddc0d82e61582fca19ad43dbaacb,07563a3fe3bbe7e3ba84431ad9d055af,98.735652,2019-10-19 14:00:00,19-14,...,-71.545615,"Mall Espacio Urbano, 961, Avenida Benidorm, Po...",Viña del Mar,INTERMEDIATE,0.8313,2.57,0.76,4.92,2.35124,0
1,9bf29b56619fcaf60b52690a848e10bb,-33.330724,-70.547074,2019-10-18 23:47:00+00:00,1,61e4ad15c3ff928840ebd34407055b33,33e75ff09dd601bbe69f351039152189,105.094171,2019-10-18 23:00:00,18-23,...,-70.515415,"Mall Portal La Dehesa, 1445, Avenida La Dehesa...",Lo Barnechea,ADVANCED,0.8946,1.82,1.0,4.84,4.188461,0
2,299d948a5fd2cf2a921894b9bd24b94e,-33.349922,-70.522841,2019-10-18 21:54:00+00:00,1,1456fc09701783b29f69e8f68c029879,1679091c5a880faf6fb5e6087eb1b2dc,124.740384,2019-10-18 21:00:00,18-21,...,-70.51242,"Lider, 12916, Avenida Las Condes, Lo Barnechea...",Lo Barnechea,ADVANCED,0.9363,1.67,1.0,4.84,2.506907,1
3,150bd9290b2125e67541098173e2cfb1,-33.430068,-70.571788,2019-10-19 16:00:00+00:00,0,dde31e647b83a2ab0c17040007596eaa,45fbc6d3e05ebd93369ce542e8f2322d,132.867181,2019-10-19 16:00:00,19-16,...,-70.53545,"Lider, 9140, Avenida Príncipe de Gales, La Rei...",La Reina,ADVANCED,0.8856,1.27,0.92,4.8,3.599359,1
4,844f746ff505c01c088de90bce067b94,-33.427389,-70.605832,2019-10-19 15:27:00+00:00,1,422af862430bd822fe24914e609b512c,08f38e0434442128fab5ead6217ca759,59.359225,2019-10-19 15:00:00,19-15,...,-70.603,"SalcoBrand, 1185, Avenida Los Leones, Providen...",Providencia,INTERMEDIATE,0.856,1.79,0.84,4.84,0.412939,0


In [7]:
# save the file
model_data[['order_id','total_minutes']].to_csv('processed/predictions.csv', sep=',',index=False)