### This notebook predicts restaurant revenue with an XGBoost regressor (`XGBRegressor`)

Private Score - 1748462.28317

Public Score - 1689503.56491

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training set straight from the zipped CSV shipped with the
# competition input; the trailing expression previews the first rows.
train_df = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip')
train_df.head()

In [None]:
# IQR fences used to cap outliers
def remove_outlier(col):
  """Return the (lower, upper) 1.5*IQR fences for a 1-D numeric column.

  Despite the name, nothing is removed: the function only computes the
  bounds, and the caller decides how to treat values outside them.

  Parameters
  ----------
  col : pandas.Series or any 1-D sequence of numbers
      Values to analyse. The input is not modified.

  Returns
  -------
  tuple
      (lower_range, upper_range) where lower_range = Q1 - 1.5*IQR and
      upper_range = Q3 + 1.5*IQR.
  """
  # quantile() orders the data internally, so no explicit sort is needed.
  # Wrapping in pd.Series lets plain lists / arrays be passed as well.
  q1, q3 = pd.Series(col).quantile([0.25, 0.75])
  iqr = q3 - q1
  lower_range = q1 - (1.5 * iqr)
  upper_range = q3 + (1.5 * iqr)
  return lower_range, upper_range

In [None]:
# Cap the target at the IQR fences: values above the upper fence are set to
# the fence, then values below the lower fence are raised to it.
lowerrevenue, upperrevenue = remove_outlier(train_df['revenue'])
above_fence = train_df['revenue'] > upperrevenue
train_df['revenue'] = np.where(above_fence, upperrevenue, train_df['revenue'])
below_fence = train_df['revenue'] < lowerrevenue
train_df['revenue'] = np.where(below_fence, lowerrevenue, train_df['revenue'])
train_df.head()

In [None]:
# Age of each restaurant in whole days: (run date - 'Open Date').
# NOTE: `today` is the wall-clock date of this run, so the feature value
# shifts depending on when the notebook is executed.
today = pd.to_datetime('today')
train_df['days'] = (today - pd.to_datetime(train_df['Open Date'])).dt.days
train_df.head()

In [None]:
# Calendar month of the opening date
def date_features(df):
  """Add a 'month' column derived from 'Open Date' and return the frame.

  The column is written onto `df` itself (in place); the same object is
  returned for convenience.
  """
  opened = pd.DatetimeIndex(df['Open Date'])
  df['month'] = opened.month
  return df
# date_features writes 'month' onto the frame it receives and returns that
# same object, so this rebinding leaves train_df pointing at the updated frame.
train_df=date_features(train_df)

In [None]:
# Keep only the modelling columns ('City Group' through 'month', inclusive).
# .copy() makes final_df an independent frame: the following cells assign
# into it, and without the copy pandas emits SettingWithCopyWarning because
# final_df would be a slice of train_df.
final_df = train_df.loc[:, 'City Group':'month'].copy()
final_df.head()

In [None]:
# Replace the two categorical columns with integer codes.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# The same encoder object is re-fitted for each column in turn, so after
# this cell it holds the classes of the last column only.
for column in ('Type', 'City Group'):
  final_df[column] = label_encoder.fit_transform(final_df[column])
final_df.head()

# Training

In [None]:
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

In [None]:
# Separate the target from the predictors, then standardise the predictors.
from sklearn.preprocessing import StandardScaler
y_train = final_df['revenue']
x_train = final_df.drop('revenue', axis=1)
scaler = StandardScaler()
# fit_transform returns a bare array; wrap it back into a DataFrame
# (columns become positional integers).
x_train = pd.DataFrame(scaler.fit_transform(x_train))

In [None]:
# Hyper-parameter grid: 2 * 4 * 1 * 1 * 1 * 4 = 32 candidate settings.
params_xgb = dict(
    learning_rate=[.1, .05],
    colsample_bytree=[.3, .4, .5, .6],
    max_depth=[1],
    alpha=[3],
    subsample=[.5],
    n_estimators=[30, 70, 100, 200],
)

# 10-fold CV repeated twice -> 20 fits per candidate.
cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=50)
xgb_model = XGBRegressor()
xgb_regressor = GridSearchCV(
    xgb_model,
    params_xgb,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)
xgb_regressor.fit(x_train, y_train)

# Report the winning configuration and its (negated RMSE) score.
best = xgb_regressor.best_params_
for label, key in (
    ('Optimal lr', 'learning_rate'),
    ('Optimal colsample_bytree', 'colsample_bytree'),
    ('Optimal n_estimators', 'n_estimators'),
    ('max_depth', 'max_depth'),
):
  print(f'{label}: {best[key]}')
print(f'Best score: {xgb_regressor.best_score_}')

# Testing

In [None]:
# Load the test set and build the same date features as for training.
test_df = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip')
# Fold restaurant type 'MB' into 'DT' (presumably because 'MB' does not
# occur in the training data -- verify against train_df['Type']).
test_df.loc[test_df['Type']=='MB', 'Type'] = 'DT'
print(test_df['City Group'].unique(),test_df['City Group'].nunique())
print(test_df['Type'].unique(),test_df['Type'].nunique())
print(test_df['Type'].value_counts())
# Reuse the `today` timestamp bound when the training 'days' feature was
# built, so train and test ages are measured against the same reference
# instant instead of whenever this cell happens to run.
test_df['days'] = (today - pd.to_datetime(test_df['Open Date'])).dt.days
test_df=date_features(test_df)
test_df.head()

In [None]:
# Keep the same feature columns as the training matrix.
test_df = test_df.loc[:, 'City Group':'month'].copy()
# Import label encoder
from sklearn import preprocessing
# Encode into a separate copy so test_df keeps its raw categories and this
# cell can be re-run safely.
test_encoded = test_df.copy()
# Fit each encoder on the TRAINING column and only transform the test
# column. Fitting on the test data would derive the integer codes from the
# test categories, which agree with the training codes only when both sets
# contain exactly the same labels. transform() raises on a label the
# training data never contained rather than silently mis-coding it.
for column in ('Type', 'City Group'):
  label_encoder = preprocessing.LabelEncoder().fit(train_df[column])
  test_encoded[column] = label_encoder.transform(test_df[column])
# Apply the scaler fitted on the training features (no re-fitting).
x_test = scaler.transform(test_encoded)
x_test = pd.DataFrame(x_test)
x_test.head()

# Submission

In [None]:
# Assemble the submission: Ids from the sample file, predictions from the
# tuned model (GridSearchCV predicts with its best refitted estimator).
sub = pd.read_csv('../input/restaurant-revenue-prediction/sampleSubmission.csv')
pred = xgb_regressor.predict(x_test)
submission = pd.DataFrame({'Id': sub['Id'], 'Prediction': pred})
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('./submission_xgb_0000.csv',index=False)