# Predicting Food delivery time

## Load datasets & Cleaning

In [63]:
import pandas as pd
from sklearn.linear_model import LinearRegression

#Loading datasets
Train_file = "Data_Train.xlsx"
Submission_file = "Sample_Submission.xlsx"

#Changing data_types
data_types = {
    "Average_Cost":"float64",
    "Minimum_Order":"int64",
    "Delivery_Time":"int64",
    "Restaurant": "int64"
}

#replacing values for changing data_types
def f_replace_rupee(x):
    replace_items = ["₹", ","]
    val = x
    for item in replace_items:
        val = val.replace(item, "")
    return val

def f_replace_delivery(x):
    return x.replace(" minutes", "")

def f_replace_restaurant(x):
    return x.replace("ID_", "")

#Applying converter functions above to columns : Dictionary {Key : Values}
converters_func = {
    "Average_Cost": f_replace_rupee,
    "Minimum_Order": f_replace_rupee,
    "Delivery_Time": f_replace_delivery,
    "Restaurant": f_replace_restaurant
}

#Reading file with above amendments 
df_train = pd.read_excel(Train_file, 
                         converters = converters_func, 
                         keep_default_na = False, 
                         na_values=["for", "NEW", "-", "Opening Soon", "Temporarily Closed"])

df_train = df_train.astype(data_types)

#Fill NA

df_train['Average_Cost'].fillna(df_train['Average_Cost'].mean(),inplace=True)
df_train['Minimum_Order'].fillna(df_train['Minimum_Order'].mean(), inplace=True)
df_train['Rating'].fillna(df_train['Rating'].mean(), inplace=True)
df_train['Votes'].fillna(df_train['Votes'].mean(), inplace=True)
df_train['Reviews'].fillna(df_train['Reviews'].mean(),inplace=True)

df_train.sort_values('Average_Cost', ascending=True, inplace=True, na_position='first')
df_train.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
4162,3167,"Sandhurst Road, Mumbai CST Area","Sandwich, Street Food",50.0,50,3.61346,244.544457,123.247893,30
10616,1025,"Sector 3, Marathalli","Bengali, Fast Food",50.0,50,3.4,7.0,5.0,30
800,7271,"Chatta Bazaar, Malakpet, Hyderabad",South Indian,50.0,50,3.1,98.0,10.0,30
802,1769,"Babarpur, New Delhi, Delhi",Street Food,50.0,50,3.5,23.0,5.0,30
5781,629,"Chatta Bazaar, Malakpet, Hyderabad","Street Food, Fast Food, Beverages, Desserts, M...",50.0,50,3.61346,244.544457,123.247893,30


In [75]:

#Loading datasets
Test_file = "Data_Test.xlsx"

#Changing data_types

test_data_types = {
    "Average_Cost":"float64",
    "Minimum_Order" :"int64",
    "Restaurant" : "int64"
}

#replacing values for changing data_types

def f_test_replace_rupee(x):
    test_replace_items = ["₹",","]
    val = x
    for item in test_replace_items:
       val = val.replace(item,"")
    return val


def f_test_restaurant(x):
    return x.replace('ID_',"")


#Applying conventer

test_converter_func = {
    'Average_Cost' :f_test_replace_rupee,
    'Minimum_Order':f_test_replace_rupee,
    'Restaurant':f_test_restaurant
}

#Reading fil with above amendments

df_test = pd.read_excel(Test_file, 
                        converters=test_converter_func,
                        keep_default_na = False,
                        na_values = ["for","NEW","-", "Opening Soon", "Temporarily Closed"]
                       )

df_test = df_test.astype(test_data_types)


df_test["Average_Cost"].fillna(df_test["Average_Cost"].mean(), inplace=True)
df_test["Minimum_Order"].fillna(df_test["Minimum_Order"].mean(), inplace=True)
df_test["Rating"].fillna(df_test["Rating"].mean(),inplace=True)
df_test["Votes"].fillna(df_test["Votes"].mean(),inplace=True)
df_test["Reviews"].fillna(df_test["Reviews"].mean(),inplace=True)

df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Data columns (total 8 columns):
Restaurant       2774 non-null int64
Location         2774 non-null object
Cuisines         2774 non-null object
Average_Cost     2774 non-null float64
Minimum_Order    2774 non-null int64
Rating           2774 non-null float64
Votes            2774 non-null float64
Reviews          2774 non-null float64
dtypes: float64(4), int64(2), object(2)
memory usage: 173.5+ KB


Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,2842,"Mico Layout, Stage 2, BTM Layout,Bangalore","North Indian, Chinese, Assamese",350.0,50,4.2,361.0,225.0
1,730,"Mico Layout, Stage 2, BTM Layout,Bangalore","Biryani, Kebab",100.0,50,3.600044,226.928315,111.410821
2,4620,"Sector 1, Noida",Fast Food,100.0,50,3.6,36.0,16.0
3,5470,"Babarpur, New Delhi, Delhi","Mithai, North Indian, Chinese, Fast Food, Sout...",200.0,50,3.6,66.0,33.0
4,3249,"Sector 1, Noida","Chinese, Fast Food",150.0,50,2.9,38.0,14.0


## Combining datasets for one-hot encoding

In [76]:
df_train['check'] = 'tr'
df_test['check'] = 'ts'

In [78]:
df_combined = pd.concat([df_train, df_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [81]:
df_location_dummies = pd.get_dummies(df_combined['Location'], drop_first=True)

In [83]:
df_combined['Cuisines'] = df_combined['Cuisines'] + ','
df_combined['Cuisines'] = df_combined['Cuisines'].str.split(',', n = 1, expand = True)[0]

df_cuisines_dummies = pd.get_dummies(df_combined['Cuisines'])

In [85]:
df_combined = pd.concat([df_combined, df_location_dummies, df_cuisines_dummies], axis=1)

In [92]:
df_combined.drop(['Location', 'Cuisines'], axis = 'columns', inplace=True)

In [93]:
df_combined.head()

Unnamed: 0,Average_Cost,Delivery_Time,Minimum_Order,Rating,Restaurant,Reviews,Votes,check,"Babarpur, New Delhi, Delhi","Chandni Chowk, Kolkata",...,Steak,Street Food,Sushi,Tamil,Tea,Thai,Tibetan,Turkish,Vietnamese,Wraps
4162,50.0,30.0,50,3.61346,3167,123.247893,244.544457,tr,0,0,...,0,0,0,0,0,0,0,0,0,0
10616,50.0,30.0,50,3.4,1025,5.0,7.0,tr,0,0,...,0,0,0,0,0,0,0,0,0,0
800,50.0,30.0,50,3.1,7271,10.0,98.0,tr,0,0,...,0,0,0,0,0,0,0,0,0,0
802,50.0,30.0,50,3.5,1769,5.0,23.0,tr,1,0,...,0,1,0,0,0,0,0,0,0,0
5781,50.0,30.0,50,3.61346,629,123.247893,244.544457,tr,0,0,...,0,1,0,0,0,0,0,0,0,0


In [97]:
df_train = df_combined[df_combined['check']=='tr']
df_test = df_combined[df_combined['check']=='ts']

df_train.drop(['check'], axis=1, inplace=True)
df_test.drop(['check', 'Delivery_Time'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [98]:
df_test.head()

Unnamed: 0,Average_Cost,Minimum_Order,Rating,Restaurant,Reviews,Votes,"Babarpur, New Delhi, Delhi","Chandni Chowk, Kolkata","Chatta Bazaar, Malakpet, Hyderabad","D-Block, Sector 63, Noida",...,Steak,Street Food,Sushi,Tamil,Tea,Thai,Tibetan,Turkish,Vietnamese,Wraps
0,350.0,50,4.2,2842,225.0,361.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,100.0,50,3.600044,730,111.410821,226.928315,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,100.0,50,3.6,4620,16.0,36.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200.0,50,3.6,5470,33.0,66.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,150.0,50,2.9,3249,14.0,38.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Fitting Model

In [101]:
X = df_train.drop(["Delivery_Time"], axis=1)
y = df_train['Delivery_Time']

from sklearn.model_selection import train_test_split
x_train, x_test , y_train, y_test = train_test_split(X, y, test_size=0.2)

In [103]:
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [104]:
model.score(x_test, y_test)

0.15476201019616398

In [105]:
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [114]:
df_delivery_output = pd.Series(model.predict(df_test)).astype('int64')

In [115]:
df_delivery_output.head()

0    37
1    35
2    35
3    31
4    35
dtype: int64

In [116]:
df_delivery_output = df_delivery_output.astype('str') + " minutes"
df_delivery_output.head()

0    37 minutes
1    35 minutes
2    35 minutes
3    31 minutes
4    35 minutes
dtype: object

In [117]:
df_delivery_output.to_excel('my_submission.xlsx')

In [119]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [120]:
svr.score(x_test, y_test)

-0.19924100772279529

In [None]:
from sklearn.linear_model import Ridge
