#  Aim:
##  To find the factors affecting waiter tips and train a machine learning model to predict the tip amount.

In [113]:
import pandas as pd

# Load the tipping dataset from the local CSV file into a DataFrame.
# The same file can also be downloaded from:
#   https://www.kaggle.com/jsphyg/tipping/version/1?select=tips.csv
df = pd.read_csv(filepath_or_buffer='tips.csv')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [114]:
# Explanation of the features
# 1) total_bill: Total bill in dollars including taxes
# 2) tip: Tip given to waiters in dollars
# 3) sex: gender of the person paying the bill
# 4) smoker: whether the person smoked or not
# 5) day: day of the week
# 6) time: lunch or dinner
# 7) size: number of people at the table

## Data Analysis

In [115]:
import plotly.express as px

# Scatter plot of the tip against the total bill, with two more columns
# encoded visually:
#   - marker size  -> number of people at the table ('size')
#   - marker color -> day of the week ('day')
# trendline='ols' asks plotly for an ordinary-least-squares trendline.
fig1 = px.scatter(
    df,
    x='total_bill',
    y='tip',
    color='day',
    size='size',
    trendline='ols',
)
fig1
# TODO: record the observation drawn from this chart.

In [116]:
import plotly.express as px

# Scatter plot of the tip against the total bill, with two more columns
# encoded visually:
#   - marker size  -> number of people at the table ('size')
#   - marker color -> gender of the person paying the bill ('sex')
# trendline='ols' asks plotly for an ordinary-least-squares trendline.
fig2 = px.scatter(
    df,
    x='total_bill',
    y='tip',
    color='sex',
    size='size',
    trendline='ols',
)
fig2
# TODO: record the observation drawn from this chart.

In [117]:
import plotly.express as px

# Scatter plot of the tip against the total bill, with two more columns
# encoded visually:
#   - marker size  -> number of people at the table ('size')
#   - marker color -> time of the meal ('time')
# trendline='ols' asks plotly for an ordinary-least-squares trendline.
fig3 = px.scatter(
    df,
    x='total_bill',
    y='tip',
    color='time',
    size='size',
    trendline='ols',
)
fig3
# TODO: record the observation drawn from this chart.

In [118]:
import plotly.express as px

# Donut chart of the 'tip' column split by 'day', to see which days
# account for the most tips.
fig4 = px.pie(
    data_frame=df,
    names='day',
    values='tip',
    hole=0.5,  # size of the cut-out centre that turns the pie into a donut
)
fig4
# Observation: the largest shares of tips fall on
#   - Saturdays (35.6 %)
#   - Sundays   (33.8 %)
# NOTE(review): each slice appears to be the summed tip amount for that day,
# so a large share may reflect more bills rather than larger tips -- confirm
# with df.groupby('day')['tip'].mean().

In [119]:
import plotly.express as px

# Donut chart of the 'tip' column split by 'smoker', to compare smokers
# with non-smokers.
fig5 = px.pie(
    data_frame=df,
    names='smoker',
    values='tip',
    hole=0.5,
)
fig5
# Observation: non-smokers account for a larger share of the tips than smokers.
# NOTE(review): each slice appears to be the summed tip amount for that group,
# so this may reflect the number of bills rather than the tip per bill --
# confirm with df.groupby('smoker')['tip'].mean().

In [120]:
import plotly.express as px

# Donut chart of the 'tip' column split by 'time' (lunch vs. dinner).
# The figure gets its own name (fig6): the original cell reused `fig5`,
# which silently replaced the smoker chart created in the previous cell.
fig6 = px.pie(
    data_frame=df,
    names='time',
    values='tip',
    hole=0.5,
)
fig6
# Observation: dinner accounts for a larger share of the tips than lunch.
# NOTE(review): each slice appears to be the summed tip amount for that meal
# time, so this may reflect the number of bills rather than the tip per
# bill -- confirm with df.groupby('time')['tip'].mean().

# Data Pre-processing

In [149]:
# Re-read tips.csv and rebind `df` to the freshly loaded frame.
# Download link:
#   https://www.kaggle.com/jsphyg/tipping/version/1?select=tips.csv
df = pd.read_csv(filepath_or_buffer='tips.csv')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [150]:
# Distinct labels present in the 'sex' column.
pd.unique(df['sex'])

array(['Female', 'Male'], dtype=object)

In [151]:
# Distinct labels present in the 'smoker' column.
pd.unique(df['smoker'])

array(['No', 'Yes'], dtype=object)

In [152]:
# Distinct labels present in the 'day' column.
pd.unique(df['day'])

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [153]:
# Distinct labels present in the 'time' column.
pd.unique(df['time'])

array(['Dinner', 'Lunch'], dtype=object)

In [154]:
# Transform the categorical columns into numerical codes so they can be fed
# to the regression model. The codes are the same as in the original cell.
# NOTE(review): 'day' receives arbitrary ordinal codes (Sun=1 ... Fri=4),
# which a linear model treats as an ordering; one-hot encoding would avoid
# that but would change the feature layout used by the later cells.
category_codes = {
    'sex': {'Female': 1, 'Male': 0},
    'smoker': {'No': 1, 'Yes': 0},
    'day': {'Sun': 1, 'Sat': 2, 'Thur': 3, 'Fri': 4},
    'time': {'Dinner': 1, 'Lunch': 0},
}
for column, codes in category_codes.items():
    # Series.map() turns every value that is not a key of the mapping into
    # NaN, so mapping an already-encoded column (e.g. when this cell is run
    # a second time) would wipe it out. Only encode columns that still hold
    # the raw labels.
    if df[column].isin(list(codes)).all():
        df[column] = df[column].map(codes)
    # Fail loudly on unexpected categories instead of silently carrying
    # unmapped or NaN values into the model.
    if not df[column].isin(list(codes.values())).all():
        raise ValueError(f"column {column!r} contains values outside {sorted(codes)}")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,1,1,1,1,2
1,10.34,1.66,0,1,1,1,3
2,21.01,3.50,0,1,1,1,3
3,23.68,3.31,0,1,1,1,2
4,24.59,3.61,1,1,1,1,4
...,...,...,...,...,...,...,...
239,29.03,5.92,0,1,2,1,3
240,27.18,2.00,1,0,2,1,2
241,22.67,2.00,0,0,2,1,2
242,17.82,1.75,0,1,2,1,2


In [165]:
# Independent variables (features): every column of df except the target 'tip'.
import numpy as np

X = df.drop('tip', axis=1)
X

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,1,1,1,1,2
1,10.34,0,1,1,1,3
2,21.01,0,1,1,1,3
3,23.68,0,1,1,1,2
4,24.59,1,1,1,1,4
...,...,...,...,...,...,...
239,29.03,0,1,2,1,3
240,27.18,1,0,2,1,2
241,22.67,0,0,2,1,2
242,17.82,0,1,2,1,2


In [164]:
# Dependent variable (target): the 'tip' column.
y = df.loc[:, 'tip']
y

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [168]:
# Split features and target into a training part and a held-out test part.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=355,  # fixed seed so the split is the same on every run
    test_size=0.2,     # 20% of the rows are held out for testing
)


# Training & Evaluating the Linear Regression (ML) model

In [169]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting are chained; the trailing `model`
# displays the fitted estimator as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [172]:
# Evaluate the model on the held-out test split.
# For a regressor, score() returns the coefficient of determination (R^2) of
# the predictions, not a classification accuracy.
r2_test = model.score(X_test, y_test)
r2_test

0.3785318092628768

In [175]:
# First five rows of the test features.
X_test.head(n=5)

Unnamed: 0,total_bill,sex,smoker,day,time,size
96,27.28,0,0,4,1,2
132,11.17,1,1,3,0,2
121,13.42,1,1,3,0,2
143,27.05,1,1,3,0,6
107,25.21,0,0,2,1,2


In [178]:
# Test the model on one hand-entered observation. The values follow the
# feature order used for training:
#   total_bill, sex, smoker, day, time, size
features = np.array([[27.28, 0, 0, 4, 1, 2]])
# The model was fitted on a DataFrame, so the input is wrapped in a DataFrame
# carrying the training column names. Predicting on a bare array drops the
# column association and can trigger scikit-learn's feature-name warning.
model.predict(pd.DataFrame(features, columns=X_train.columns))

array([3.62570096])

In [179]:
# Actual tips for the first five test rows.
# The predicted 3.62 is close to the actual 4.00.
y_test.head(n=5)

96     4.00
132    1.50
121    1.68
143    5.00
107    4.29
Name: tip, dtype: float64