In [1]:
# standardized modules
import seaborn as sns
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [10]:
tips = sns.load_dataset('tips')

In [11]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
tips.drop_duplicates

<bound method DataFrame.drop_duplicates of      total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]>

In [13]:
tips.dropna()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [14]:
tips_dummy = pd.get_dummies(tips[['sex','smoker','day']])

In [15]:
tips_dummy.head()

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun
0,0,1,0,1,0,0,0,1
1,1,0,0,1,0,0,0,1
2,1,0,0,1,0,0,0,1
3,1,0,0,1,0,0,0,1
4,0,1,0,1,0,0,0,1


In [16]:
tips = pd.concat([tips, tips_dummy], axis=1)

In [17]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun
0,16.99,1.01,Female,No,Sun,Dinner,2,0,1,0,1,0,0,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,1,0,0,1,0,0,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,1,0,0,1,0,0,0,1


In [19]:
tips = tips.drop(columns=['sex','smoker','day'])

In [20]:
tips.head(3)

Unnamed: 0,total_bill,tip,time,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun
0,16.99,1.01,Dinner,2,0,1,0,1,0,0,0,1
1,10.34,1.66,Dinner,3,1,0,0,1,0,0,0,1
2,21.01,3.5,Dinner,3,1,0,0,1,0,0,0,1


In [21]:
#splitting data into train and test data sets. We will not look at test in depth
# train will be split again into two datasets, train and validate
train, test = train_test_split(tips, test_size = .2, random_state=7)

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 66 to 175
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  195 non-null    float64 
 1   tip         195 non-null    float64 
 2   time        195 non-null    category
 3   size        195 non-null    int64   
 4   sex_Male    195 non-null    uint8   
 5   sex_Female  195 non-null    uint8   
 6   smoker_Yes  195 non-null    uint8   
 7   smoker_No   195 non-null    uint8   
 8   day_Thur    195 non-null    uint8   
 9   day_Fri     195 non-null    uint8   
 10  day_Sat     195 non-null    uint8   
 11  day_Sun     195 non-null    uint8   
dtypes: category(1), float64(2), int64(1), uint8(8)
memory usage: 7.9 KB


In [23]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49 entries, 80 to 158
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  49 non-null     float64 
 1   tip         49 non-null     float64 
 2   time        49 non-null     category
 3   size        49 non-null     int64   
 4   sex_Male    49 non-null     uint8   
 5   sex_Female  49 non-null     uint8   
 6   smoker_Yes  49 non-null     uint8   
 7   smoker_No   49 non-null     uint8   
 8   day_Thur    49 non-null     uint8   
 9   day_Fri     49 non-null     uint8   
 10  day_Sat     49 non-null     uint8   
 11  day_Sun     49 non-null     uint8   
dtypes: category(1), float64(2), int64(1), uint8(8)
memory usage: 2.1 KB


In [24]:
# train data set is split into train and validate datasets
train, validate = train_test_split(train, test_size=.3, random_state=7)

In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 113 to 176
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  136 non-null    float64 
 1   tip         136 non-null    float64 
 2   time        136 non-null    category
 3   size        136 non-null    int64   
 4   sex_Male    136 non-null    uint8   
 5   sex_Female  136 non-null    uint8   
 6   smoker_Yes  136 non-null    uint8   
 7   smoker_No   136 non-null    uint8   
 8   day_Thur    136 non-null    uint8   
 9   day_Fri     136 non-null    uint8   
 10  day_Sat     136 non-null    uint8   
 11  day_Sun     136 non-null    uint8   
dtypes: category(1), float64(2), int64(1), uint8(8)
memory usage: 5.6 KB


In [26]:
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59 entries, 67 to 91
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  59 non-null     float64 
 1   tip         59 non-null     float64 
 2   time        59 non-null     category
 3   size        59 non-null     int64   
 4   sex_Male    59 non-null     uint8   
 5   sex_Female  59 non-null     uint8   
 6   smoker_Yes  59 non-null     uint8   
 7   smoker_No   59 non-null     uint8   
 8   day_Thur    59 non-null     uint8   
 9   day_Fri     59 non-null     uint8   
 10  day_Sat     59 non-null     uint8   
 11  day_Sun     59 non-null     uint8   
dtypes: category(1), float64(2), int64(1), uint8(8)
memory usage: 2.5 KB


In [27]:
# verifying the shape of the dataframe
train.shape, validate.shape, test.shape

((136, 12), (59, 12), (49, 12))

In [28]:
''' splitting data into two groups further. One group is going to be used to guess 
the y outcome. The other group will know the y outcome and will be used to verify
the model we create '''
x_train = train.drop(columns=['tip'])
y_train = train.tip

x_validate = validate.drop(columns=['tip'])
y_validate = validate.tip

x_test = test.drop(columns=['tip'])
y_test = test.tip