In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Load seaborn's example `tips` dataset into a DataFrame and display it.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Number of missing values in each column.
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used below to turn the `day` labels into integer codes.
le = LabelEncoder()

In [5]:
# Replace the `day` labels with integer codes; fit_transform returns an array.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y), and it appears to number classes in sorted label order rather
# than weekday order — confirm that is acceptable for a feature column, or
# consider an explicit mapping / OrdinalEncoder with stated categories.
df['day'] = le.fit_transform(df['day'])

In [6]:
# Display the frame again: `day` now holds integer codes.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,2,Dinner,2
1,10.34,1.66,Male,No,2,Dinner,3
2,21.01,3.50,Male,No,2,Dinner,3
3,23.68,3.31,Male,No,2,Dinner,2
4,24.59,3.61,Female,No,2,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,1,Dinner,3
240,27.18,2.00,Female,Yes,1,Dinner,2
241,22.67,2.00,Male,Yes,1,Dinner,2
242,17.82,1.75,Male,No,1,Dinner,2


In [7]:
# How many rows fall into each `time` class.
df.loc[:, 'time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [8]:
# Treat `time` as ordinal for this example: Lunch is ranked below Dinner.
order = dict(Dinner=2, Lunch=1)

In [9]:
# Encode `time` with the `order` mapping. Values that are not keys of `order`
# (for example the codes left by a previous run of this cell) pass through
# unchanged, so re-running the cell no longer turns the whole column into NaN.
# NOTE(review): the later get_dummies output contains a `time_2` column, which
# suggests `time` is still a categorical column after this map — confirm. If a
# plain numeric ordinal column is intended, append `.astype(int)` here and
# update the downstream column lists that mention `time_2`.
df['time'] = df['time'].map(lambda value: order.get(value, value))

In [10]:
# Display the frame again: `time` now holds the codes from `order`.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,2,2,2
1,10.34,1.66,Male,No,2,2,3
2,21.01,3.50,Male,No,2,2,3
3,23.68,3.31,Male,No,2,2,2
4,24.59,3.61,Female,No,2,2,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,1,2,3
240,27.18,2.00,Female,Yes,1,2,2
241,22.67,2.00,Male,Yes,1,2,2
242,17.82,1.75,Male,No,1,2,2


In [11]:
# The remaining categorical columns have no natural order and only two or three classes each, so we one-hot encode them.

In [12]:
# One-hot encode the remaining non-numeric columns, dropping the first level
# of each one. After this step every column is numeric, so the data is ready
# for scaling.
df_cleaned = pd.get_dummies(data=df, drop_first=True)
df_cleaned

Unnamed: 0,total_bill,tip,day,size,sex_Female,smoker_No,time_2
0,16.99,1.01,2,2,1,1,1
1,10.34,1.66,2,3,0,1,1
2,21.01,3.50,2,3,0,1,1
3,23.68,3.31,2,2,0,1,1
4,24.59,3.61,2,4,1,1,1
...,...,...,...,...,...,...,...
239,29.03,5.92,1,3,0,1,1
240,27.18,2.00,1,2,1,0,1
241,22.67,2.00,1,2,0,0,1
242,17.82,1.75,1,2,0,1,1


In [13]:
# There are many techniques for scaling numerical data; the two most commonly used are StandardScaler and MinMaxScaler.

In [14]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for testing. `size` is the target; every other
# column of df_cleaned is a feature. The fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_cleaned.drop(columns='size'),
    df_cleaned['size'],
    test_size=0.2,
    random_state=51,
)

In [17]:
# Shapes of the four pieces, in the order: train X, train y, test X, test y.
print(*(split.shape for split in (x_train, y_train, x_test, y_test)))

(195, 6) (195,) (49, 6) (49,)


In [18]:
# Standardise each feature by removing the mean and dividing by the standard
# deviation learned during fit, so the fitted data ends up with mean 0 and
# std 1 (both options are the defaults; they are spelled out for clarity).
# To rescale into the 0-to-1 range instead, swap in MinMaxScaler; the
# fit/transform steps that follow stay the same:
# st = MinMaxScaler()
st = StandardScaler(with_mean=True, with_std=True)

In [19]:
# Learn the scaling statistics from the training split only, so nothing about
# the test split influences the transformation.
st.fit(x_train)

In [20]:
# Per-column means the scaler learned from x_train (one value per feature).
st.mean_

array([19.56738462,  3.02312821,  1.77435897,  0.35897436,  0.60512821,
        0.6974359 ])

In [21]:
# Apply the scaler fitted on the training split to both splits.
# transform() returns plain arrays, not DataFrames.
x_train_st, x_test_st = (st.transform(split) for split in (x_train, x_test))

In [22]:
# Put the scaled training array back into a DataFrame. Column and row labels
# are taken from x_train itself rather than from a hand-typed column list, so
# this cell keeps working if the upstream encoding changes the generated
# column names, and it builds a fresh frame instead of writing into the
# object returned by train_test_split.
x_train = pd.DataFrame(x_train_st, columns=x_train.columns, index=x_train.index)

In [23]:
# Display the training features after scaling.
x_train

Unnamed: 0,total_bill,tip,day,sex_Female,smoker_No,time_2
51,-1.065632,-0.315628,0.240204,1.336306,0.807801,0.658653
183,0.413809,2.593537,0.240204,-0.748331,-1.237929,0.658653
127,-0.579760,-0.763192,1.304745,1.336306,0.807801,-1.518251
221,-0.706109,0.340799,-1.888878,1.336306,-1.237929,-1.518251
79,-0.261588,-0.233575,1.304745,-0.748331,0.807801,-1.518251
...,...,...,...,...,...,...
197,2.704184,1.474627,1.304745,1.336306,-1.237929,-1.518251
201,-0.784216,-0.755733,1.304745,1.336306,-1.237929,-1.518251
224,-0.706109,-1.076487,-1.888878,-0.748331,-1.237929,-1.518251
229,0.293202,-0.106765,-0.824337,1.336306,-1.237929,0.658653


In [24]:
# Put the scaled test array back into a DataFrame. Column and row labels are
# taken from x_test itself rather than from a hand-typed column list, so this
# cell keeps working if the upstream encoding changes the generated column
# names, and it builds a fresh frame instead of writing into the object
# returned by train_test_split.
x_test = pd.DataFrame(x_test_st, columns=x_test.columns, index=x_test.index)

In [25]:
# Display the test features after scaling with the train-fitted scaler.
x_test

Unnamed: 0,total_bill,tip,day,sex_Female,smoker_No,time_2
39,1.344202,1.474627,-0.824337,-0.748331,0.807801,0.658653
190,-0.445369,-1.136162,0.240204,-0.748331,-1.237929,0.658653
203,-0.363816,-0.390222,1.304745,1.336306,-1.237929,-1.518251
23,2.280338,3.399152,-0.824337,-0.748331,0.807801,0.658653
35,0.516037,0.430312,-0.824337,-0.748331,0.807801,0.658653
171,-0.431586,0.102098,-0.824337,-0.748331,-1.237929,0.658653
97,-0.865769,-1.136162,-1.888878,-0.748331,-1.237929,0.658653
45,-0.146725,-0.017252,0.240204,-0.748331,0.807801,0.658653
115,-0.259291,0.355718,0.240204,1.336306,0.807801,0.658653
71,-0.286858,-0.017252,-0.824337,1.336306,0.807801,0.658653
