Why Colab? Free access to GPUs and TPUs

ML training, especially deep learning, is resource-intensive.

Colab provides free access to NVIDIA GPUs and Google TPUs, which speeds up training drastically.

Without Colab, you’d need an expensive local GPU or cloud setup.
Like Google Docs, multiple people can edit and run notebooks together.

This is great for team projects or sharing ML experiments.
Colab is fast, free, collaborative, and ready-to-use, making it ideal for ML prototyping, learning, and small-to-medium scale experiments.

In [2]:
# Import the libraries used throughout the notebook
import numpy as np
import pandas as pd

In [None]:
# Title / ## Subtitle / ### Section
# **bold text**
# *italic text*

In [None]:
## Ctrl + Enter: run the current cell
## Ctrl + F9: run all cells
## Ctrl + M, D: delete the cell
## Ctrl + M, M: change the cell to Markdown
## Ctrl + M, Y: change the cell to code
## Ctrl + M, B: add a new cell below



# Start lesson

## Start lesson

### Start lesson

In [3]:
# Load the tips dataset from the GitHub-hosted CSV and bind it to `df`.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/humayhasilli101/E-15---24-ML-/refs/heads/main/tips2.csv"
)

In [5]:
# Preview the first five rows to check the column names and sample values.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,0,16.99,1.01,Female,No,Sun,Dinner,2
1,1,10.34,1.66,Male,No,Sun,Dinner,3
2,2,21.01,3.5,Male,No,Sun,Dinner,3
3,3,23.68,3.31,Male,No,Sun,Dinner,2
4,4,24.59,,Female,No,Sun,Dinner,4


In [6]:
  #remove unnecessary column
# `errors="ignore"` keeps this cell re-runnable: once the column has been
# removed, running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [7]:
# Summarise the frame: number of rows, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         241 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [8]:
# Descriptive statistics of the numeric columns, transposed so that each
# dataset column becomes one row of the summary table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,241.0,3.490415,4.665933,1.0,2.0,2.92,3.71,50.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [9]:
# Number of distinct values per column (NaN is not counted); a small count
# suggests the column is categorical.
df.nunique(dropna=True)

Unnamed: 0,0
total_bill,229
tip,122
sex,2
smoker,2
day,4
time,2
size,6


In [11]:
# Frequency of each `day` label: a quick way to spot misspelled or
# duplicated category names before standardizing them.
df["day"].value_counts(dropna=True)

Unnamed: 0_level_0,count
day,Unnamed: 1_level_1
Sat,87
Sun,76
Thur,62
Fri,19


In [12]:
# Example of relabeling a category value with Series.replace.
# It is commented out, so it does not run.
#df['day'] = df['day'].replace('Sunday', 'Saturday')

Unnamed: 0,day
0,Sun
1,Sun
2,Sun
3,Sun
4,Sun
...,...
239,Sat
240,Sat
241,Sat
242,Sat


In [13]:
  # Cast the text columns to `category` and make sure `size` is int64,
  # using a single astype call with a column -> dtype mapping.
  df = df.astype(
      {
          "sex": "category",
          "smoker": "category",
          "day": "category",
          "time": "category",
          "size": "int64",
      }
  )

In [14]:
# Re-check the column dtypes and memory usage after the dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         241 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [15]:
# Per-person share of the bill: total_bill divided by the party size,
# stored in the new column "bill_for_1person".
df["bill_for_1person"] = df["total_bill"].div(df["size"])
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,bill_for_1person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,,Female,No,Sun,Dinner,4,6.1475


In [16]:
# Remove the "bill_for_1person" column again.
df = df.drop(columns="bill_for_1person")


In [17]:
# Show the first five rows to confirm which columns remain.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,,Female,No,Sun,Dinner,4


In [19]:
# Order the rows from the largest tip to the smallest; rows with a missing
# tip are placed at the end.
df.sort_values("tip", ascending=False, na_position="last")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
179,34.63,50.0,Male,Yes,Sun,Dinner,2
110,14.00,42.0,Male,No,Sat,Dinner,2
24,19.82,36.0,Male,No,Sat,Dinner,2
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
67,3.07,1.0,Female,Yes,Sat,Dinner,1
236,12.60,1.0,Male,Yes,Sat,Dinner,2
4,24.59,,Female,No,Sun,Dinner,4
25,17.81,,Male,No,Sat,Dinner,4


In [None]:
# Positional slice: rows at positions 0, 1 and 2, with every column.
df.iloc[:3]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [None]:
# Positional slicing with iloc[rows, columns]:
#   rows    -> the first three rows (positions 0-2)
#   columns -> the first two columns (total_bill and tip)
df.iloc[:3, :2]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5


In [None]:
# Keep only the rows whose tip is greater than 5
# (rows with a missing tip never satisfy the comparison).
df.loc[df["tip"] > 5]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
24,19.82,36.0,Male,No,Sat,Dinner,2
44,30.4,5.6,Male,No,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
85,34.83,5.17,Female,No,Thur,Lunch,4
88,24.71,5.85,Male,No,Thur,Lunch,2
110,14.0,42.0,Male,No,Sat,Dinner,2
116,29.93,5.07,Male,No,Sun,Dinner,4


In [None]:
# Average tip; missing values are skipped.
df["tip"].mean(skipna=True)

np.float64(3.4904149377593363)

In [None]:
# Largest tip; missing values are skipped.
df["tip"].max(skipna=True)

50.0

In [None]:
# Sample standard deviation of the tips; missing values are skipped.
df["tip"].std(skipna=True)

4.665933435777806

In [None]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0,0
total_bill,0
tip,3
sex,0
smoker,0
day,0
time,0
size,0


In [22]:
# Build `df_1`: a copy of `df` without any row that has a missing value.
# `df` itself is left unchanged.
df_1 = df.dropna(axis=0, how="any")

In [23]:
# Verify that no missing values remain in `df_1`.
df_1.isna().sum()

Unnamed: 0,0
total_bill,0
tip,0
sex,0
smoker,0
day,0
time,0
size,0


In [25]:
# Show the rows of the original frame that contain at least one missing value.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
4,24.59,,Female,No,Sun,Dinner,4
25,17.81,,Male,No,Sat,Dinner,4
79,17.29,,Male,No,Thur,Lunch,2


In [27]:
# String helpers available through the .str accessor:
#   str.upper() -> converts every character to uppercase
#   str.lower() -> converts every character to lowercase
#   str.title() -> capitalizes the first letter of each word
# The result is only displayed here; it is not assigned back to df_1.
df_1["smoker"].str.upper()

Unnamed: 0,smoker
0,NO
1,NO
2,NO
3,NO
5,NO
...,...
239,NO
240,YES
241,YES
242,NO


In [28]:
# Preview the first five rows of the NaN-free frame.
df_1.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4


In [29]:
# Pairwise Pearson correlation between the numeric columns only.
df_1.select_dtypes(include="number").corr(method="pearson")

Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.247719,0.602282
tip,0.247719,1.0,0.084956
size,0.602282,0.084956,1.0


# Outlier problem
# IQR (Interquartile Range) = Q3 - Q1
# Lower outlier threshold: Q1 - 1.5 * IQR
# Upper outlier threshold: Q3 + 1.5 * IQR


In [31]:
# Isolate the tip column and summarise its distribution.
# describe() on a Series already returns a Series, so no transpose is needed.
df_tip = df_1["tip"]
df_tip.describe()

Unnamed: 0,tip
count,241.0
mean,3.490415
std,4.665933
min,1.0
25%,2.0
50%,2.92
75%,3.71
max,50.0


In [32]:
# First and third quartiles of the tips, and the spread between them (IQR).
Q1 = df_tip.quantile(q=0.25)
Q3 = df_tip.quantile(q=0.75)
IQR = Q3 - Q1

In [33]:
# Outlier thresholds of the IQR rule: 1.5 * IQR below Q1 and above Q3.
below, above = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [34]:
# Display the lower outlier threshold (Q1 - 1.5 * IQR).
below

np.float64(-0.565)

In [35]:
# Rows flagged as outliers: tips above the upper threshold.
df_1.loc[df_1["tip"].gt(above)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
24,19.82,36.0,Male,No,Sat,Dinner,2
59,48.27,6.73,Male,No,Sat,Dinner,4
110,14.0,42.0,Male,No,Sat,Dinner,2
141,34.3,6.7,Male,No,Thur,Lunch,6
170,50.81,10.0,Male,Yes,Sat,Dinner,3
179,34.63,50.0,Male,Yes,Sun,Dinner,2
183,23.17,6.5,Male,Yes,Sun,Dinner,4
212,48.33,9.0,Male,No,Sat,Dinner,4
214,28.17,6.5,Female,Yes,Sat,Dinner,3


In [36]:
# Save the cleaned data: every row of df_1 that is NOT an upper outlier.
# - `<=` is the exact complement of the `tip > above` outlier test, so a tip
#   equal to the threshold is kept rather than silently dropped.
# - df_1 was built with dropna(), so no further NaN handling is needed here.
# - drop=True renumbers the rows 0..n-1 without adding the old labels as a
#   stray "index" column.
clear_df = df_1[df_1["tip"] <= above].reset_index(drop=True)

In [37]:
 # Distribution of the tips once the upper outliers have been removed.
 clear_df["tip"].describe()

Unnamed: 0,tip
count,231.0
mean,2.857922
std,1.144446
min,1.0
25%,2.0
50%,2.74
75%,3.5
max,6.0


In [38]:
# Combine two conditions with AND (&): tip below 5 and total bill above 20.
df_1.loc[df_1["tip"].lt(5) & df_1["total_bill"].gt(20)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
15,21.58,3.92,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
230,24.01,2.00,Male,Yes,Sat,Dinner,4
237,32.83,1.17,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [39]:
# Combine two conditions with OR (|): tip below 5 or total bill above 20.
df_1.loc[df_1["tip"].lt(5) | df_1["total_bill"].gt(20)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
