# Dependencies

In [1]:
!pip install -q kaggle

In [2]:
import pandas as pd
import zipfile
import re
from google.colab import files

# Data Preparation

Using dataset: [Gym Exercise Dataset | Kaggle](https://www.kaggle.com/datasets/niharika41298/gym-exercise-data)

In [3]:
# Upload Kaggle API token
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d niharika41298/gym-exercise-data

# Extracting files from downloaded archive
local_zip = 'gym-exercise-data.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/content')
zip_ref.close()

# Reading data
data = pd.read_csv('megaGymDataset.csv')
data.head()

Saving kaggle.json to kaggle.json
Downloading gym-exercise-data.zip to /content
100% 120k/120k [00:00<00:00, 313kB/s]
100% 120k/120k [00:00<00:00, 313kB/s]


Unnamed: 0.1,Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating,RatingDesc
0,0,Partner plank band row,The partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate,0.0,
1,1,Banded crunch isometric hold,The banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate,,
2,2,FYR Banded Plank Jack,The banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate,,
3,3,Banded crunch,The banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate,,
4,4,Crunch,The crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate,,


# Exploratory Data Analysis

In [4]:
# Summarise column names, non-null counts and dtypes of the freshly loaded data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2918 entries, 0 to 2917
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2918 non-null   int64  
 1   Title       2918 non-null   object 
 2   Desc        1368 non-null   object 
 3   Type        2918 non-null   object 
 4   BodyPart    2918 non-null   object 
 5   Equipment   2918 non-null   object 
 6   Level       2918 non-null   object 
 7   Rating      1031 non-null   float64
 8   RatingDesc  862 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 205.3+ KB


The duplicate index column (`Unnamed: 0`) must be dropped, along with the irrelevant features `Rating` and `RatingDesc`.

In [5]:
# Number of missing values in each column
data.isna().sum(axis=0)

Unnamed: 0       0
Title            0
Desc          1550
Type             0
BodyPart         0
Equipment        0
Level            0
Rating        1887
RatingDesc    2056
dtype: int64

Rows with null descriptions are unusable and must be dropped.

In [6]:
# Frequency of each exercise Type, most common first
data.value_counts(subset='Type')

Type
Strength                 2545
Stretching                147
Plyometrics                97
Powerlifting               37
Cardio                     35
Olympic Weightlifting      35
Strongman                  22
dtype: int64

In [7]:
# Frequency of each targeted BodyPart, most common first
data.value_counts(subset='BodyPart')

BodyPart
Abdominals     662
Quadriceps     646
Shoulders      340
Chest          262
Biceps         168
Triceps        151
Lats           124
Hamstrings     121
Middle Back    118
Lower Back      97
Glutes          81
Calves          47
Forearms        31
Traps           24
Abductors       21
Adductors       17
Neck             8
dtype: int64

In [8]:
# Frequency of each Equipment value, most common first
data.value_counts(subset='Equipment')

Equipment
Body Only        1078
Dumbbell          516
Barbell           282
Other             254
Cable             226
Machine           175
Kettlebells       149
Bands             100
Medicine Ball      38
Exercise Ball      35
None               32
E-Z Curl Bar       22
Foam Roll          11
dtype: int64

The `Body Only` and `None` equipment values mean the same thing and should be merged into a single category.

In [9]:
# Frequency of each difficulty Level, most common first
data.value_counts(subset='Level')

Level
Intermediate    2446
Beginner         459
Expert            13
dtype: int64

In [10]:
# Count rows that repeat an earlier row entirely, and rows that only repeat an earlier Title
n_dup_rows = data.duplicated().sum()
n_dup_titles = data.duplicated(subset='Title').sum()
print('Duplicate rows:', n_dup_rows)
print('Duplicate title:', n_dup_titles)

Duplicate rows: 0
Duplicate title: 9


Rows with duplicate titles must be dropped so that the same exercise is not recommended more than once.

In [11]:
# Show every row whose Title already appeared in an earlier row
is_repeat_title = data.duplicated(subset='Title')
data[is_repeat_title]

Unnamed: 0.1,Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating,RatingDesc
97,97,Decline bar press sit-up,The decline bar press sit-up is a weighted cor...,Strength,Abdominals,Barbell,Intermediate,8.5,Average
645,645,Exercise Ball Cable Crunch - Gethin Variation,The exercise ball crunch is a popular gym exer...,Strength,Abdominals,Cable,Intermediate,,
939,939,Band-suspended kettlebell bench press,The band-suspended kettlebell bench press is a...,Strength,Chest,Bands,Intermediate,,
958,958,Band-suspended kettlebell bench press,The band-suspended kettlebell bench press is a...,Strength,Chest,Bands,Intermediate,,
1709,1709,Seated Cable Rows,The cable seated row is a popular exercise to ...,Strength,Middle Back,Cable,Intermediate,8.8,Average
1730,1730,Seated Cable Rows,The cable seated row is a popular exercise to ...,Strength,Middle Back,Cable,Intermediate,8.8,Average
2004,2004,Dumbbell step-up,The dumbbell step-up is a great exercise for b...,Strength,Quadriceps,Dumbbell,Intermediate,8.2,Average
2655,2655,Arnold press,Named after the iconic bodybuilder and movie s...,Strength,Shoulders,Dumbbell,Intermediate,8.9,Average
2658,2658,Seated rear delt fly,The seated rear delt fly is an upper-body exer...,Strength,Shoulders,Dumbbell,Intermediate,8.4,Average


# Data Cleaning

In [12]:
# Remove the redundant index column and the rating fields, then discard
# rows that have no description
data = (
    data
    .drop(columns=['Unnamed: 0', 'Rating', 'RatingDesc'])
    .dropna(subset=['Desc'])
)

# Fold the 'None' equipment value into 'Body Only'
data['Equipment'] = data['Equipment'].replace('None', 'Body Only')

# Normalise titles: every run of non-alphanumeric characters becomes one space,
# then the text is lower-cased and trimmed
non_alnum = re.compile('[^A-Za-z0-9]+')
data['Title'] = data['Title'].map(lambda title: non_alnum.sub(' ', title).lower().strip())

# Keep only the first row for each normalised title
data = data.drop_duplicates(subset='Title')

In [13]:
# Summarise the remaining columns, non-null counts and dtypes after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1329 entries, 0 to 2916
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      1329 non-null   object
 1   Desc       1329 non-null   object
 2   Type       1329 non-null   object
 3   BodyPart   1329 non-null   object
 4   Equipment  1329 non-null   object
 5   Level      1329 non-null   object
dtypes: object(6)
memory usage: 72.7+ KB


In [14]:
# Number of missing values left in each column
data.isna().sum(axis=0)

Title        0
Desc         0
Type         0
BodyPart     0
Equipment    0
Level        0
dtype: int64

In [15]:
# Frequency of each exercise Type in the cleaned data
data.value_counts(subset='Type')

Type
Strength                 1197
Plyometrics                53
Stretching                 44
Cardio                     16
Powerlifting               11
Olympic Weightlifting       5
Strongman                   3
dtype: int64

In [16]:
# Frequency of each targeted BodyPart in the cleaned data
data.value_counts(subset='BodyPart')

BodyPart
Abdominals     293
Quadriceps     235
Shoulders      170
Chest          144
Biceps         100
Triceps         85
Lats            68
Hamstrings      49
Middle Back     45
Lower Back      40
Glutes          28
Calves          25
Forearms        16
Traps           16
Abductors        8
Adductors        7
dtype: int64

In [17]:
# Frequency of each Equipment value in the cleaned data
data.value_counts(subset='Equipment')

Equipment
Body Only        404
Dumbbell         240
Barbell          157
Cable            146
Machine          114
Other             97
Kettlebells       50
Bands             47
Exercise Ball     28
Medicine Ball     25
E-Z Curl Bar      13
Foam Roll          8
dtype: int64

In [18]:
# Frequency of each difficulty Level in the cleaned data
data.value_counts(subset='Level')

Level
Intermediate    1212
Beginner         107
Expert            10
dtype: int64

In [19]:
# Re-check duplicates after cleaning: whole-row repeats and repeated titles
n_dup_rows = data.duplicated().sum()
n_dup_titles = data.duplicated(subset='Title').sum()
print('Duplicate rows:', n_dup_rows)
print('Duplicate title:', n_dup_titles)

Duplicate rows: 0
Duplicate title: 0


## Restoring the original (pre-normalization) titles

In [20]:
# Load a second, untouched copy of the CSV so the original titles are available again
old_data = pd.read_csv('megaGymDataset.csv')
old_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating,RatingDesc
0,0,Partner plank band row,The partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate,0.0,
1,1,Banded crunch isometric hold,The banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate,,
2,2,FYR Banded Plank Jack,The banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate,,
3,3,Banded crunch,The banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate,,
4,4,Crunch,The crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate,,


In [21]:
# Index labels that exist in the reloaded frame but are no longer in `data`
dropped_idx = set(old_data.index).difference(data.index)
print(dropped_idx)

{6, 12, 19, 20, 22, 24, 30, 31, 32, 33, 34, 35, 36, 37, 38, 43, 44, 45, 46, 47, 48, 49, 50, 56, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 87, 91, 97, 105, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 156, 159, 161, 162, 165, 168, 176, 178, 179, 183, 194, 202, 204, 208, 212, 217, 219, 220, 222, 224, 225, 226, 228, 229, 231, 233, 242, 260, 261, 262, 263, 264, 287, 298, 310, 311, 320, 323, 329, 330, 333, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 388, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 414, 421, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,

In [22]:
# Bring the reloaded frame in line with `data`: drop the same three columns,
# then drop every row whose index label is in dropped_idx
old_data = (
    old_data
    .drop(columns=['Unnamed: 0', 'Rating', 'RatingDesc'])
    .drop(index=dropped_idx)
)
old_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1329 entries, 0 to 2916
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      1329 non-null   object
 1   Desc       1329 non-null   object
 2   Type       1329 non-null   object
 3   BodyPart   1329 non-null   object
 4   Equipment  1329 non-null   object
 5   Level      1329 non-null   object
dtypes: object(6)
memory usage: 72.7+ KB


In [23]:
# Swap the normalised titles for the originals (values are matched on index label),
# then renumber the rows from 0 and discard the old labels
data = (
    data
    .assign(Title=old_data['Title'])
    .reset_index(drop=True)
)
data

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level
0,Partner plank band row,The partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate
1,Banded crunch isometric hold,The banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate
2,FYR Banded Plank Jack,The banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate
3,Banded crunch,The banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate
4,Crunch,The crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate
...,...,...,...,...,...,...
1324,Bench dip,The bench dip is a highly effective exercise f...,Strength,Triceps,Body Only,Intermediate
1325,Decline EZ-bar skullcrusher,The decline EZ-bar skullcrusher is a popular e...,Strength,Triceps,E-Z Curl Bar,Intermediate
1326,EZ-Bar Skullcrusher,The EZ-bar skullcrusher is a popular exercise ...,Strength,Triceps,E-Z Curl Bar,Intermediate
1327,EZ-Bar Skullcrusher - Gethin Variation,The EZ-bar skullcrusher is a popular exercise ...,Strength,Triceps,E-Z Curl Bar,Intermediate


The data is now ready to use.

In [24]:
# Write the cleaned data to CSV. to_csv writes the header row and the row index
# by default, so the index becomes an unnamed first column in the file.
# NOTE(review): readers will see that column as 'Unnamed: 0' unless they load
# the file with index_col=0 — confirm downstream consumers expect this.
data.to_csv('workout_dataset.csv')