In [1]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

Цели на проекта
- да установим кои са най-ефективните тренировки за изгаряне на калории
- да проверим дали посещаемостта зависи от деня от седмицата
- да разгледаме предпочитаните упражнения и посещаемостта по възрастови групи и пол
- да разгледаме финансовия оборот на различните обекти и да прогнозираме какъв ще бъде той за следващия период


Допускам, че:
- по-кратките тренировки са по-ефективни за изгаряне на калории, тоест имат повече изгорени калории за единица време
- мъжете предпочитат един вид тренировки, а жените – друг


Търся начин да:
- установя пола на потребителите, които не са го попълнили, според предпочитаните от тях тренировки. Моделът се тества с реални данни, оставени извън обучението (които той не е виждал). Изследвам кои параметри са по-показателни за пола от останалите.





# Data loading and preparation

In [2]:
# Data loading
# Read the four CSV inputs into DataFrames; the targets on the left are bound
# in the same order as the paths listed below.
users_data, gym_locations, checkin_checkout_history, subscription_plans = map(
    pd.read_csv,
    (
        'data/users_data.csv',
        'data/gym_locations_data.csv',
        'data/checkin_checkout_history_updated.csv',
        'data/subscription_plans.csv',
    ),
)

In [3]:
# Data types check
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that appends a stray "None" to every report.
for frame in (users_data, gym_locations, checkin_checkout_history, subscription_plans):
    frame.info()
    print()  # blank line between consecutive reports

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            5000 non-null   object
 1   first_name         5000 non-null   object
 2   last_name          5000 non-null   object
 3   age                5000 non-null   int64 
 4   gender             5000 non-null   object
 5   birthdate          5000 non-null   object
 6   sign_up_date       5000 non-null   object
 7   user_location      5000 non-null   object
 8   subscription_plan  5000 non-null   object
dtypes: int64(1), object(8)
memory usage: 351.7+ KB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gym_id      10 non-null     object
 1   location    10 non-null     object
 2   gym_type    10 non-null     object
 3   facilities  

In [4]:
# Merge data
# Start from the check-in/check-out history and left-join each lookup table on
# its key column, so every history row is kept even when a lookup has no match.
full_data = (
    checkin_checkout_history
    .merge(gym_locations, how='left', on='gym_id')
    .merge(users_data, how='left', on='user_id')
    .merge(subscription_plans, how='left', on='subscription_plan')
)

In [5]:
# Edit data
# Safe to re-run: the name columns consumed below are dropped at the end of
# this cell, so every step tolerates a frame that has already been processed.

# Parse the check-in / check-out stamps into datetimes
full_data['checkin_time'] = pd.to_datetime(full_data['checkin_time'])
full_data['checkout_time'] = pd.to_datetime(full_data['checkout_time'])

# Visit length in minutes (checkout minus checkin, seconds / 60)
full_data['duration_minutes'] = (
    (full_data['checkout_time'] - full_data['checkin_time']).dt.total_seconds() / 60
)

# Build full_name only while both name parts are still present
if {'first_name', 'last_name'}.issubset(full_data.columns):
    full_data['full_name'] = full_data['first_name'] + ' ' + full_data['last_name'].astype(str)

# Calendar features derived from the check-in timestamp
full_data['day_of_week'] = full_data['checkin_time'].dt.day_name()
full_data['week_number'] = full_data['checkin_time'].dt.isocalendar().week

# Remove columns that are not used further. Rebinding replaces inplace=True,
# and errors='ignore' keeps a re-run from raising on already-removed columns.
full_data = full_data.drop(
    columns=['birthdate', 'features', 'first_name', 'last_name'],
    errors='ignore',
)


#reorder columns

In [6]:
# Report column names, dtypes and non-null counts of the merged, edited frame
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   user_id            300000 non-null  object        
 1   gym_id             300000 non-null  object        
 2   checkin_time       300000 non-null  datetime64[ns]
 3   checkout_time      300000 non-null  datetime64[ns]
 4   workout_type       300000 non-null  object        
 5   calories_burned    300000 non-null  int64         
 6   location           300000 non-null  object        
 7   gym_type           300000 non-null  object        
 8   facilities         300000 non-null  object        
 9   age                300000 non-null  int64         
 10  gender             300000 non-null  object        
 11  sign_up_date       300000 non-null  object        
 12  user_location      300000 non-null  object        
 13  subscription_plan  300000 non-null  object  

In [7]:
# Preview the first three rows
full_data.head(3)

Unnamed: 0,user_id,gym_id,checkin_time,checkout_time,workout_type,calories_burned,location,gym_type,facilities,age,gender,sign_up_date,user_location,subscription_plan,price_per_month,duration_minutes,full_name,day_of_week,week_number
0,user_3291,gym_6,2023-09-10 15:55:00,2023-09-10 16:34:00,Weightlifting,462,Philadelphia,Budget,"Swimming Pool, Climbing Wall, Sauna",41,Female,2023-04-08,Atlanta,Pro,49.99,39.0,Michael Rodriguez,Sunday,36
1,user_1944,gym_2,2023-04-13 20:07:00,2023-04-13 22:43:00,Yoga,1278,Los Angeles,Budget,"Climbing Wall, Yoga Classes, Sauna",24,Female,2023-07-16,Las Vegas,Pro,49.99,156.0,Michael Garcia,Thursday,15
2,user_958,gym_7,2023-06-10 12:24:00,2023-06-10 13:49:00,Cardio,858,San Antonio,Premium,"Sauna, Basketball Court, Swimming Pool",37,Male,2021-09-15,Boston,Basic,19.99,85.0,Emily Rodriguez,Saturday,23


In [8]:
# NOTE(review): dead code -- this cell is entirely commented out and repeats the
# checkin_time / checkout_time / duration_minutes steps of the "Edit data" cell.
# # Add duration of workout in minutes to the dataset
# full_data['checkin_time'] = pd.to_datetime(full_data['checkin_time'])
# full_data['checkout_time'] = pd.to_datetime(full_data['checkout_time'])
# full_data['duration_minutes'] = (full_data['checkout_time'] - full_data['checkin_time']).dt.total_seconds() / 60

In [9]:
# NOTE(review): same call as the earlier full_data.info() cell; no active code
# has modified full_data since then, so the report is unchanged.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   user_id            300000 non-null  object        
 1   gym_id             300000 non-null  object        
 2   checkin_time       300000 non-null  datetime64[ns]
 3   checkout_time      300000 non-null  datetime64[ns]
 4   workout_type       300000 non-null  object        
 5   calories_burned    300000 non-null  int64         
 6   location           300000 non-null  object        
 7   gym_type           300000 non-null  object        
 8   facilities         300000 non-null  object        
 9   age                300000 non-null  int64         
 10  gender             300000 non-null  object        
 11  sign_up_date       300000 non-null  object        
 12  user_location      300000 non-null  object        
 13  subscription_plan  300000 non-null  object  

In [10]:
# Preview the first five rows (head() without an argument)
full_data.head()

Unnamed: 0,user_id,gym_id,checkin_time,checkout_time,workout_type,calories_burned,location,gym_type,facilities,age,gender,sign_up_date,user_location,subscription_plan,price_per_month,duration_minutes,full_name,day_of_week,week_number
0,user_3291,gym_6,2023-09-10 15:55:00,2023-09-10 16:34:00,Weightlifting,462,Philadelphia,Budget,"Swimming Pool, Climbing Wall, Sauna",41,Female,2023-04-08,Atlanta,Pro,49.99,39.0,Michael Rodriguez,Sunday,36
1,user_1944,gym_2,2023-04-13 20:07:00,2023-04-13 22:43:00,Yoga,1278,Los Angeles,Budget,"Climbing Wall, Yoga Classes, Sauna",24,Female,2023-07-16,Las Vegas,Pro,49.99,156.0,Michael Garcia,Thursday,15
2,user_958,gym_7,2023-06-10 12:24:00,2023-06-10 13:49:00,Cardio,858,San Antonio,Premium,"Sauna, Basketball Court, Swimming Pool",37,Male,2021-09-15,Boston,Basic,19.99,85.0,Emily Rodriguez,Saturday,23
3,user_811,gym_2,2023-05-23 17:11:00,2023-05-23 20:01:00,Yoga,1134,Los Angeles,Budget,"Climbing Wall, Yoga Classes, Sauna",34,Female,2023-04-25,Las Vegas,Student,9.99,170.0,David Miller,Tuesday,21
4,user_4923,gym_10,2023-02-21 06:20:00,2023-02-21 08:02:00,Weightlifting,1049,San Jose,Premium,"Swimming Pool, Sauna, CrossFit",32,Female,2022-12-29,Austin,Pro,49.99,102.0,Michael Johnson,Tuesday,8


In [11]:
# Reorder columns: one name per line, grouped by the table each column comes from.
new_column_order = [
    # user attributes
    'user_id',
    'full_name',
    'gender',
    'age',
    'user_location',
    'sign_up_date',
    # subscription
    'subscription_plan',
    'price_per_month',
    # gym attributes
    'gym_id',
    'location',
    'gym_type',
    'facilities',
    # visit record and the features derived from it
    'checkin_time',
    'checkout_time',
    'duration_minutes',
    'workout_type',
    'calories_burned',
    'day_of_week',
    'week_number',
]

# Select the columns in the new order and rebind full_data to the result
full_data = full_data[new_column_order]


In [12]:
# Display the reordered frame; the rendered output is truncated to the leading
# and trailing rows with an ellipsis row in between.
full_data

Unnamed: 0,user_id,full_name,gender,age,user_location,sign_up_date,subscription_plan,price_per_month,gym_id,location,gym_type,facilities,checkin_time,checkout_time,duration_minutes,workout_type,calories_burned,day_of_week,week_number
0,user_3291,Michael Rodriguez,Female,41,Atlanta,2023-04-08,Pro,49.99,gym_6,Philadelphia,Budget,"Swimming Pool, Climbing Wall, Sauna",2023-09-10 15:55:00,2023-09-10 16:34:00,39.0,Weightlifting,462,Sunday,36
1,user_1944,Michael Garcia,Female,24,Las Vegas,2023-07-16,Pro,49.99,gym_2,Los Angeles,Budget,"Climbing Wall, Yoga Classes, Sauna",2023-04-13 20:07:00,2023-04-13 22:43:00,156.0,Yoga,1278,Thursday,15
2,user_958,Emily Rodriguez,Male,37,Boston,2021-09-15,Basic,19.99,gym_7,San Antonio,Premium,"Sauna, Basketball Court, Swimming Pool",2023-06-10 12:24:00,2023-06-10 13:49:00,85.0,Cardio,858,Saturday,23
3,user_811,David Miller,Female,34,Las Vegas,2023-04-25,Student,9.99,gym_2,Los Angeles,Budget,"Climbing Wall, Yoga Classes, Sauna",2023-05-23 17:11:00,2023-05-23 20:01:00,170.0,Yoga,1134,Tuesday,21
4,user_4923,Michael Johnson,Female,32,Austin,2022-12-29,Pro,49.99,gym_10,San Jose,Premium,"Swimming Pool, Sauna, CrossFit",2023-02-21 06:20:00,2023-02-21 08:02:00,102.0,Weightlifting,1049,Tuesday,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,user_3995,Jessica Smith,Female,21,Seattle,2021-08-02,Pro,49.99,gym_3,Chicago,Budget,"Sauna, Climbing Wall, Swimming Pool",2023-08-06 17:25:00,2023-08-06 18:09:00,44.0,Pilates,288,Sunday,31
299996,user_206,Sarah Rodriguez,Female,19,Detroit,2022-03-01,Student,9.99,gym_9,Dallas,Premium,"Sauna, CrossFit, Yoga Classes",2023-06-27 13:14:00,2023-06-27 16:04:00,170.0,Weightlifting,1935,Tuesday,26
299997,user_4983,Michael Moore,Male,52,Denver,2022-04-13,Basic,19.99,gym_4,Houston,Premium,"Climbing Wall, Basketball Court, Swimming Pool",2023-04-08 14:41:00,2023-04-08 15:54:00,73.0,Cardio,1312,Saturday,14
299998,user_1028,Sarah Johnson,Male,39,Atlanta,2022-03-15,Pro,49.99,gym_10,San Jose,Premium,"Swimming Pool, Sauna, CrossFit",2023-03-05 06:07:00,2023-03-05 07:04:00,57.0,Cardio,787,Sunday,9
