# Libraries and versions

In [1]:
python_version = !python -V #version 3.8.8
import pandas as pd #version 1.2.4
import numpy as np #version 1.20.1
import seaborn as sns #version 0.11.1
import matplotlib as plt #version 3.3.4
import jebas
from random import choice

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
print(f'Python version - {python_version[0][7:]}')
print(f'Pandas version - {pd.__version__}')
print(f'Numpy version - {np.__version__}')
print(f'Seaborn version - {sns.__version__}')
print(f'Matplotlib version - {plt.__version__}')

Python version - 3.8.8
Pandas version - 1.2.4
Numpy version - 1.20.1
Seaborn version - 0.11.1
Matplotlib version - 3.3.4


# Configurations

## Pandas

In [3]:
#apresentará 500 colunas e linhas
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Seaborn

In [4]:
sns.set_palette('inferno')
sns.set_style('darkgrid')

def configuration(graphic_object, title=None, xlabel=None, ylabel=None, colors=None):
    graphic_object.figure.set_size_inches(20,6)
    graphic_object.set_title(title, color=colors, fontsize=16)
    graphic_object.set_xlabel(xlabel, color=colors, fontsize=14)
    graphic_object.set_ylabel(ylabel, color=colors, fontsize=14)
    graphic_object = graphic_object

## Warnings

In [5]:
import warnings
warnings.filterwarnings("ignore")

## Dataset

In [6]:
gym = pd.read_csv('gym_model.csv')
gym.head()

Unnamed: 0,number_people,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,year,class_crowded_3
0,37,4,0,0,71.76,0,0,8,17,2015,above_mean
1,45,4,0,0,71.76,0,0,8,17,2015,above_mean
2,40,4,0,0,71.76,0,0,8,17,2015,above_mean
3,44,4,0,0,71.76,0,0,8,17,2015,above_mean
4,45,4,0,0,71.76,0,0,8,17,2015,above_mean


In [7]:
#Hour
print('Results with columns %')
pd.crosstab(index=gym['class_crowded_3'], columns=gym['hour'], normalize='columns', margins=True, margins_name='total').round(3)

Results with columns %


hour,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,total
class_crowded_3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
above_mean,0.145,0.002,0.0,0.0,0.0,0.0,0.014,0.145,0.307,0.423,0.421,0.397,0.433,0.421,0.425,0.443,0.458,0.433,0.438,0.433,0.408,0.4,0.395,0.222,0.306
bellow_mean,0.428,0.127,0.0,0.0,0.0,0.004,0.538,0.523,0.551,0.432,0.365,0.382,0.374,0.414,0.426,0.336,0.224,0.164,0.14,0.148,0.197,0.191,0.231,0.286,0.291
crowded,0.029,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.033,0.08,0.154,0.207,0.183,0.155,0.132,0.204,0.304,0.382,0.401,0.359,0.316,0.328,0.292,0.182,0.17
empty,0.399,0.872,1.0,1.0,1.0,0.996,0.448,0.33,0.108,0.066,0.06,0.014,0.011,0.01,0.016,0.018,0.014,0.022,0.021,0.06,0.079,0.081,0.081,0.309,0.233


### Baseline

Baseline will be the random probability per hour for each class

Exemple: if hour = 17 then probability empty=0.02 / probability crowded=0.38 / probability above_mean=0.43 / probability bellow_mean=0.16

In [49]:
#Create an object with probabilities
probability_table = pd.crosstab(index=gym['class_crowded_3'], columns=gym['hour'], normalize='columns')

#create a random values for prediction
baseline = {}
for column in probability_table:
    probability_above_mean = probability_table[column].values[0]
    probability_bellow_mean = probability_table[column].values[1]
    probability_crowded = probability_table[column].values[2]
    probability_empty = probability_table[column].values[3]
    probability_per_hour = np.random.choice(4, 100, 
                                            p=[probability_above_mean,
                                               probability_bellow_mean,
                                               probability_crowded,
                                               probability_empty])
    baseline_per_hour = choice(probability_per_hour)
    dict_temp = {column:baseline_per_hour}
    baseline.update(dict_temp)

#labeled predicted values
dict_baseline = {}
for key, value in baseline.items():
    if value==0:
        dict_temp = {key:'above_mean'}
        dict_baseline.update(dict_temp)
    elif value==1:
        dict_temp = {key:'bellow_mean'}
        dict_baseline.update(dict_temp)
    elif value==2:
        dict_temp = {key:'crowded'}
        dict_baseline.update(dict_temp)
    else:
        dict_temp = {key:'empty'}
        dict_baseline.update(dict_temp)

#Output        
baseline = pd.Series(dict_baseline)

## Predict baseline

In [50]:
# X and Y
X = gym.drop(columns=['number_people', 'class_crowded_3'])
y = gym['class_crowded_3']

#split data in train and test
_, X_test, _, y_test = train_test_split(X,
                                        y,
                                        train_size=0.8,
                                        stratify=y,
                                        random_state=666)

predict_list = []
for hour in X_test['hour']:
    predict_list.append(baseline[hour])

dict_results = {
    'y_baseline':predict_list,
    'y_validation':y_test
}

results = pd.DataFrame(dict_results)
results['accuracy'] = results['y_baseline'] == results['y_validation']

results['accuracy'].value_counts(normalize=True)

False    0.541526
True     0.458474
Name: accuracy, dtype: float64

=============================================
- With only one variable was possible create a random prediction for classification with an accuracy almost 45%

- How much can a Decision Tree algorithm predict

=============================================