# Machine Learning Landscape

In [13]:
import pandas as pd

In [14]:
df_bikes = pd.read_csv('../data/bike_rentals.csv')
df_bikes.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


## 1. Exploratory Data Analysis

In [15]:
df_bikes.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,731.0,366.0,211.165812,1.0,183.5,366.0,548.5,731.0
season,731.0,2.49658,1.110807,1.0,2.0,3.0,3.0,4.0
yr,730.0,0.5,0.500343,0.0,0.0,0.5,1.0,1.0
mnth,730.0,6.512329,3.448303,1.0,4.0,7.0,9.75,12.0
holiday,731.0,0.028728,0.167155,0.0,0.0,0.0,0.0,1.0
weekday,731.0,2.997264,2.004787,0.0,1.0,3.0,5.0,6.0
workingday,731.0,0.682627,0.465773,0.0,0.0,1.0,1.0,1.0
weathersit,731.0,1.395349,0.544894,1.0,1.0,1.0,2.0,3.0
temp,730.0,0.495587,0.183094,0.05913,0.336875,0.499166,0.655625,0.861667
atemp,730.0,0.474512,0.163017,0.07907,0.337794,0.487364,0.608916,0.840896


In [16]:
df_bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    float64
 3   yr          730 non-null    float64
 4   mnth        730 non-null    float64
 5   holiday     731 non-null    float64
 6   weekday     731 non-null    float64
 7   workingday  731 non-null    float64
 8   weathersit  731 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         728 non-null    float64
 12  windspeed   726 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(10), int64(5), object(1)
memory usage: 91.5+ KB


## 2. Correcting null values

In [17]:
df_bikes.isna().sum()

instant       0
dteday        0
season        0
yr            1
mnth          1
holiday       0
weekday       0
workingday    0
weathersit    0
temp          1
atemp         1
hum           3
windspeed     5
casual        0
registered    0
cnt           0
dtype: int64

In [18]:
# total of nulls
df_bikes.isna().sum().sum()

12

In [19]:
# displaying null values
df_bikes[df_bikes.isna().any(axis=1)]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
56,57,2011-02-26,1.0,0.0,2.0,0.0,6.0,0.0,1,0.2825,0.282192,0.537917,,424,1545,1969
81,82,2011-03-23,2.0,0.0,3.0,0.0,3.0,1.0,2,0.346957,0.337939,0.839565,,203,1918,2121
128,129,2011-05-09,2.0,0.0,5.0,0.0,1.0,1.0,1,0.5325,0.525246,0.58875,,664,3698,4362
129,130,2011-05-10,2.0,0.0,5.0,0.0,2.0,1.0,1,0.5325,0.522721,,0.115671,694,4109,4803
213,214,2011-08-02,3.0,0.0,8.0,0.0,2.0,1.0,1,0.783333,0.707071,,0.20585,801,4044,4845
298,299,2011-10-26,4.0,0.0,10.0,0.0,3.0,1.0,2,0.484167,0.472846,0.720417,,404,3490,3894
388,389,2012-01-24,1.0,1.0,1.0,0.0,2.0,1.0,1,0.3425,0.349108,,0.123767,439,3900,4339
528,529,2012-06-12,2.0,1.0,6.0,0.0,2.0,1.0,2,0.653333,0.597875,0.833333,,477,4495,4972
701,702,2012-12-02,4.0,1.0,12.0,0.0,0.0,0.0,2,,,0.823333,0.124379,892,3757,4649
730,731,2012-12-31,1.0,,,0.0,1.0,0.0,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


## 3. Correcting null values: replacing with median/mean

In [20]:
# for windspeed column
df_bikes.windspeed.fillna(df_bikes.windspeed.median(), inplace=True)

In [21]:
df_bikes.iloc[[56, 81, 128]]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
56,57,2011-02-26,1.0,0.0,2.0,0.0,6.0,0.0,1,0.2825,0.282192,0.537917,0.180971,424,1545,1969
81,82,2011-03-23,2.0,0.0,3.0,0.0,3.0,1.0,2,0.346957,0.337939,0.839565,0.180971,203,1918,2121
128,129,2011-05-09,2.0,0.0,5.0,0.0,1.0,1.0,1,0.5325,0.525246,0.58875,0.180971,664,3698,4362


## 4. Group by median/mean

In [22]:
# group by season
df_bikes.groupby(['season']).median()

TypeError: agg function failed [how->median,dtype->object]

In [23]:
df_bikes.isna().sum()

instant       0
dteday        0
season        0
yr            1
mnth          1
holiday       0
weekday       0
workingday    0
weathersit    0
temp          1
atemp         1
hum           3
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [24]:
df_bikes.season.dtypes

dtype('float64')

In [25]:
df_bikes.dtypes

instant         int64
dteday         object
season        float64
yr            float64
mnth          float64
holiday       float64
weekday       float64
workingday    float64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [26]:
df_bikes_2 = df_bikes.drop('dteday', axis=1)
df_bikes_2.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [27]:
df_bikes_2.groupby(['season']).median()

Unnamed: 0_level_0,instant,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,366.0,0.5,2.0,0.0,3.0,1.0,1.0,0.285833,0.282821,0.54375,0.20275,218.0,1867.0,2209.0
2.0,308.5,0.5,5.0,0.0,3.0,1.0,1.0,0.562083,0.538212,0.646667,0.191546,867.0,3844.0,4941.5
3.0,401.5,0.5,8.0,0.0,3.0,1.0,1.0,0.714583,0.656575,0.635833,0.165115,1050.5,4110.5,5353.5
4.0,493.0,0.5,11.0,0.0,3.0,1.0,1.0,0.41,0.409708,0.661042,0.167918,544.5,3815.0,4634.5


In [28]:
df_bikes['hum'] = df_bikes['hum'].fillna(df_bikes_2.groupby('season')['hum'].transform('median')) # fill nulls with median of season
df_bikes.iloc[[129, 213, 388]]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
129,130,2011-05-10,2.0,0.0,5.0,0.0,2.0,1.0,1,0.5325,0.522721,0.646667,0.115671,694,4109,4803
213,214,2011-08-02,3.0,0.0,8.0,0.0,2.0,1.0,1,0.783333,0.707071,0.635833,0.20585,801,4044,4845
388,389,2012-01-24,1.0,1.0,1.0,0.0,2.0,1.0,1,0.3425,0.349108,0.54375,0.123767,439,3900,4339


## 5. Obtaining the median/mean from specific rows

In [31]:
# find null values of temp
df_bikes[df_bikes['temp'].isna()]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
701,702,2012-12-02,4.0,1.0,12.0,0.0,0.0,0.0,2,,,0.823333,0.124379,892,3757,4649


In [32]:
mean_temp = (df_bikes.iloc[700]['temp'] + df_bikes.iloc[702]['temp'])/2
mean_atemp = (df_bikes.iloc[700]['atemp'] + df_bikes.iloc[702]['atemp'])/2

In [35]:
df_bikes['temp'].fillna(mean_temp,  inplace=True) # fill nulls with mean of temp
df_bikes['atemp'].fillna(mean_atemp,  inplace=True) # fill nulls with mean of atemp

In [36]:
df_bikes.iloc[701]

instant              702
dteday        2012-12-02
season               4.0
yr                   1.0
mnth                12.0
holiday              0.0
weekday              0.0
workingday           0.0
weathersit             2
temp            0.375417
atemp            0.38635
hum             0.823333
windspeed       0.124379
casual               892
registered          3757
cnt                 4649
Name: 701, dtype: object

In [38]:
df_bikes['dteday'] = pd.to_datetime(df_bikes['dteday'], infer_datetime_format=True) # convert dteday to datetime

  df_bikes['dteday'] = pd.to_datetime(df_bikes['dteday'], infer_datetime_format=True) # convert dteday to datetime


In [39]:
import datetime as dt

In [40]:
df_bikes['mnth'] = df_bikes['dteday'].dt.month # extract month from dteday
df_bikes.head(2)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1.0,0.0,1,0.0,6.0,0.0,2,0.375417,0.38635,0.805833,0.160446,331,654,985
1,2,2011-01-02,1.0,0.0,1,0.0,0.0,0.0,2,0.375417,0.38635,0.696087,0.248539,131,670,801


In [41]:
df_bikes.yr.isna().sum()

1

In [42]:
df_bikes.iloc[730]

instant                       731
dteday        2012-12-31 00:00:00
season                        1.0
yr                            NaN
mnth                           12
holiday                       0.0
weekday                       1.0
workingday                    0.0
weathersit                      2
temp                     0.375417
atemp                     0.38635
hum                        0.5775
windspeed                0.154846
casual                        439
registered                   2290
cnt                          2729
Name: 730, dtype: object

In [43]:
df_bikes.loc[730, 'yr'] = 1 # replace nulls with 1 due to normalization

## 6. Deleting non-numerical  columns

In [44]:
# dteday is redundant now
df_bikes = df_bikes.drop('dteday', axis=1)

## 7. Predicting regression

In [45]:
# dropping redundant columns
df_bikes = df_bikes.drop(['casual', 'registered'], axis=1) # drop casual and registered columns
df_bikes.head(2)

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1,0.0,6.0,0.0,2,0.375417,0.38635,0.805833,0.160446,985
1,2,1.0,0.0,1,0.0,0.0,0.0,2,0.375417,0.38635,0.696087,0.248539,801


In [46]:
# saving dataframe
df_bikes.to_csv('../data/bike_rentals_clean.csv', index=False) # index=False to not save index as column

### 7.1 Declaring predictor and target columns

In [47]:
X = df_bikes.iloc[:, :-1] # independent variables
y = df_bikes.iloc[:, -1] # dependent variable

### 7.2 Accesing sklearn

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2105)

### 7.3 Silencing Warnings

In [52]:
import warnings
warnings.filterwarnings('ignore') # ignore warnings

### 7.4  Modeling linear regression

In [53]:
lin_reg = LinearRegression() # create linear regression object

In [54]:
lin_reg.fit(X_train, y_train) # fit model

In [55]:
# make predictions
y_pred = lin_reg.predict(X_test)

In [56]:
# evaluate model
from sklearn.metrics import mean_squared_error
import numpy as np

In [59]:
mse =  mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) # root mean squared error
# print rmse with 2 digits
print('Root Mean Squared Error: {:.2f}'.format(rmse))

Root Mean Squared Error: 1212.33


In [60]:
df_bikes.cnt.describe()

count     731.000000
mean     4504.348837
std      1937.211452
min        22.000000
25%      3152.000000
50%      4548.000000
75%      5956.000000
max      8714.000000
Name: cnt, dtype: float64

## 8. XGBoost

In [63]:
from xgboost import XGBRegressor

In [65]:
xg_reg = XGBRegressor() # create xgboost regressor object

In [66]:
xg_reg.fit(X_train, y_train) # fit model


In [67]:
# make predictions
y_pred = xg_reg.predict(X_test)

In [68]:
# compare predictions
mse =  mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) # root mean squared error

In [69]:
print( 'Root Mean Squared Error: {:.2f}'.format(rmse))

Root Mean Squared Error: 765.45


## 9. Cross Validation with linear regression

In [70]:
from sklearn.model_selection import cross_val_score

In [71]:
model = LinearRegression() # create linear regression object

In [72]:
scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error') # cross validation

In [74]:
rmse = np.sqrt(-scores) # root mean squared error
print('Reg rmse: ', np.round(rmse, 2)) # print rmse with 2 digits
print('Root Mean Squared Error: {:.2f}'.format(rmse.mean()))  # mean of rmse

Reg rmse:  [1626.7  1174.46 1343.68  883.43 1503.59 1952.09 1388.21 1385.96 1282.32
 2490.98]
Root Mean Squared Error: 1503.14


## 10. Cross  Validation with XGBoost

In [75]:
model =  XGBRegressor() # create xgboost regressor object

In [76]:
scores =  cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error') # cross validation
rmse = np.sqrt(-scores) # root mean squared error

In [78]:
print('Reg rmse: ', np.round(rmse, 2)) # print rmse with 2 digits
print( 'Root Mean Squared Error: {:.2f}'.format(rmse.mean())) # mean of rmse

Reg rmse:  [1149.14  666.12  613.65  736.32 1403.32 1638.44 1042.44  964.85  933.24
 2331.83]
Root Mean Squared Error: 1147.94


## 11. Classification Dataset

In [81]:
df_census = pd.read_csv( '../data/census_income/adult.data', header=None) # read census data
df_census.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [82]:
df_census.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'] # rename columns

In [83]:
df_census.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [84]:
# null values
df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [85]:
df_census.isna().sum() # check for null values

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [86]:
# drop education column since is the categorical value of education_num
df_census = df_census.drop('education', axis=1)

In [87]:
df_census = pd.get_dummies(df_census)
df_census.head(2)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ <=50K,income_ >50K
0,39,77516,13,2174,0,40,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
1,50,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False


### 11.1 Target and Predictor columns

In [88]:
# drop one column of income
df_census = df_census.drop('income_ <=50K', axis=1)
df_census.head(2)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,50,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [89]:
# split data into train and test
X = df_census.iloc[:, :-1] # independent variables
y = df_census.iloc[:, -1] # dependent variable

In [90]:
from sklearn.linear_model import LogisticRegression

In [91]:
def cross_val(classifier, num_splits=10):
    model =  classifier # create an instance of the  classifier
    scores =  cross_val_score(model, X, y, cv=num_splits, scoring='accuracy') # cross validation
    print('Accuracy ',  np.round(scores, 2)) # print accuracy with 2 digits
    print('Accuracy mean: {:.2f}'.format(scores.mean())) # mean of accuracy

In [92]:
cross_val(LogisticRegression()) # logistic regression

Accuracy  [0.8  0.8  0.79 0.8  0.79 0.81 0.79 0.8  0.8  0.8 ]
Accuracy mean: 0.80


### 11.2 XGBoost Classifier

In [93]:
from xgboost import XGBClassifier

In [94]:
# n_estimators = 5 because we need a short execution, even xgboost is fast
cross_val(XGBClassifier(n_estimators=5)) # xgboost

Accuracy  [0.85 0.86 0.87 0.85 0.86 0.86 0.86 0.87 0.86 0.86]
Accuracy mean: 0.86
