# Bikesharing

## 1. Input

### Importing libraries

In [1]:
import pandas as pd

from pandas_profiling import ProfileReport

%matplotlib inline

### Read input files

In [2]:
# Load the comma-separated bike-sharing records into a DataFrame and
# preview the first rows as a quick sanity check on the parse.
df = pd.read_csv('./bikesharing_modified.txt', sep=',')
df.head()

Unnamed: 0,dteday,season,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1/1/2011,spring,0,0,Saturday,0,Clear,0.24,0.2879,0.81,0.0,16
1,1/1/2011,spring,1,0,Saturday,0,Clear,0.22,0.2727,0.8,0.0,40
2,1/1/2011,spring,2,0,Saturday,0,Clear,0.22,0.2727,0.8,0.0,32
3,1/1/2011,spring,3,0,Saturday,0,Clear,0.24,0.2879,0.75,0.0,13
4,1/1/2011,spring,4,0,Saturday,0,Clear,0.24,0.2879,0.75,0.0,1


## 2. EDA - Exploratory Data Analysis

In [3]:
# Summarise column dtypes and non-null counts; columns whose non-null count
# is below the number of entries contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      17379 non-null  object 
 1   season      17379 non-null  object 
 2   hr          17379 non-null  int64  
 3   holiday     17379 non-null  int64  
 4   weekday     17379 non-null  object 
 5   workingday  17379 non-null  int64  
 6   weathersit  17379 non-null  object 
 7   temp        17369 non-null  float64
 8   atemp       17379 non-null  float64
 9   hum         17366 non-null  float64
 10  windspeed   17379 non-null  float64
 11  cnt         17379 non-null  int64  
dtypes: float64(4), int64(4), object(4)
memory usage: 1.6+ MB


### Using Pandas Profiling for EDA

In [4]:
# Create the profiling report object for the full DataFrame.
# NOTE(review): the summarise/render progress output appears under the cell
# that calls `to_widgets()`, so the heavy work presumably happens there.
profile = ProfileReport(
    df,
    title='Pandas Profiling Report',
)

In [5]:
# Render the profiling report as interactive notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Conclusions of the EDA

- Categorical: 6
- Numeric: 6

Missing Cells: 23
* 10 from temp
* 13 from hum

Correlation: `temp` is correlated with `atemp`.

## 3. Missing Values (Data Imputation)

### Understanding temp

In [17]:
# Show the rows where `temp` is missing. The boolean mask returned by
# `isna()` is used directly; comparing it with `== True` is redundant.
df.loc[df['temp'].isna()]

Unnamed: 0,dteday,season,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
1681,3/15/2011,spring,16,0,Tuesday,1,Slightly cloudy,,0.3939,0.46,0.3284,114
6953,10/22/2011,winter,8,0,Saturday,0,Clear,,0.4091,0.71,0.0,114
7566,11/16/2011,winter,21,0,Wednesday,1,Light Snow,,0.4394,0.88,0.2836,75
7835,11/28/2011,winter,3,0,Monday,1,Clear,,0.4394,0.88,0.2239,5
8437,12/23/2011,spring,5,0,Friday,1,Clear,,0.3485,0.93,0.194,8
8877,1/10/2012,spring,18,0,Tuesday,1,Clear,,0.4091,0.4,0.1045,385
9708,2/14/2012,spring,11,0,Tuesday,1,Slightly cloudy,,0.303,0.45,0.1642,112
11758,5/10/2012,summer,3,0,Thursday,1,Clear,,0.4545,0.88,0.2239,3
15752,10/23/2012,winter,13,0,Tuesday,1,Slightly cloudy,,0.6212,0.44,0.0,303
16324,11/17/2012,winter,22,0,Saturday,0,Slightly cloudy,,0.3333,0.57,0.1045,145


### Understanding hum

In [19]:
# Show the rows where `hum` is missing. The boolean mask returned by
# `isna()` is used directly; comparing it with `== True` is redundant.
df.loc[df['hum'].isna()]

Unnamed: 0,dteday,season,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
424,1/19/2011,spring,17,0,Wednesday,1,Clear,0.36,0.3333,,0.3284,197
514,1/23/2011,spring,13,0,Sunday,0,Clear,0.14,0.1061,,0.3881,87
1251,2/25/2011,spring,7,0,Friday,1,Light Snow,0.34,0.3333,,0.1343,35
3141,5/15/2011,summer,19,0,Sunday,0,Clear,0.56,0.5303,,0.1045,231
5126,8/6/2011,fall,12,0,Saturday,0,Clear,0.8,0.7576,,0.2239,382
6239,9/22/2011,fall,13,0,Thursday,1,Slightly cloudy,0.66,0.6061,,0.194,193
8381,12/20/2011,winter,21,0,Tuesday,1,Clear,0.36,0.3636,,0.0896,143
11468,4/28/2012,summer,1,0,Saturday,0,Clear,0.36,0.3333,,0.2985,67
13224,7/10/2012,fall,5,0,Tuesday,1,Clear,0.66,0.6061,,0.1343,42
13789,8/2/2012,fall,18,0,Thursday,1,Clear,0.8,0.7727,,0.2239,767


### Data Imputation Methods

#### 3.1 Delete rows

In [26]:
# Listwise deletion: keep only the rows where both `temp` and `hum` are
# present. Note that, despite its name, `df_missing` holds the rows WITHOUT
# missing values in those two columns.
df_missing = df.loc[df['temp'].notna() & df['hum'].notna()]
df_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17356 entries, 0 to 17378
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      17356 non-null  object 
 1   season      17356 non-null  object 
 2   hr          17356 non-null  int64  
 3   holiday     17356 non-null  int64  
 4   weekday     17356 non-null  object 
 5   workingday  17356 non-null  int64  
 6   weathersit  17356 non-null  object 
 7   temp        17356 non-null  float64
 8   atemp       17356 non-null  float64
 9   hum         17356 non-null  float64
 10  windspeed   17356 non-null  float64
 11  cnt         17356 non-null  int64  
dtypes: float64(4), int64(4), object(4)
memory usage: 1.7+ MB


#### 3.2 Mean

In [21]:
# Column means of the original data (NaNs are skipped by default).
# `numeric_only=True` restricts the reduction to numeric columns explicitly:
# relying on pandas to silently drop the object columns (dteday, season,
# weekday, weathersit) is deprecated and raises a TypeError on pandas >= 2.0.
means = df.mean(numeric_only=True)
means

  means = df.mean()


hr             11.546752
holiday         0.028770
workingday      0.682721
temp            0.497035
atemp           0.475775
hum             0.627176
windspeed       0.190098
cnt           189.463088
dtype: float64

In [27]:
# Column means after deleting the rows with missing `temp`/`hum`, for
# comparison with the means of the original data.
# `numeric_only=True` restricts the reduction to numeric columns explicitly:
# relying on pandas to silently drop the object columns is deprecated and
# raises a TypeError on pandas >= 2.0.
means_missing = df_missing.mean(numeric_only=True)
means_missing

  means_missing = df_missing.mean()


hr             11.546266
holiday         0.028808
workingday      0.682703
temp            0.497044
atemp           0.475824
hum             0.627157
windspeed       0.190106
cnt           189.508239
dtype: float64