# Limpieza de datos

En este script vamos a revisar los datos y asegurarnos de que están limpios para realizar un análisis correcto de ellos.

In [201]:
import pandas as pd

## Carga de archivo y revisión de datos

In [202]:
# Read the raw CSV into the working frame and preview its first rows.
raw_csv_path = '../data/bike_sharing_modified.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1.0,2011-01-01,1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2.0,2011-01-01,1.0,0.0,1.0,1.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3.0,2011-01-01,1.0,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4.0,2011-01-01,1.0,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5.0,2011-01-01,1.0,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


In [203]:
# Report each column's dtype and non-null count to see what needs cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   instant         17524 non-null  object
 1   dteday          17542 non-null  object
 2   season          17519 non-null  object
 3   yr              17515 non-null  object
 4   mnth            17515 non-null  object
 5   hr              17489 non-null  object
 6   holiday         17552 non-null  object
 7   weekday         17539 non-null  object
 8   workingday      17527 non-null  object
 9   weathersit      17526 non-null  object
 10  temp            17546 non-null  object
 11  atemp           17534 non-null  object
 12  hum             17512 non-null  object
 13  windspeed       17545 non-null  object
 14  casual          17523 non-null  object
 15  registered      17530 non-null  object
 16  cnt             17534 non-null  object
 17  mixed_type_col  15973 non-null  object
dtypes: obj

Every column was read as `object`, which suggests that each one contains mixed data. The first rows show numeric values, but the columns may also contain stray strings or out-of-range entries; for example, month or weekday values that are probably meant to lie within 1-12 or 1-7 (or a 0-based equivalent). Let's dig in a bit deeper.

## Clean up Instant column

A simple observation of the data in the instant column shows that it is an incremental int value, so we are going to proceed and ensure all values in this column become numerical integer values.

After this clean up we save the dataframe to a new csv.

In [204]:
# Turn the instant column into numbers; entries that cannot be parsed become NaN.
df['instant'] = pd.to_numeric(df['instant'], errors='coerce')
# Fill the gaps linearly. limit_direction='both' also fills NaNs at the very
# start of the column, which the default forward-only pass leaves untouched
# (and which would make the integer cast below raise).
df['instant'] = df['instant'].interpolate(method='linear', limit_direction='both')
# Round before casting: interpolation works in floating point, and a bare int
# cast truncates, so a result such as 6.999999 would silently become 6.
# 'int64' is spelled out so the dtype does not depend on the platform.
df['instant'] = df['instant'].round().astype('int64')

print("Instant data type:", df['instant'].dtype)
print("Missing values in 'instant':", df['instant'].isnull().sum())

df.head()

Instant data type: int64
Missing values in 'instant': 0


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1.0,0.0,1.0,1.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1.0,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1.0,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1.0,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


In [205]:
# Checkpoint: write the current state of df to the cleaned CSV (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

## Clean up the dteday

This column was reported as an object type in the `df.info()` report. This may result from missing values, bad formatting, or other issues in the data. The column clearly holds dates, so a first cleanup step is to trim the whitespace around all values.

In [206]:
# Trim surrounding whitespace, then parse as dates; unparseable entries become NaT.
dteday_text = df['dteday'].str.strip()
df['dteday'] = pd.to_datetime(dteday_text, errors='coerce')

dteday_missing = df['dteday'].isna().sum()
print("Dteday data type:", df['dteday'].dtype)
print("Missing values in 'dteday':", dteday_missing)
df.head()

Dteday data type: datetime64[ns]
Missing values in 'dteday': 195


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1.0,0.0,1.0,1.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1.0,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1.0,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1.0,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


In [207]:
# Preview the first rows whose date is missing after parsing.
dteday_is_missing = df['dteday'].isna()
df.loc[dteday_is_missing].head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
54,55,NaT,1.0,0.0,1.0,9.0,0.0,1.0,1.0,1.0,0.16,5.7288,0.43,0.3881,7.0,81.0,88.0,unknown
71,72,NaT,1.0,0.0,1.0,2.0,0.0,2.0,1.0,1.0,0.14,0.1515,0.63,0.1343,0.0,1.0,1.0,bad
280,281,NaT,1.0,0.0,1.0,3.0,0.0,4.0,1.0,1.0,0.14,0.1212,0.5,0.3284,0.0,3.0,3.0,649
471,472,NaT,1.0,0.0,1.0,16.0,0.0,5.0,1.0,1.0,0.16,0.1364,0.26,0.3582,0.0,97.0,97.0,121
478,479,NaT,1.0,0.0,1.0,23.0,0.0,5.0,1.0,1.0,0.06,0.0455,0.38,0.3284,0.0,27.0,27.0,739


We are going to leave these dates as missing (NaT). We could apply the same interpolation used for `instant`, but we will leave imputation to the EDA stage.

In [208]:
# Checkpoint: write the current state of df to the cleaned CSV again (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

## Clean up season

Season was originally read as `object` because of bad formatting, and it also includes values beyond the four seasons that exist on planet Earth.

In [209]:
# Trim whitespace, parse as numbers (bad entries -> NA) and store as nullable integers.
season_numeric = pd.to_numeric(df['season'].str.strip(), errors='coerce')
df['season'] = season_numeric.astype('Int64')

# Anything that is not one of the four season codes is treated as missing.
outside_valid_seasons = ~df['season'].isin([1, 2, 3, 4])
df.loc[outside_valid_seasons, 'season'] = pd.NA

season_missing = df['season'].isna()
print("Season data type:", df['season'].dtype)
print("Missing values in 'season':", season_missing.sum())
print("% of missing values in 'season':", season_missing.mean() * 100)
print(df['season'].value_counts())
df.head()


Season data type: Int64
Missing values in 'season': 432
% of missing values in 'season': 2.437098048064989
season
3    4483
2    4388
4    4222
1    4201
Name: count, dtype: Int64


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1,0.0,1.0,1.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


In [210]:
# Checkpoint: write the current state of df to the cleaned CSV (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

## Clean up yr, mnth, hr 

These columns all look like integers, so we should cast them as such and ensure the values lie within appropriate ranges.

In [211]:
# Parse each time-part column as numbers (bad entries -> NA) and store as nullable integers.
for time_col in ('yr', 'mnth', 'hr'):
    df[time_col] = pd.to_numeric(df[time_col], errors='coerce').astype('Int64')

In [212]:
# Show the value frequencies of each time-part column to spot out-of-range entries.
for time_col in ('yr', 'mnth', 'hr'):
    print(df[time_col].value_counts())

yr
1      8674
0      8642
51        4
22        3
73        3
       ... 
140       1
13        1
184       1
38        1
93        1
Name: count, Length: 120, dtype: Int64
mnth
7       1495
8       1474
3       1473
5       1473
12      1466
        ... 
96         1
890        1
172        1
1089       1
224        1
Name: count, Length: 171, dtype: Int64
hr
16     733
14     731
18     730
19     730
9      730
      ... 
231      1
360      1
906      1
116      1
987      1
Name: count, Length: 186, dtype: Int64


In [213]:
# Allowed values for each time-part column. Anything else is set to NA.
# A boolean mask is applied directly, as in the other range filters of this
# notebook, instead of materialising a filtered copy of the frame per column
# just to reuse its index.
allowed_values = {
    'mnth': range(1, 13),  # months 1..12
    'hr': range(0, 24),    # hours 0..23
    'yr': [0, 1],          # year flag 0 or 1
}
for time_col, allowed in allowed_values.items():
    df.loc[~df[time_col].isin(allowed), time_col] = pd.NA


In [214]:
# Re-check the value frequencies of each time-part column after the range filter.
for time_col in ('yr', 'mnth', 'hr'):
    print(df[time_col].value_counts())

yr
1    8674
0    8642
Name: count, dtype: Int64
mnth
7     1495
8     1474
3     1473
5     1473
12    1466
11    1440
9     1429
10    1427
6     1425
4     1416
1     1413
2     1336
Name: count, dtype: Int64
hr
16    733
14    731
18    730
9     730
19    730
22    729
20    729
13    729
11    725
15    722
8     720
12    720
21    720
23    720
17    719
6     718
0     718
10    715
1     713
2     709
7     709
5     708
3     693
4     689
Name: count, dtype: Int64


In [215]:
# Checkpoint: write the current state of df to the cleaned CSV again (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

In [216]:
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   instant         17726 non-null  int64         
 1   dteday          17531 non-null  datetime64[ns]
 2   season          17294 non-null  Int64         
 3   yr              17316 non-null  Int64         
 4   mnth            17267 non-null  Int64         
 5   hr              17259 non-null  Int64         
 6   holiday         17552 non-null  object        
 7   weekday         17539 non-null  object        
 8   workingday      17527 non-null  object        
 9   weathersit      17526 non-null  object        
 10  temp            17546 non-null  object        
 11  atemp           17534 non-null  object        
 12  hum             17512 non-null  object        
 13  windspeed       17545 non-null  object        
 14  casual          17523 non-null  object        
 15  re

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0,1,0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1,0,1,1,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1,0,1,2,0.0,6.0,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1,0,1,3,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1,0,1,4,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


## Clean up weekday

Weekday should be a value between 0 and 6. There are badly formatted values, as well as values that fall outside this range.

In [217]:
# Frequency of each distinct raw weekday value, to spot badly formatted or out-of-range entries.
df['weekday'].value_counts()

weekday
6.0        2386
4.0        2373
5.0        2358
0.0        2353
1.0        2349
           ... 
951.0         1
594.0         1
 733.0        1
372.0         1
231.0         1
Name: count, Length: 168, dtype: int64

In [218]:
# Trim whitespace, parse as numbers (bad entries -> NA) and store as nullable integers.
weekday_numeric = pd.to_numeric(df['weekday'].str.strip(), errors='coerce')
df['weekday'] = weekday_numeric.astype('Int64')

# Keep only the codes 0..6; everything else is treated as missing.
outside_week = ~df['weekday'].isin(range(0, 7))
df.loc[outside_week, 'weekday'] = pd.NA
print(df['weekday'].value_counts())

weekday
5    2494
6    2490
0    2485
1    2479
4    2467
2    2454
3    2453
Name: count, dtype: Int64


In [219]:
# Checkpoint: write the current state of df to the cleaned CSV again (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

In [222]:
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   instant         17726 non-null  int64         
 1   dteday          17531 non-null  datetime64[ns]
 2   season          17294 non-null  Int64         
 3   yr              17316 non-null  Int64         
 4   mnth            17267 non-null  Int64         
 5   hr              17259 non-null  Int64         
 6   holiday         17552 non-null  object        
 7   weekday         17322 non-null  Int64         
 8   workingday      17527 non-null  object        
 9   weathersit      17526 non-null  object        
 10  temp            17546 non-null  object        
 11  atemp           17534 non-null  object        
 12  hum             17512 non-null  object        
 13  windspeed       17545 non-null  object        
 14  casual          17523 non-null  object        
 15  re

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0,1,0,0.0,6,0.0,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1,0,1,1,0.0,6,0.0,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1,0,1,2,0.0,6,0.0,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1,0,1,3,0.0,6,0.0,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1,0,1,4,0.0,6,0.0,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


## Clean holiday, working day

Holiday and working day look like they should only take the values 0 and 1, so they are most likely booleans. There are a lot of records with values outside this range.

In [223]:
# Show the raw value frequencies of both flag columns.
for flag_col in ('holiday', 'workingday'):
    print(df[flag_col].value_counts())

holiday
0.0        16048
 0.0         828
1.0          483
invalid       25
 1.0          22
           ...  
118.0          1
756.0          1
763.0          1
944.0          1
528.0          1
Name: count, Length: 101, dtype: int64
workingday
1.0        11225
0.0         5235
 1.0         606
 0.0         278
?             20
           ...  
938.0          1
 701.0         1
 ?             1
303.0          1
28.0           1
Name: count, Length: 119, dtype: int64


In [224]:
# Parse both flag columns as nullable integers and keep only the values 0 and 1;
# everything else is treated as missing.
for flag_col in ('holiday', 'workingday'):
    df[flag_col] = pd.to_numeric(df[flag_col], errors='coerce').astype('Int64')
    not_a_flag = ~df[flag_col].isin([0, 1])
    df.loc[not_a_flag, flag_col] = pd.NA

for flag_col in ('holiday', 'workingday'):
    print(df[flag_col].value_counts())

holiday
0    16876
1      505
Name: count, dtype: Int64
workingday
1    11831
0     5513
Name: count, dtype: Int64


In [226]:
# Convert the 0/1 flag columns to the nullable boolean dtype.
flags_as_bool = df[['holiday', 'workingday']].astype('boolean')
df['holiday'] = flags_as_bool['holiday']
df['workingday'] = flags_as_bool['workingday']

for flag_col in ('holiday', 'workingday'):
    print(df[flag_col].value_counts())

holiday
False    16876
True       505
Name: count, dtype: Int64
workingday
True     11831
False     5513
Name: count, dtype: Int64


In [227]:
# Checkpoint: write the current state of df to the cleaned CSV again (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

In [228]:
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   instant         17726 non-null  int64         
 1   dteday          17531 non-null  datetime64[ns]
 2   season          17294 non-null  Int64         
 3   yr              17316 non-null  Int64         
 4   mnth            17267 non-null  Int64         
 5   hr              17259 non-null  Int64         
 6   holiday         17381 non-null  boolean       
 7   weekday         17322 non-null  Int64         
 8   workingday      17344 non-null  boolean       
 9   weathersit      17526 non-null  object        
 10  temp            17546 non-null  object        
 11  atemp           17534 non-null  object        
 12  hum             17512 non-null  object        
 13  windspeed       17545 non-null  object        
 14  casual          17523 non-null  object        
 15  re

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0,1,0,False,6,False,1.0,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1,0,1,1,False,6,False,1.0,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1,0,1,2,False,6,False,1.0,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1,0,1,3,False,6,False,1.0,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1,0,1,4,False,6,False,1.0,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


## Clean weathersit

In [229]:
# Frequency of each distinct raw weathersit value, to spot badly formatted or out-of-range entries.
df['weathersit'].value_counts()

weathersit
1.0      10809
2.0       4289
3.0       1334
 1.0       547
 2.0       239
         ...  
433.0        1
139.0        1
834.0        1
630.0        1
393.0        1
Name: count, Length: 172, dtype: int64

In [230]:
# Trim whitespace, parse as numbers (bad entries -> NA) and store as nullable integers.
weathersit_numeric = pd.to_numeric(df['weathersit'].str.strip(), errors='coerce')
df['weathersit'] = weathersit_numeric.astype('Int64')
df['weathersit'].value_counts()

weathersit
1      11356
2       4528
3       1393
67         3
93         3
       ...  
630        1
834        1
139        1
433        1
393        1
Name: count, Length: 159, dtype: Int64

In [233]:
# Values other than 1, 2 or 3 are treated as missing.
outside_weather_codes = ~df['weathersit'].isin([1, 2, 3])
df.loc[outside_weather_codes, 'weathersit'] = pd.NA

weathersit_na_count = df['weathersit'].isna().sum()
print(df['weathersit'].value_counts())
print("NA:", weathersit_na_count)

weathersit
1    11356
2     4528
3     1393
Name: count, dtype: Int64
NA: 449


In [236]:
# Checkpoint: write the current state of df to the cleaned CSV again (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

In [235]:
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   instant         17726 non-null  int64         
 1   dteday          17531 non-null  datetime64[ns]
 2   season          17294 non-null  Int64         
 3   yr              17316 non-null  Int64         
 4   mnth            17267 non-null  Int64         
 5   hr              17259 non-null  Int64         
 6   holiday         17381 non-null  boolean       
 7   weekday         17322 non-null  Int64         
 8   workingday      17344 non-null  boolean       
 9   weathersit      17277 non-null  Int64         
 10  temp            17546 non-null  object        
 11  atemp           17534 non-null  object        
 12  hum             17512 non-null  object        
 13  windspeed       17545 non-null  object        
 14  casual          17523 non-null  object        
 15  re

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0,1,0,False,6,False,1,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1,0,1,1,False,6,False,1,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1,0,1,2,False,6,False,1,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1,0,1,3,False,6,False,1,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1,0,1,4,False,6,False,1,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


## Clean up temp, atemp, hum and windspeed

In [237]:
# Show the raw value frequencies of the four continuous measurement columns.
for measure_col in ('temp', 'atemp', 'hum', 'windspeed'):
    print(df[measure_col].value_counts())

temp
0.62      689
0.64      651
0.66      651
0.7       643
0.6       635
         ... 
548.44      1
 ?          1
439.4       1
388.56      1
571.76      1
Name: count, Length: 281, dtype: int64
atemp
0.6212       925
0.5152       583
0.4091       578
0.3333       570
0.6667       568
            ... 
201.5455       1
33.2755        1
546.5303       1
 21.1232       1
9.0002         1
Name: count, Length: 320, dtype: int64
hum
0.88      627
0.83      589
0.94      532
0.87      462
0.7       402
         ... 
47.58       1
958.79      1
26.66       1
 null       1
21.0        1
Name: count, Length: 337, dtype: int64
windspeed
0.0                     2076
0.1343                  1651
0.1642                  1610
0.194                   1580
0.1045                  1530
                        ... 
 6.8991999999999996        1
0.8358                     1
11.3298                    1
671.1343                   1
12.4792                    1
Name: count, Length: 219, dtype: int64


It looks like these values should be floats within the range 0 to 1.

In [239]:
# Parse the measurement columns as numbers; entries that cannot be parsed become NaN.
measure_cols = ['temp', 'atemp', 'hum', 'windspeed']
for measure_col in measure_cols:
    df[measure_col] = pd.to_numeric(df[measure_col], errors='coerce')

for measure_col in measure_cols:
    print(df[measure_col].value_counts())

temp
0.62      732
0.64      689
0.70      682
0.66      682
0.36      669
         ... 
439.40      1
388.56      1
999.46      1
325.48      1
571.76      1
Name: count, Length: 229, dtype: int64
atemp
0.6212      974
0.5152      617
0.4091      612
0.3333      601
0.6667      594
           ... 
201.5455      1
33.2755       1
546.5303      1
21.1232       1
9.0002        1
Name: count, Length: 259, dtype: int64
hum
0.88      657
0.83      616
0.94      554
0.87      483
0.70      422
         ... 
479.77      1
29.70       1
788.66      1
47.58       1
21.00       1
Name: count, Length: 255, dtype: int64
windspeed
0.0000      2189
0.1343      1721
0.1642      1690
0.1940      1656
0.1045      1605
            ... 
6.8992         1
0.8358         1
11.3298        1
671.1343       1
12.4792        1
Name: count, Length: 187, dtype: int64


In [240]:
# Keep only readings inside the closed interval [0, 1]; everything else becomes missing.
measure_cols = ['temp', 'atemp', 'hum', 'windspeed']
for measure_col in measure_cols:
    within_unit_range = df[measure_col].between(0, 1)
    df[measure_col] = df[measure_col].where(within_unit_range)

for measure_col in measure_cols:
    print(df[measure_col].value_counts())

temp
0.62    732
0.64    689
0.70    682
0.66    682
0.36    669
0.60    668
0.30    649
0.34    641
0.40    613
0.32    612
0.56    573
0.54    566
0.46    562
0.26    561
0.72    561
0.52    553
0.42    546
0.50    533
0.24    516
0.74    514
0.44    501
0.22    414
0.76    401
0.38    369
0.20    347
0.68    346
0.80    320
0.58    306
0.28    297
0.48    289
0.16    229
0.82    215
0.78    165
0.18    153
0.84    141
0.14    136
0.86    133
0.90     91
0.12     76
0.88     53
0.92     51
0.10     48
0.02     18
0.96     17
0.08     17
0.94     16
0.06     16
0.04     15
0.98      1
1.00      1
Name: count, dtype: int64
atemp
0.6212    974
0.5152    617
0.4091    612
0.3333    601
0.6667    594
         ... 
0.0000      2
0.9848      2
0.8664      1
0.9545      1
1.0000      1
Name: count, Length: 66, dtype: int64
hum
0.88    657
0.83    616
0.94    554
0.87    483
0.70    422
       ... 
0.12      1
0.08      1
0.10      1
0.97      1
0.91      1
Name: count, Length: 89, dtype: int

In [242]:
# Checkpoint: write the current state of df to the cleaned CSV again (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)

In [241]:
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   instant         17726 non-null  int64         
 1   dteday          17531 non-null  datetime64[ns]
 2   season          17294 non-null  Int64         
 3   yr              17316 non-null  Int64         
 4   mnth            17267 non-null  Int64         
 5   hr              17259 non-null  Int64         
 6   holiday         17381 non-null  boolean       
 7   weekday         17322 non-null  Int64         
 8   workingday      17344 non-null  boolean       
 9   weathersit      17277 non-null  Int64         
 10  temp            17304 non-null  float64       
 11  atemp           17278 non-null  float64       
 12  hum             17282 non-null  float64       
 13  windspeed       17319 non-null  float64       
 14  casual          17523 non-null  object        
 15  re

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0,1,0,False,6,False,1,0.24,0.2879,0.81,0.0,3.0,13.0,16.0,702
1,2,2011-01-01,1,0,1,1,False,6,False,1,0.22,0.2727,0.8,0.0,8.0,32.0,40.0,831
2,3,2011-01-01,1,0,1,2,False,6,False,1,0.22,0.2727,0.8,0.0,5.0,27.0,32.0,175
3,4,2011-01-01,1,0,1,3,False,6,False,1,0.24,0.2879,0.75,0.0,3.0,10.0,13.0,581
4,5,2011-01-01,1,0,1,4,False,6,False,1,0.24,0.2879,0.75,0.0,0.0,1.0,1.0,659


## Clean up casual, registered, cnt and mixed_type_col

These columns look like integers. There is no obvious valid range to enforce here, so we will just set any value that is not a number to NA.

In [243]:
# Parse the remaining columns as numbers (bad entries -> NA) and store as nullable integers.
for count_col in ('casual', 'registered', 'cnt', 'mixed_type_col'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce').astype('Int64')

In [248]:
print("General info after cleaning casual, registered, cnt and mixed_type_col:")
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print("Missing values in each column:")
print(df.isna().sum())
df.head()

General info after cleaning casual, registered, cnt and mixed_type_col:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   instant         17726 non-null  int64         
 1   dteday          17531 non-null  datetime64[ns]
 2   season          17294 non-null  Int64         
 3   yr              17316 non-null  Int64         
 4   mnth            17267 non-null  Int64         
 5   hr              17259 non-null  Int64         
 6   holiday         17381 non-null  boolean       
 7   weekday         17322 non-null  Int64         
 8   workingday      17344 non-null  boolean       
 9   weathersit      17277 non-null  Int64         
 10  temp            17304 non-null  float64       
 11  atemp           17278 non-null  float64       
 12  hum             17282 non-null  float64       
 13  windspeed       17319 non-null  fl

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1,2011-01-01,1,0,1,0,False,6,False,1,0.24,0.2879,0.81,0.0,3,13,16,702
1,2,2011-01-01,1,0,1,1,False,6,False,1,0.22,0.2727,0.8,0.0,8,32,40,831
2,3,2011-01-01,1,0,1,2,False,6,False,1,0.22,0.2727,0.8,0.0,5,27,32,175
3,4,2011-01-01,1,0,1,3,False,6,False,1,0.24,0.2879,0.75,0.0,3,10,13,581
4,5,2011-01-01,1,0,1,4,False,6,False,1,0.24,0.2879,0.75,0.0,0,1,1,659


In [None]:
# Final checkpoint: write the fully cleaned df to the cleaned CSV (row index omitted).
df.to_csv('../data/bike_sharing_cleaned.csv', index=False)