In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [54]:
# Only these four columns are parsed from the CSV; everything else in the file is skipped.
bicing_status_cols = ['last_updated', 'station_id', 'num_bikes_available', 'num_docks_available']
# Forward slashes are accepted on Windows, macOS and Linux alike; the previous
# backslash-separated raw string only resolved on Windows.
bicing_status = pd.read_csv(
    '../data/raw/bicing_stations_status/2022/2022_12_Desembre_BicingNou_ESTACIONS.csv',
    usecols=bicing_status_cols,
)

In [53]:
# Peek at 20 random rows; the fixed seed keeps the displayed sample identical
# across kernel restarts so the saved output stays reproducible.
bicing_status.sample(20, random_state=42)

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_updated
1569079,166,14,13,1670781595
168233,328,10,17,1669949095
4164478,221,2,21,1672322104
1126583,22,2,17,1670518789
2049125,512,0,26,1671066603
3659891,95,19,14,1672022996
1627961,475,10,9,1670816396
980895,288,0,32,1670432075
658502,402,10,21,1670240402
141697,44,3,18,1669933500


In [45]:
# Print the index range, per-column dtypes and memory footprint of the raw frame.
bicing_status.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4512523 entries, 0 to 4512522
Data columns (total 4 columns):
 #   Column               Dtype
---  ------               -----
 0   station_id           int64
 1   num_bikes_available  int64
 2   num_docks_available  int64
 3   last_updated         int64
dtypes: int64(4)
memory usage: 137.7 MB


In [46]:
# (rows, columns) of the raw frame.
bicing_status.shape

(4512523, 4)

In [47]:
# Summary statistics for every numeric column, transposed so each column
# becomes one row (easier to read side by side).
bicing_status.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
station_id,4512523.0,261.5072,149.427218,1.0,132.0,263.0,390.0,519.0
num_bikes_available,4512523.0,10.72229,7.664918,0.0,5.0,10.0,16.0,54.0
num_docks_available,4512523.0,15.6636,7.858505,0.0,10.0,16.0,21.0,54.0
last_updated,4512523.0,1671189000.0,773538.357338,1669849000.0,1670520000.0,1671190000.0,1671859000.0,1672528000.0


In [48]:
# Total memory of the raw frame in bytes (per-column usage, summed).
bicing_status.memory_usage(deep=True).sum()

144400864

In [49]:
# Number of missing values in each column.
bicing_status.isna().sum()

station_id             0
num_bikes_available    0
num_docks_available    0
last_updated           0
dtype: int64

In [109]:
def tweak_bicing_status(df, tz=None):
    """Turn raw station-status rows into compact, de-duplicated calendar features.

    The ``last_updated`` column (epoch seconds) is expanded into ``year``,
    ``month``, ``day``, ``weekday`` (ISO numbering, Monday=1 .. Sunday=7),
    ``hour`` and a boolean ``weekend`` flag (Saturday or Sunday).  The
    timestamp itself is then dropped and exact duplicate rows are removed, so
    readings of one station that fall in the same hour with identical counts
    collapse into a single row.  The surviving rows keep their original index
    labels (the index is not reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``last_updated`` column holding Unix epoch seconds.
        The input frame is not modified.
    tz : str or tzinfo, optional
        Time zone in which the calendar parts are computed, e.g.
        ``'Europe/Madrid'``.  With the default ``None`` the parts are derived
        from the UTC wall clock, as epoch seconds are UTC-based.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns followed by ``year``, ``month``, ``day``,
        ``weekday``, ``hour`` and ``weekend``.  int64 columns are stored as
        uint16 whenever all of their values fit.
    """
    uint16_max = np.iinfo(np.uint16).max

    def to_timestamps(epoch_seconds):
        """Convert epoch seconds to datetimes, localised to ``tz`` when given."""
        if tz is None:
            return pd.to_datetime(epoch_seconds, unit='s')
        return pd.to_datetime(epoch_seconds, unit='s', utc=True).dt.tz_convert(tz)

    def int64_to_uint16(df_):
        """Downcast int64 columns to uint16, skipping any that would not fit."""
        int64_cols = df_.select_dtypes('int64')
        # A plain astype('uint16') silently wraps values outside 0..65535,
        # so a column is only downcast when every value is inside that range;
        # otherwise it is left as int64 and keeps its true values.
        safe_cols = [col for col in int64_cols
                     if int64_cols[col].between(0, uint16_max).all()]
        return df_.astype({col: 'uint16' for col in safe_cols})

    return (df
            .assign(last_updated=lambda df_: to_timestamps(df_['last_updated']),
                    year=lambda df_: df_['last_updated'].dt.year.astype('uint16'),
                    month=lambda df_: df_['last_updated'].dt.month.astype('uint8'),
                    day=lambda df_: df_['last_updated'].dt.day.astype('uint8'),
                    # dayofweek is Monday=0; +1 yields the ISO weekday without
                    # building the full isocalendar() frame.
                    weekday=lambda df_: df_['last_updated'].dt.dayofweek.add(1).astype('uint8'),
                    hour=lambda df_: df_['last_updated'].dt.hour.astype('uint8'),
                    # ISO weekdays 6 and 7 are Saturday and Sunday.
                    weekend=lambda df_: df_['weekday'].ge(6))
            .pipe(int64_to_uint16)
            .drop(columns='last_updated')
            .drop_duplicates()
    )

In [107]:
# Inspect dtypes, row count and memory footprint of the transformed frame.
# NOTE(review): this cell's execution count (107) is lower than the one on the
# cell defining tweak_bicing_status (109), so the stored output may come from
# an earlier version of the function - re-run top to bottom to confirm.
tweak_bicing_status(bicing_status).info()

<class 'pandas.core.frame.DataFrame'>
Index: 1286126 entries, 0 to 4512515
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   station_id           1286126 non-null  uint16
 1   num_bikes_available  1286126 non-null  uint16
 2   num_docks_available  1286126 non-null  uint16
 3   year                 1286126 non-null  uint16
 4   month                1286126 non-null  uint8 
 5   day                  1286126 non-null  uint8 
 6   weekday              1286126 non-null  uint8 
 7   hour                 1286126 non-null  uint8 
 8   weekend              1286126 non-null  bool  
dtypes: bool(1), uint16(4), uint8(4)
memory usage: 25.8 MB


In [108]:
# Preview the first rows of the transformed frame.
# NOTE(review): this runs the whole transform again just for a preview, and the
# cell's execution count (108) predates the function definition cell (109);
# consider storing the result once and re-running the notebook in order.
tweak_bicing_status(bicing_status).head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,year,month,day,weekday,hour,weekend
0,1,29,15,2022,11,30,3,22,False
1,2,18,10,2022,11,30,3,22,False
2,3,11,16,2022,11,30,3,22,False
3,4,13,8,2022,11,30,3,22,False
4,5,13,26,2022,11,30,3,22,False
