# In [1]:
import datetime as dt
import pandas as pd
import os
import dask
import dask.dataframe as dd
import re
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

def preprocessingDataFrame(df):
    """Build hourly per-station occupation features from raw status records.

    The input must provide the columns ``status``, ``last_reported``,
    ``station_id``, ``num_bikes_available`` and ``num_docks_available``.
    ``last_reported`` is parsed as a Unix timestamp in seconds.

    Steps:
      1. Keep only rows whose ``status`` equals ``'IN_SERVICE'``.
      2. Derive ``year``, ``month``, ``day``, ``weekday`` and ``hour`` from
         the timestamp.
      3. Compute ``occupation = bikes / (bikes + docks)`` per record and
         average it per station and hour.
      4. Add ``occupation_{1..4}h_before`` columns holding the occupation of
         the 1-4 preceding rows of the same station.
      5. Drop every row that still contains a missing value.

    Args:
        df: A DataFrame (or anything ``pd.DataFrame`` accepts) with the
            columns listed above. It is not modified.

    Returns:
        A new DataFrame with the columns ``station_id``, ``year``, ``month``,
        ``day``, ``weekday``, ``hour``, ``occupation`` and the four lag
        columns.
    """
    df = pd.DataFrame(df)

    # Take an explicit copy of the filtered rows: the column assignments
    # below would otherwise target a slice of the caller's frame
    # (SettingWithCopyWarning, ambiguous write-through).
    df = df[df['status'] == 'IN_SERVICE'].copy()

    # convert Unix timestamp column to datetime object
    df['datetime'] = pd.to_datetime(df['last_reported'], unit='s')

    # extract the calendar components used as grouping keys
    when = df['datetime'].dt
    df['day'] = when.day
    df['month'] = when.month
    df['year'] = when.year
    df['hour'] = when.hour
    df['weekday'] = when.weekday

    # Occupation ratio per record. 0/0 already yields NaN, but a non-zero
    # numerator over a zero total yields +/-inf, which dropna() would NOT
    # remove and which would turn the whole hourly mean into inf. Mask it to
    # NaN so mean() skips it and the final dropna() can discard it.
    total_slots = df['num_bikes_available'] + df['num_docks_available']
    ratio = df['num_bikes_available'] / total_slots
    df['occupation'] = ratio.mask(ratio.abs() == float('inf'))

    grouping_columns = ['station_id', 'year', 'month', 'day', 'weekday', 'hour']
    df_grouped = df.groupby(grouping_columns)['occupation'].mean().reset_index()

    # chronological order inside each station, required by the row shifts below
    df_grouped = df_grouped.sort_values(
        by=['station_id', 'year', 'month', 'day', 'hour']
    )

    # Occupation of the same station in the previous 4 rows.
    # NOTE(review): shift() moves by rows, not by clock time. If a station has
    # no record for some hour, "1h_before" holds the previous *available*
    # hour instead. Confirm whether such gaps occur in the data.
    per_station = df_grouped.groupby('station_id')['occupation']
    lagged = {
        'occupation_{}h_before'.format(lag): per_station.shift(lag)
        for lag in range(1, 5)
    }
    df_grouped = df_grouped.assign(**lagged)

    # Remove rows with missing values: the first 4 hours of every station
    # (no history yet) and hours whose occupation could not be computed.
    df_grouped = df_grouped.dropna()

    return df_grouped
    
def iterateCSV(path):
    """Preprocess every raw monthly CSV found directly inside ``path``.

    Each ``*.csv`` file is read with pandas (falling back to latin-1 when the
    default decoding fails), passed through ``preprocessingDataFrame`` and
    written as ``<YYYY_MM>_processed.csv``, where ``<YYYY_MM>`` is the first
    ``\\d{4}_\\d{2}`` token in the file name. Output files are written
    relative to the current working directory, not necessarily to ``path``.

    Files that are skipped:
      * files already ending in ``_processed.csv`` -- these are outputs of a
        previous run and no longer have the raw columns;
      * files whose name has no ``YYYY_MM`` token (a message is printed).

    Args:
        path: Directory to scan (not recursive).
    """
    output_suffix = "_processed.csv"
    # compiled once; the pattern is identical for every file
    period_pattern = re.compile(r"\d{4}_\d{2}")

    # sorted() makes the processing order deterministic across platforms
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv") or file.endswith(output_suffix):
            continue

        # Search the file name only: matching against the full path would
        # pick up a YYYY_MM-looking directory name and give every file the
        # same output name.
        match = period_pattern.search(file)
        if match is None:
            print("Skipping {!r}: no YYYY_MM period in the file name".format(file))
            continue

        file_path = os.path.join(path, file)

        try:
            data = pd.read_csv(file_path)
        except UnicodeDecodeError:
            data = pd.read_csv(file_path, encoding='latin-1')

        data = preprocessingDataFrame(data)
        data.to_csv(match.group(0) + output_suffix, index=False)


# Script entry: preprocess the CSV files located in the directory the
# script is launched from.
path = os.getcwd()
iterateCSV(path=path)