In [74]:
from sklearn import preprocessing
import pandas as pd
import numpy as np
from math import pi, sin, cos
from datetime import datetime
from holidays_es import get_provinces, Province
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import mlflow
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn import set_config; set_config(display='diagram')
import pickle
from typing import Dict, List, Tuple, Sequence
from sklearn.metrics import mean_squared_error as MSE

In [1]:
from google.cloud import storage

BUCKET_NAME = "bicimad_data"

storage_filename = "alldata.csv"
local_filename = "alldata.csv"

client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(storage_filename)
blob.download_to_filename(f'../raw_data/{local_filename}')

In [4]:
df = pd.read_csv('../raw_data/alldata.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df.shape

(5373414, 28)

In [6]:
def compress(df, **kwargs):
    """
    Reduces size of dataframe by downcasting numerical columns
    """
    input_size = df.memory_usage(index=True).sum()/ 1024**2
    print("old dataframe size: ", round(input_size,2), 'MB')
    
    in_size = df.memory_usage(index=True).sum()
    for t in ["float", "integer"]:
        l_cols = list(df.select_dtypes(include=t))
        for col in l_cols:
            df[col] = pd.to_numeric(df[col], downcast=t)
    out_size = df.memory_usage(index=True).sum()
    ratio = (1 - round(out_size / in_size, 2)) * 100
    
    print("optimized size by {} %".format(round(ratio,2)))
    print("new dataframe size: ", round(out_size / 1024**2,2), " MB")
    return df

In [8]:
df = compress(df)

old dataframe size:  579.07 MB
optimized size by 0.0 %
new dataframe size:  579.07  MB


In [10]:
df = df.drop(columns=['activate', 'name', 'light', 'total_bases', 'free_bases', 
                      'longitude', 'no_available', 'address', 'latitude', 'id', 'date'])

In [9]:
df.year = df.year.astype(str)

In [11]:
df['datetime'] = df['datetime'].apply(lambda x: x[0:10]+' '+x[10:]+':00')

In [12]:
df['datetime']=pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')

In [13]:
df = df.set_index('datetime')

In [29]:
df.number = df.number.astype(str)

In [30]:
num_col = df.select_dtypes(['int8','float32']).drop(columns='dock_bikes')

In [31]:
cat_col = df.select_dtypes(['object','bool'])

In [32]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [33]:
preproc = make_column_transformer(
    (num_transformer, num_col.columns),
    (cat_transformer, cat_col.columns),
    remainder='passthrough')

In [34]:
pipe = make_pipeline(preproc)

In [35]:
df_2021 = df[df['year'] == '2021'].copy()

In [36]:
pipe.fit(df_2021)

In [None]:
for chunk in df.to_numpy()

In [38]:
df.to_numpy()

array([[0, '1a', 3, ..., 0.6234897971153259, 0.5, 0.8660253882408142],
       [0, '1a', 3, ..., 0.6234897971153259, 0.5, 0.8660253882408142],
       [0, '1a', 3, ..., 0.6234897971153259, 0.5, 0.8660253882408142],
       ...,
       [0, '9', 19, ..., 1.0, 0.0, 1.0],
       [0, '9', 19, ..., 1.0, 0.0, 1.0],
       [0, '9', 18, ..., 1.0, 0.0, 1.0]], dtype=object)

In [77]:
df.shape

(5373414, 16)

In [78]:
def initialize_model():
    RFR = RandomForestRegressor(n_estimators = 100, max_depth=15, random_state=0,n_jobs = -1)
    return RFR

In [86]:
chunk = 100_000
#inicializar modelo fora do for loop
model = initialize_model()

for i in range(0, (len(df)+1), chunk):
    print(i)
    # transformaçao
    df_processed = pipe.transform(df.iloc[i:i+chunk])
    X = df_processed[:,:-1]
    y = df_processed[:,-1]
    # fit Rede Neural
    model.fit(X,y)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000


In [91]:
pre_test = model.predict(X)
score = MSE(y,pre_test)
print(score.mean(), pre_test.mean())

13.282201387361178 8.704545994037066


In [41]:
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)        
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

In [45]:
X_1, y_1 = create_dataset(df.iloc[:100], df.dock_bikes.iloc[:100], 24)

In [50]:
y_1.shape

(76,)

In [51]:
X = df.drop(columns='dock_bikes')
y= df.dock_bikes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [54]:
df_train, df_test = train_test_split(df, test_size=0.20)

In [55]:
df_train_processed = pipe.transform(df_train)

In [59]:
y_train = df_train_processed[:,-1]

In [60]:
X_train = df_train_processed[:,:-1]

In [61]:
X_train.shape, y_train.shape

((4298731, 287), (4298731,))

In [62]:
df_test_processed = pipe.transform(df_test)

In [63]:
X_test = df_test_processed[:,:-1]

In [64]:
y_test = df_test_processed[:,-1]

In [75]:
from sklearn.ensemble import RandomForestRegressor
def RandomForest_model(xd_train,yd_train, xd_test,yd_test):
    RFR = RandomForestRegressor(n_estimators = 100, max_depth=15, random_state=0,n_jobs = -1)
    RFR.fit(xd_train,yd_train)
    pre_test = RFR.predict(xd_test)
    score = MSE(yd_test,pre_test)
    return score, pre_test

In [None]:
def initialize_model():
    RFR = RandomForestRegressor(n_estimators = 100, max_depth=15, random_state=0,n_jobs = -1)
    return RFR

In [76]:
score, pre_test = RandomForest_model(X_train, y_train, X_test, y_test)

KeyboardInterrupt: 