In [60]:
from datetime import datetime
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

# Working directory of the running process. It is used both to locate the
# service-account key file and as the destination for the saved model.
current_directory = os.getcwd()

# Google Cloud project passed to the BigQuery client in read_data.
PROJECT_ID = 'pf-group-03-nyc-taxis-427021'
# NOTE(review): BUCKET_NAME is not referenced by the code visible in this
# file; presumably kept for a storage bucket used elsewhere -- confirm.
BUCKET_NAME = 'datasets-taxis'
# Taxi fleet whose `ML_<type>` table is read and whose name tags the model file.
TAXI_TYPE = 'green'
# Trained models are written into the current working directory.
SAVE_BUCKET = current_directory

# read_data reads this variable back to load the service-account key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/".join(
    (current_directory, "pf-group-03-nyc-taxis-427021-0b50b328c30e.json")
)

def read_data(taxi_type):
    """Load every row of the ML table for one taxi type from BigQuery.

    Args:
        taxi_type: Suffix of the table name; the query reads from
            ``tablas.ML_<taxi_type>``. The value is interpolated directly
            into the SQL text, so it should come from trusted configuration.

    Returns:
        The query result converted with ``to_dataframe()``.

    Raises:
        KeyError: If ``GOOGLE_APPLICATION_CREDENTIALS`` is not set in the
            environment.
    """
    key_file = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    creds = service_account.Credentials.from_service_account_file(key_file)
    bq_client = bigquery.Client(credentials=creds, project=PROJECT_ID)
    query_job = bq_client.query(f"SELECT * FROM `tablas.ML_{taxi_type}`")
    return query_job.to_dataframe()

def prepare_data(df, *, return_all_encoders=False):
    """Select the modelling columns and label-encode the categorical ones.

    The input frame is left untouched: encoding happens on a copy of the
    four selected columns. Each categorical column gets its own
    ``LabelEncoder`` so that fitting one column does not overwrite the
    classes learned for another.

    Args:
        df: Frame containing at least the columns ``distrito_subida``,
            ``hora``, ``cantidad`` and ``dia_semana``.
        return_all_encoders: When true, the second return value is a dict
            mapping each encoded column name to its fitted encoder. When
            false (the default) only the ``dia_semana`` encoder is
            returned, which matches what callers received before.

    Returns:
        A tuple ``(features, encoder_or_encoders)`` where ``features`` is a
        new frame with columns ``distrito_subida``, ``hora``, ``cantidad``
        and ``dia_semana`` (categoricals replaced by integer codes).

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    # Copy the selection so the assignments below never write into the
    # caller's frame (and never trigger chained-assignment warnings).
    features = df[['distrito_subida', 'hora', 'cantidad', 'dia_semana']].copy()

    # Encode the categorical variables, one independent encoder per column.
    encoders = {}
    for column in ('distrito_subida', 'dia_semana'):
        encoder = LabelEncoder()
        features[column] = encoder.fit_transform(features[column])
        encoders[column] = encoder

    if return_all_encoders:
        return features, encoders
    # Backward-compatible default: the last-fitted (dia_semana) encoder.
    return features, encoders['dia_semana']

def split_data(X, y):
    """Split features and target into train and test parts.

    Args:
        X: Feature data.
        y: Target values aligned with ``X``.

    Returns:
        Whatever ``train_test_split`` returns for ``(X, y)``, unpacked by
        the caller as ``X_train, X_test, y_train, y_test``.
    """
    # 20% of the rows are held out; the fixed seed keeps the split repeatable.
    split_options = {"test_size": 0.2, "random_state": 42}
    return train_test_split(X, y, **split_options)

def train_model(X_train, y_train):
    """Fit a random-forest regressor on the training data.

    Args:
        X_train: Training features.
        y_train: Training target values.

    Returns:
        The fitted ``RandomForestRegressor`` (``max_depth=5``,
        ``random_state=42``).
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return RandomForestRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Report the model's mean absolute error on the test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True target values for ``X_test``.

    Returns:
        The mean absolute error between ``y_test`` and the predictions;
        the same value is also printed to stdout.
    """
    mae = mean_absolute_error(y_test, model.predict(X_test))
    print("Test MAE - ", mae)
    return mae

def save_model(model, save_bucket, taxi_type):
    """Serialize the model with joblib under a date-stamped file name.

    The file is written to
    ``<save_bucket>/rf_<taxi_type>_model_<YYYY_MM_DD>.pkl``, where the date
    is the local date at the time of the call.

    Args:
        model: Object to serialize.
        save_bucket: Directory (path prefix) in which to write the file.
        taxi_type: Taxi type label embedded in the file name.
    """
    # joblib is imported lazily so it is only required when saving.
    from joblib import dump

    today = datetime.now()
    timestamp = today.strftime("%Y_%m_%d")
    dump(model, f"{save_bucket}/rf_{taxi_type}_model_{timestamp}.pkl")

def main():
    """Run the pipeline: load, prepare, split, train, evaluate, save."""
    raw = read_data(TAXI_TYPE)

    # prepare_data also returns a fitted encoder, which is not needed here.
    prepared, _ = prepare_data(raw)

    # 'cantidad' is the regression target; the other three columns are inputs.
    target = prepared['cantidad']
    features = prepared[['distrito_subida', 'hora', 'dia_semana']]

    X_train, X_test, y_train, y_test = split_data(features, target)
    fitted = train_model(X_train, y_train)
    evaluate_model(fitted, X_test, y_test)
    save_model(fitted, SAVE_BUCKET, TAXI_TYPE)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()




Test MAE -  10.573701637965415
