In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load the raw reservations table.
hotel = pd.read_csv('../hotel_reservations.csv')

# Derive the target: bucket the average room price into three ordinal bands
#   price <= 85 -> 1,  85 < price <= 115 -> 2,  anything else -> 3.
hotel['label_avg_price_per_room'] = hotel['avg_price_per_room'].apply(
    lambda price: 1 if price <= 85 else (2 if price <= 115 else 3)
)

# Drop the raw price the label was computed from, the booking identifier and
# the remaining columns that are not used as features further down.
hotel = hotel.drop(
    columns=[
        'avg_price_per_room',
        'no_of_children',
        'required_car_parking_space',
        'repeated_guest',
        'no_of_previous_cancellations',
        'no_of_previous_bookings_not_canceled',
        'Booking_ID',
    ]
)
hotel

Unnamed: 0,no_of_adults,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,no_of_special_requests,booking_status,label_avg_price_per_room
0,2,1,2,Meal Plan 1,Room_Type 1,224,2017,10,2,Offline,0,Not_Canceled,1
1,2,2,3,Not Selected,Room_Type 1,5,2018,11,6,Online,1,Not_Canceled,2
2,1,2,1,Meal Plan 1,Room_Type 1,1,2018,2,28,Online,0,Canceled,1
3,2,0,2,Meal Plan 1,Room_Type 1,211,2018,5,20,Online,0,Canceled,2
4,2,1,1,Not Selected,Room_Type 1,48,2018,4,11,Online,0,Canceled,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,2,6,Meal Plan 1,Room_Type 4,85,2018,8,3,Online,1,Not_Canceled,3
36271,2,1,3,Meal Plan 1,Room_Type 1,228,2018,10,17,Online,2,Canceled,2
36272,2,2,6,Meal Plan 1,Room_Type 1,148,2018,7,1,Online,2,Not_Canceled,2
36273,2,0,3,Not Selected,Room_Type 1,63,2018,4,21,Online,0,Canceled,2


In [15]:
# Features: every column except the price-band label.
X = hotel.drop(columns=['label_avg_price_per_room'])

# Target: shift the 1/2/3 price bands down to 0/1/2.
y = hotel['label_avg_price_per_room'] - 1

# Hold out 20% of the rows for the final evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [16]:
# Split the feature columns by dtype. Numeric (and boolean) columns are
# standardized; every other column is treated as categorical and one-hot
# encoded. Testing for "numeric" instead of `dtype == "object"` keeps text
# columns on the categorical side even when pandas stores them with a
# dedicated string dtype rather than `object`.
numeric_cols = [cname for cname in X.columns if pd.api.types.is_numeric_dtype(X[cname])]
categorical_cols = [cname for cname in X.columns if cname not in numeric_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        # handle_unknown='ignore': a category that was absent from the data the
        # encoder was fitted on (a CV fold or the train split) is encoded as an
        # all-zero row instead of raising at transform/predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [17]:
# Bundle preprocessing and the classifier into one estimator so the scaler and
# encoder are re-fitted on whatever data the pipeline itself is fitted on.
model_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

In [18]:
# 5-fold cross-validation of the whole pipeline on the full X / y (this includes
# the rows that were also set aside as the test split). No `scoring` is passed,
# so the estimator's default score is used -- accuracy for a classifier.
cv_scores = cross_val_score(estimator=model_rf, X=X, y=y, cv=5)
print(f"Precisão média com validação cruzada: {cv_scores.mean() * 100:.2f}%")

Precisão média com validação cruzada: 85.01%


In [19]:
# Fit the whole pipeline (preprocessing + random forest) on the training split.
model_rf.fit(X_train, y_train)  

In [20]:
# Predict on the held-out split and report the share of correctly classified rows.
y_pred = model_rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Precisão no conjunto de teste: {accuracy * 100:.2f}%")

Precisão no conjunto de teste: 85.46%


# Salvando modelo no bucket S3

In [None]:
import boto3
import joblib
from io import BytesIO

# Object to persist: the pipeline built and fitted in the cells above.
file = model_rf
# Destination prefix: only the bucket here; the object key is appended at the call site.
path = 's3://sprint-5-equipe-2'

def write_joblib(file, path):
    """Serialize ``file`` with joblib to an S3 object or to a local file.

    Parameters
    ----------
    file : object
        The object to dump with ``joblib.dump``.
    path : str or path-like
        Destination. A string starting with ``s3://`` is read as
        ``s3://<bucket>/<key>`` and uploaded with boto3; anything else is
        opened as a local file in binary write mode.

    Raises
    ------
    ValueError
        If an ``s3://`` path is missing the bucket name or the object key.
    """
    s3_scheme = 's3://'
    # Only plain strings can be S3 URLs; path-like objects go to the local branch.
    if isinstance(path, str) and path.startswith(s3_scheme):
        # Text up to the first '/' after the scheme is the bucket, the rest is the key.
        s3_bucket, _, s3_key = path[len(s3_scheme):].partition('/')
        # Fail before serializing: an empty bucket or key can never be uploaded.
        if not s3_bucket or not s3_key:
            raise ValueError(
                f"S3 path must look like 's3://<bucket>/<key>', got {path!r}"
            )
        with BytesIO() as f:
            joblib.dump(file, f)
            # Rewind so the upload reads the buffer from its first byte.
            f.seek(0)
            boto3.client('s3').upload_fileobj(Bucket=s3_bucket, Key=s3_key, Fileobj=f)

    else:
        with open(path, 'wb') as f:
            joblib.dump(file, f)

# Store the pipeline under the key 'model.joblib' inside the bucket named in `path`.
write_joblib(file, path + '/model.joblib')