In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Categorical
import numpy as np

In [10]:
# Load the training and test splits from the local archive/ directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("archive/train.csv", "archive/test.csv")
)

In [11]:
def _subclass_label(value) -> str:
    """Render a single MSSubClass code as a categorical string label."""
    if pd.isna(value):
        return 'None'
    # A NaN elsewhere in the column forces float storage; strip the spurious
    # ".0" so that 20.0 and 20 end up as the same category label.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def prepare_house_data(dataset: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values and add engineered features to a house-prices frame.

    The input is never modified; all work happens on a copy. Every step is
    skipped silently when the columns it needs are not present.

    Steps, in order:
      1. Fill NaN with 'None' in columns where NaN means "feature absent".
      2. Fill NaN with 0 in area/count columns.
      3. Fill missing LotFrontage with the median of the row's Neighborhood,
         falling back to the overall median when no neighbourhood median exists.
         Existing LotFrontage values are never overwritten.
      4. Fill the remaining listed columns with the median (numeric) or the
         mode (anything else).
      5. Drop 'Utilities' when it has fewer than two distinct values.
      6. Add TotalSF, Age and RecentRemodel.
      7. Replace SalePrice with log1p(SalePrice) when present.
      8. Convert MSSubClass to string labels (NaN becomes 'None').

    Args:
        dataset: Frame to process.

    Returns:
        A new, processed DataFrame.

    Note:
        Step 7 is not idempotent: feeding the output back in applies log1p to
        SalePrice a second time.
    """
    prepared = dataset.copy()

    # --- Missing-value handling ---

    # NaN means "None" (the feature is absent).
    absent_feature_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                           'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                           'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                           'BsmtFinType2', 'MasVnrType']
    for col in absent_feature_cols:
        if col in prepared.columns:
            prepared[col] = prepared[col].fillna('None')

    # NaN means 0 (areas / counts).
    zero_fill_cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                      'TotalBsmtSF', 'GarageCars', 'GarageArea', 'BsmtFullBath', 'BsmtHalfBath']
    for col in zero_fill_cols:
        if col in prepared.columns:
            prepared[col] = prepared[col].fillna(0)

    # LotFrontage: impute from the median of the same Neighborhood.
    if 'LotFrontage' in prepared.columns and 'Neighborhood' in prepared.columns:
        frontage = prepared['LotFrontage']
        overall_median = frontage.median()
        neighborhood_median = prepared.groupby('Neighborhood')['LotFrontage'].transform('median')
        # fillna only touches the missing entries, so rows whose Neighborhood is
        # itself missing keep whatever LotFrontage they already had.
        frontage = frontage.fillna(neighborhood_median)
        # Fallback for rows with no usable neighbourhood median.
        if pd.notna(overall_median):
            frontage = frontage.fillna(overall_median)
        prepared['LotFrontage'] = frontage

    # Remaining NaNs: median for numeric columns, mode for everything else.
    for col in ('GarageYrBlt', 'Electrical', 'MSZoning', 'Utilities', 'Exterior1st',
                'Exterior2nd', 'KitchenQual', 'SaleType', 'Functional'):
        if col not in prepared.columns or not prepared[col].isnull().any():
            continue
        column = prepared[col]
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                # Entirely-NaN column: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]
        prepared[col] = column.fillna(fill_value)

    # Drop 'Utilities' when it carries no variance.
    # NOTE(review): this is decided per frame, so two frames processed
    # separately can come back with different column sets - confirm callers
    # align the columns before modelling.
    if 'Utilities' in prepared.columns and prepared['Utilities'].nunique() < 2:
        prepared = prepared.drop('Utilities', axis=1)

    # --- Feature engineering ---

    # Total usable area.
    if all(col in prepared.columns for col in ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']):
        prepared['TotalSF'] = prepared['TotalBsmtSF'] + prepared['1stFlrSF'] + prepared['2ndFlrSF']

    # House age at sale, and a flag for a remodel within 5 years of the sale.
    if all(col in prepared.columns for col in ['YearBuilt', 'YearRemodAdd', 'YrSold']):
        prepared['Age'] = prepared['YrSold'] - prepared['YearBuilt']
        years_since_remodel = prepared['YrSold'] - prepared['YearRemodAdd']
        prepared['RecentRemodel'] = (years_since_remodel <= 5).astype(int)

    # Log-transform the target variable (when present).
    if 'SalePrice' in prepared.columns:
        prepared['SalePrice'] = np.log1p(prepared['SalePrice'])

    # MSSubClass is a categorical code: store it as string labels.
    if 'MSSubClass' in prepared.columns:
        prepared['MSSubClass'] = prepared['MSSubClass'].map(_subclass_label)

    return prepared


In [None]:
# prepare_house_data works on a copy and returns it, leaving train_df/test_df
# untouched, so the returned frames are what must be inspected. Only the first
# rows are shown to keep the cell output readable.
print(prepare_house_data(train_df).head())
print(prepare_house_data(test_df).head())

In [12]:
# Raw feature frames: identifier (and target, for train) removed, no preprocessing.
X_train_raw = train_df.drop(columns=['SalePrice', 'Id'])
X_test_raw = test_df.drop(columns=['Id'])

# Preprocessed versions of the raw feature frames.
X_train_processed = prepare_house_data(X_train_raw)
X_test_processed = prepare_house_data(X_test_raw)

# Preprocess the complete frames so the target goes through the same pipeline
# (prepare_house_data replaces SalePrice with log1p(SalePrice)).
X_train_full = prepare_house_data(train_df)
X_test_full = prepare_house_data(test_df)

# Modelling inputs. y_train is therefore the log-transformed sale price.
y_train = X_train_full['SalePrice']
X_train = X_train_full.drop(columns=['SalePrice', 'Id'])

test_id = X_test_full['Id']
X_test = X_test_full.drop(columns=['Id'])

In [44]:
from skopt.space import Real, Categorical, Integer

# Search space for the random-forest hyper-parameters. The order of the
# dimensions is the order in which values arrive in the objective's `params`.
space = [
    Integer(low=100, high=500, name='n_estimators'),
    Integer(low=1, high=100, name='max_depth'),
    Categorical(categories=['gini', 'entropy', 'log_loss'], name='criterion'),
    Integer(low=1, high=100, name='min_samples_leaf'),
    Integer(low=2, high=100, name='max_leaf_nodes'),
    Categorical(categories=[True, False], name='bootstrap'),
]

def lr_predict(params):
    """Objective for the optimiser: negated mean 5-fold CV accuracy of a random forest.

    Reads the module-level ``X_train`` and ``y_train``.

    Args:
        params: One point of the search space, ordered as (n_estimators,
            max_depth, criterion, min_samples_leaf, max_leaf_nodes, bootstrap).

    Returns:
        float: ``-mean(accuracy)`` across the folds (lower is better). When the
        evaluation raises or yields a non-finite mean, the error is printed
        and 0.0 is returned instead.
    """
    n_estimators, max_depth, criterion, min_samples_leaf, max_leaf_nodes, bootstrap = params

    # Accuracy lies in [0, 1], so 0.0 is the worst value the negated score can
    # take. A finite penalty is used on failure because a non-finite objective
    # value cannot be used to fit the optimiser's Gaussian-process surrogate.
    worst_objective = 0.0

    # NOTE(review): in this file y_train is log1p(SalePrice), i.e. continuous.
    # A classifier scored with 'accuracy' is expected to reject such a target -
    # confirm whether a regressor with a regression metric is intended.
    try:
        forest = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            criterion=criterion,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            bootstrap=bootstrap,
            random_state=42
        )
        fold_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='accuracy')
    except Exception as e:
        # Best effort: report the failure and let the search continue.
        print(f"Error during cross-validation: {e}")
        return worst_objective

    mean_accuracy = fold_scores.mean()
    if not np.isfinite(mean_accuracy):
        # cross_val_score scores a fold whose fit failed as NaN by default,
        # which would otherwise propagate into the optimiser.
        print(f"Error during cross-validation: non-finite mean score {mean_accuracy}")
        return worst_objective

    return -float(mean_accuracy)

In [45]:

# Run the Bayesian optimisation of lr_predict over the search space.
result = gp_minimize(
    func=lr_predict,
    dimensions=space,
    n_calls=50,
    random_state=42
)

# Report the best point. Each value is labelled with the name of its dimension
# in `space`, so the labels cannot drift if the search space is reordered.
print("Melhores parâmetros encontrados:")
for dimension, best_value in zip(space, result.x):
    print(f"{dimension.name}: {best_value}")

Melhores parâmetros encontrados:
n_estimators: 131
max_depth: 47
criterion: entropy
min_samples_leaf: 4
max_leaf_nodes: 57
bootstrap: True


In [46]:
from sklearn.metrics import mean_squared_error # Import mean_squared_error

# Refit a forest on the full training set with the best point found by the
# search; result.x follows the dimension order of the search space.
best_params = dict(zip(
    ('n_estimators', 'max_depth', 'criterion',
     'min_samples_leaf', 'max_leaf_nodes', 'bootstrap'),
    result.x,
))
model_lr = RandomForestClassifier(random_state=42, **best_params)
model_lr.fit(X_train, y_train)

# Predictions for the held-out test features.
y_pred = model_lr.predict(X_test)

# NOTE(review): both figures below are computed on the training data only, and
# the "MSE" is the squared error of predicted labels scaled by 100 - confirm
# that this is the metric intended.
mse = mean_squared_error(y_train, model_lr.predict(X_train))
print("Mean Squared Error on training data:", (mse*100))
print("Training Accuracy:", model_lr.score(X_train, y_train)*100)

Mean Squared Error on training data: 13.468013468013467
Training Accuracy: 86.53198653198653
