<a href="https://colab.research.google.com/github/julioclerio/svm_regressor/blob/main/notebooks/ml_svm_regressao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor


In [2]:
def process_data(csv_path='/content/house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-prices CSV, split it, and build the preprocessing transformer.

    The target is the ``SalePrice`` column; ``Id`` is dropped from the features.
    Object-dtype columns are treated as categorical (missing values replaced by
    the constant ``'unknown'``, then one-hot encoded with
    ``handle_unknown='ignore'``). Every other column is treated as numerical
    (mean-imputed, then standardized).

    Args:
        csv_path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pre_processor)``. The feature
        frames are returned raw (untransformed); ``pre_processor`` is a
        ``ColumnTransformer`` fitted on ``X_train`` only.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column lists are derived from the training split.
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Per-type preprocessing pipelines
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformer applying each pipeline to its own column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only. The transformed train/test matrices were
    # computed and then discarded (the raw frames are what is returned), so
    # they are no longer built.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

In [3]:
def metricas_regressao(estimator):
    """Print R2, MAE and RMSE of ``estimator`` on the train and test splits.

    The data is not passed in: the function reads the notebook-level names
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so those must exist
    before it is called. ``estimator`` only needs a ``predict`` method.
    Nothing is returned; the metrics are written to stdout.
    """
    def _report(header, features, target):
        # Predict first, then print the header and the three metrics.
        predicted = estimator.predict(features)
        print(header)
        print(f"R2: {r2_score(target, predicted):.2f}")
        print(f"MAE: {mean_absolute_error(target, predicted):.2f}")
        # RMSE is the square root of the mean squared error.
        print(f"RMSE: {np.sqrt(mean_squared_error(target, predicted)):.2f}")

    _report("\nMétricas de avaliação de treino", X_train, y_train)
    _report("\nMétricas de avaliação de teste", X_test, y_test)

In [4]:
# Build the train/test splits and the preprocessing transformer
X_train, X_test, y_train, y_test, pre_processador = process_data()

# Chain the preprocessing with an SVR left at its default hyperparameters
pipe_svr = Pipeline(steps=[
    ('pre_process', pre_processador),
    ("svr", SVR()),
])

pipe_svr.fit(X_train, y_train)

In [5]:
# Evaluate the fitted SVR pipeline: prints R2, MAE and RMSE for the train and test splits
metricas_regressao(pipe_svr)


Métricas de avaliação de treino
R2: -0.04
MAE: 54871.67
RMSE: 79241.56

Métricas de avaliação de teste
R2: -0.03
MAE: 57003.06
RMSE: 84837.37


In [6]:
# NOTE(review): process_data is also defined in earlier cells of this notebook;
# this definition shadows them. Consider keeping a single definition.
def process_data(csv_path='/content/house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-prices CSV, split it, and build the preprocessing transformer.

    The target is the ``SalePrice`` column; ``Id`` is dropped from the features.
    Object-dtype columns are treated as categorical (missing values replaced by
    the constant ``'unknown'``, then one-hot encoded with
    ``handle_unknown='ignore'``). Every other column is treated as numerical
    (mean-imputed, then standardized).

    Args:
        csv_path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pre_processor)``. The feature
        frames are returned raw (untransformed); ``pre_processor`` is a
        ``ColumnTransformer`` fitted on ``X_train`` only.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column lists are derived from the training split.
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Per-type preprocessing pipelines
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformer applying each pipeline to its own column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only. The transformed train/test matrices were
    # computed and then discarded (the raw frames are what is returned), so
    # they are no longer built.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

# Pipeline: preprocessing followed by an SVR
pipe_svr = Pipeline(steps=[
    ('pre_process', pre_processor),
    ("svr", SVR()),
])

# NOTE(review): this preliminary fit looks redundant, since the grid search
# below fits the estimator again for every candidate. Confirm nothing relies
# on pipe_svr itself being fitted before removing it.
pipe_svr.fit(X_train, y_train)

# Candidate hyperparameters; the 'svr__' prefix targets the pipeline's "svr" step
param_grid = {
    'svr__kernel': ['linear', 'poly', 'rbf'],
    'svr__C': [0.1, 1, 10],
    'svr__gamma': [0.1, 1, 'auto'],
    'svr__epsilon': [0.1, 0.2, 0.3],
}

# 3-fold grid search using every available core
grid_search = GridSearchCV(pipe_svr, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Score of the selected model on the held-out test split
test_score = best_model.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params}")
print(f"Score no conjunto de teste: {test_score}")

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Report R2 / MAE / RMSE for each split, train first
for _header, _y_true, _y_pred in (
    ("Resultados do conjunto de treino:", y_train, y_pred_train),
    ("\nResultados do conjunto de teste:", y_test, y_pred_test),
):
    print(_header)
    print(f"R2: {r2_score(_y_true, _y_pred):.2f}")
    print(f"MAE: {mean_absolute_error(_y_true, _y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(_y_true, _y_pred)):.2f}")


Melhores Parâmetros: {'svr__C': 1, 'svr__epsilon': 0.3, 'svr__gamma': 1, 'svr__kernel': 'poly'}
Score no conjunto de teste: 0.8538014169060849
Resultados do conjunto de treino:
R2: 1.00
MAE: 354.54
RMSE: 4366.59

Resultados do conjunto de teste:
R2: 0.85
MAE: 16681.52
RMSE: 31940.37


ENCONTRAR MODELO QUE TRAGA MELHOR RESULTADO


In [22]:
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score , classification_report, ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


In [None]:
# `df` only exists as a local variable inside process_data, so the dataset is
# loaded here to give this cell a frame to work on.
df = pd.read_csv('/content/house_prices.csv')

# Correlate numeric columns only: the preprocessing in this notebook handles
# object-dtype columns, which corr() cannot correlate.
matriz_correlacao = df.corr(numeric_only=True)
matriz_correlacao

In [None]:
# Keep only the coefficients above 0.8; every other cell is shown as NaN
matriz_correlacao.where(matriz_correlacao > 0.8)

RANDOM FOREST REGRESSOR

In [27]:
# NOTE(review): process_data is also defined in earlier cells of this notebook;
# this definition shadows them. Consider keeping a single definition.
def process_data(csv_path='/content/house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-prices CSV, split it, and build the preprocessing transformer.

    The target is the ``SalePrice`` column; ``Id`` is dropped from the features.
    Object-dtype columns are treated as categorical (missing values replaced by
    the constant ``'unknown'``, then one-hot encoded with
    ``handle_unknown='ignore'``). Every other column is treated as numerical
    (mean-imputed, then standardized).

    Args:
        csv_path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pre_processor)``. The feature
        frames are returned raw (untransformed); ``pre_processor`` is a
        ``ColumnTransformer`` fitted on ``X_train`` only.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column lists are derived from the training split.
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Per-type preprocessing pipelines
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformer applying each pipeline to its own column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only. The transformed train/test matrices were
    # computed and then discarded (the raw frames are what is returned), so
    # they are no longer built.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

rf = RandomForestRegressor(random_state=42)

# Pipeline: preprocessing followed by the random-forest regressor
pipe_rf = Pipeline(steps=[
    ('pre_process', pre_processor),
    ("rf", rf),
])

# Candidate hyperparameters; the 'rf__' prefix targets the pipeline's "rf" step
parametros_rf = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 5, 10, 15],
}

# 5-fold grid search ranked by negated MSE, using every available core
grid_search_rf = GridSearchCV(
    pipe_rf,
    parametros_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them
best_params = grid_search_rf.best_params_
best_model = grid_search_rf.best_estimator_

# Score of the selected model on the held-out test split
test_score = best_model.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params}")
print(f"Score no conjunto de teste: {test_score}")

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Report R2 / MAE / RMSE for each split, train first
for _header, _y_true, _y_pred in (
    ("Resultados do conjunto de treino:", y_train, y_pred_train),
    ("\nResultados do conjunto de teste:", y_test, y_pred_test),
):
    print(_header)
    print(f"R2: {r2_score(_y_true, _y_pred):.2f}")
    print(f"MAE: {mean_absolute_error(_y_true, _y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(_y_true, _y_pred)):.2f}")

Melhores Parâmetros: {'rf__max_depth': 15, 'rf__n_estimators': 300}
Score no conjunto de teste: 0.8959630609050895
Resultados do conjunto de treino:
R2: 0.98
MAE: 6791.89
RMSE: 11881.16

Resultados do conjunto de teste:
R2: 0.90
MAE: 16785.19
RMSE: 26944.01


DECISION TREE


In [28]:
from sklearn.tree import DecisionTreeRegressor

# NOTE(review): process_data is also defined in earlier cells of this notebook;
# this definition shadows them. Consider keeping a single definition.
def process_data(csv_path='/content/house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-prices CSV, split it, and build the preprocessing transformer.

    The target is the ``SalePrice`` column; ``Id`` is dropped from the features.
    Object-dtype columns are treated as categorical (missing values replaced by
    the constant ``'unknown'``, then one-hot encoded with
    ``handle_unknown='ignore'``). Every other column is treated as numerical
    (mean-imputed, then standardized).

    Args:
        csv_path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pre_processor)``. The feature
        frames are returned raw (untransformed); ``pre_processor`` is a
        ``ColumnTransformer`` fitted on ``X_train`` only.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column lists are derived from the training split.
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Per-type preprocessing pipelines
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformer applying each pipeline to its own column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only. The transformed train/test matrices were
    # computed and then discarded (the raw frames are what is returned), so
    # they are no longer built.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

dt = DecisionTreeRegressor(random_state=42)

# Pipeline: preprocessing followed by the decision-tree regressor
pipe_dt = Pipeline(steps=[
    ('pre_process', pre_processor),
    ("dt", dt),
])

# Candidate hyperparameters; the 'dt__' prefix targets the pipeline's "dt" step
parametros_dt = {
    'dt__max_depth': [None, 5, 10, 15],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
}

# 5-fold grid search ranked by negated MSE, using every available core
grid_search_dt = GridSearchCV(
    pipe_dt,
    parametros_dt,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them
best_params = grid_search_dt.best_params_
best_model = grid_search_dt.best_estimator_

# Score of the selected model on the held-out test split
test_score = best_model.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params}")
print(f"Score no conjunto de teste: {test_score}")

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Report R2 / MAE / RMSE for each split, train first
for _header, _y_true, _y_pred in (
    ("Resultados do conjunto de treino:", y_train, y_pred_train),
    ("\nResultados do conjunto de teste:", y_test, y_pred_test),
):
    print(_header)
    print(f"R2: {r2_score(_y_true, _y_pred):.2f}")
    print(f"MAE: {mean_absolute_error(_y_true, _y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(_y_true, _y_pred)):.2f}")

Melhores Parâmetros: {'dt__max_depth': 5, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 5}
Score no conjunto de teste: 0.7743361226474328
Resultados do conjunto de treino:
R2: 0.85
MAE: 22303.28
RMSE: 30399.22

Resultados do conjunto de teste:
R2: 0.77
MAE: 27370.83
RMSE: 39682.54


LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# NOTE(review): process_data is also defined in earlier cells of this notebook;
# this definition shadows them. Consider keeping a single definition.
def process_data(csv_path='/content/house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-prices CSV, split it, and build the preprocessing transformer.

    The target is the ``SalePrice`` column; ``Id`` is dropped from the features.
    Object-dtype columns are treated as categorical (missing values replaced by
    the constant ``'unknown'``, then one-hot encoded with
    ``handle_unknown='ignore'``). Every other column is treated as numerical
    (mean-imputed, then standardized).

    Args:
        csv_path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pre_processor)``. The feature
        frames are returned raw (untransformed); ``pre_processor`` is a
        ``ColumnTransformer`` fitted on ``X_train`` only.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column lists are derived from the training split.
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Per-type preprocessing pipelines
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformer applying each pipeline to its own column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only. The transformed train/test matrices were
    # computed and then discarded (the raw frames are what is returned), so
    # they are no longer built.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

# NOTE(review): LogisticRegression is a classifier, so each distinct SalePrice
# value is treated as a class label here. A regressor is probably what was
# intended for this target; confirm before relying on these metrics.
log_reg = LogisticRegression()

# Pipeline: preprocessing followed by the logistic-regression estimator
pipe_log_reg = Pipeline([('pre_process', pre_processor),
                    ("log_reg", log_reg)])


# NOTE(review): the 'l1' penalty is documented as unsupported by
# LogisticRegression's default solver, so those candidates are expected to fail
# to fit. Confirm the intended solver.
param_grid_log_reg = {
    'log_reg__C': [0.001, 0.01, 0.1, 1],  # Regularization parameter
    'log_reg__penalty': ['l1', 'l2'],  # Penalty (L1 or L2)
    'log_reg__max_iter': [100, 200, 300],  # Maximum iterations
}

# Fix: the grid was passed under an undefined name (`parametros_log_reg`),
# which raised NameError; use the dictionary defined just above.
grid_search_log_reg = GridSearchCV(estimator = pipe_log_reg,
                           param_grid = param_grid_log_reg,
                           cv = 5,
                           scoring = 'neg_mean_squared_error',
                           n_jobs = -1)


# Run the grid search
grid_search_log_reg.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them
best_params_log_reg = grid_search_log_reg.best_params_
best_model_log_reg = grid_search_log_reg.best_estimator_

# Score of the selected model on the held-out test split. It was computed but
# never shown; it is printed here like in the other model cells.
test_score = best_model_log_reg.score(X_test, y_test)
print(f"Melhores Parâmetros: {best_params_log_reg}")
print(f"Score no conjunto de teste: {test_score}")

# Predictions of the selected model on both splits
y_pred_train_log_reg = best_model_log_reg.predict(X_train)
y_pred_test_log_reg = best_model_log_reg.predict(X_test)

# Regression-style metrics on the predicted values
mse_train_log_reg = mean_squared_error(y_train, y_pred_train_log_reg)
r2_train_log_reg = r2_score(y_train, y_pred_train_log_reg)
mae_train_log_reg = mean_absolute_error(y_train, y_pred_train_log_reg)

mse_test_log_reg = mean_squared_error(y_test, y_pred_test_log_reg)
r2_test_log_reg = r2_score(y_test, y_pred_test_log_reg)
mae_test_log_reg = mean_absolute_error(y_test, y_pred_test_log_reg)

print("Métricas para o conjunto de treino:")
print(f"MSE: {mse_train_log_reg:.4f}")
print(f"R²: {r2_train_log_reg:.4f}")
print(f"MAE: {mae_train_log_reg:.4f}")

print("\nMétricas para o conjunto de teste:")
print(f"MSE: {mse_test_log_reg:.4f}")
print(f"R²: {r2_test_log_reg:.4f}")
print(f"MAE: {mae_test_log_reg:.4f}")



KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): process_data is also defined in earlier cells of this notebook;
# this definition shadows them. Consider keeping a single definition.
def process_data(csv_path='/content/house_prices.csv', test_size=0.3, random_state=42):
    """Load the house-prices CSV, split it, and build the preprocessing transformer.

    The target is the ``SalePrice`` column; ``Id`` is dropped from the features.
    Object-dtype columns are treated as categorical (missing values replaced by
    the constant ``'unknown'``, then one-hot encoded with
    ``handle_unknown='ignore'``). Every other column is treated as numerical
    (mean-imputed, then standardized).

    Args:
        csv_path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pre_processor)``. The feature
        frames are returned raw (untransformed); ``pre_processor`` is a
        ``ColumnTransformer`` fitted on ``X_train`` only.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["SalePrice", "Id"])
    y = df["SalePrice"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Column lists are derived from the training split.
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

    # Per-type preprocessing pipelines
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column transformer applying each pipeline to its own column group
    pre_processor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training split only. The transformed train/test matrices were
    # computed and then discarded (the raw frames are what is returned), so
    # they are no longer built.
    pre_processor.fit(X_train)

    return X_train, X_test, y_train, y_test, pre_processor

X_train, X_test, y_train, y_test, pre_processor = process_data()

knn = KNeighborsRegressor()

# Pipeline: preprocessing followed by the k-nearest-neighbours regressor
pipe_knn = Pipeline(steps=[
    ('pre_process', pre_processor),
    ("knn", knn),
])

# Candidate hyperparameters; the 'knn__' prefix targets the pipeline's "knn" step
parametros_knn = {
    'knn__n_neighbors': [3, 5, 7],            # number of neighbours
    'knn__weights': ['uniform', 'distance'],  # neighbour weighting scheme
    'knn__p': [1, 2],                         # distance power: 1 = Manhattan, 2 = Euclidean
}

# 5-fold grid search ranked by negated MSE, using every available core
grid_search_knn = GridSearchCV(
    pipe_knn,
    parametros_knn,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_knn.fit(X_train, y_train)

# Best hyperparameters and the pipeline refitted with them
best_params_knn = grid_search_knn.best_params_
best_model_knn = grid_search_knn.best_estimator_

# Predictions of the selected model on both splits
y_pred_train = best_model_knn.predict(X_train)
y_pred_test = best_model_knn.predict(X_test)

# MSE, R2 and MAE for each split, computed in that order
mse_train_knn_reg, r2_train_knn_reg, mae_train_knn_reg = (
    metric(y_train, y_pred_train)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)
mse_test_knn_reg, r2_test_knn_reg, mae_test_knn_reg = (
    metric(y_test, y_pred_test)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

print(
    "Métricas para o conjunto de treino:",
    f"MSE: {mse_train_knn_reg:.4f}",
    f"R²: {r2_train_knn_reg:.4f}",
    f"MAE: {mae_train_knn_reg:.4f}",
    sep="\n",
)

print(
    "\nMétricas para o conjunto de teste:",
    f"MSE: {mse_test_knn_reg:.4f}",
    f"R²: {r2_test_knn_reg:.4f}",
    f"MAE: {mae_test_knn_reg:.4f}",
    sep="\n",
)