In [1]:
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

set_config(display="diagram")

# Read the training CSV from the remote repository.
house = pd.read_csv(
    'https://raw.githubusercontent.com/icaromisquita/archives/main/train.csv'
)

# Target vector is SalePrice; the feature frame is everything except the
# Id column and the target itself.
y = house['SalePrice']
X = house.drop(columns=['Id', 'SalePrice'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# --- column groups ----------------------------------------------------------
# numeric cols: every column that is not stored as a string
num_cols = X_train.select_dtypes(exclude=['object']).columns

# ordinal cols: quality ratings, listed from worst to best
ord_cols = ['ExterQual','ExterCond','KitchenQual','FireplaceQu']
qualities = ["Po", "Fa", "TA", "Gd", "Ex"]
# "missing" is the value the constant-strategy imputer writes for NaN; it is
# ranked below every real rating.
na_qualities = ["missing"] + qualities

# nominal cols: the remaining string columns
nom_cols = (
    X_train
    .drop(columns=ord_cols)
    .select_dtypes(include=['object'])
    .columns)


# --- preprocessing ----------------------------------------------------------
# numeric pipeline: fill gaps from the nearest rows, then standardise
num_pipe = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())
])

# ordinal pipeline
# The imputer needs an explicit strategy: the default ('mean') cannot be
# applied to string columns. `fill_value` is only used when the search picks
# strategy='constant'.
# Every column gets the category list that includes "missing", so a NaN that
# shows up in any of these columns at predict time (after constant imputation)
# is still a known category instead of raising in the encoder.
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('encoder', OrdinalEncoder(categories=[na_qualities] * len(ord_cols)))
])

# nominal pipeline: categories unseen during fit are encoded as all-zeros
nom_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
        ('num', num_pipe, num_cols),
        ('nom', nom_pipe, nom_cols),
        ('ord', ord_pipe, ord_cols)
])

# --- model ------------------------------------------------------------------
# SalePrice is a continuous target, so the final estimator must be a regressor
# (a classifier would treat every distinct price as its own class).
# Seeds are fixed so that re-running the notebook gives the same model.
model_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('var_threshold', VarianceThreshold()),
    ('mod_feature_sel', SelectFromModel(DecisionTreeRegressor(random_state=0))),
    ('model', RandomForestRegressor(random_state=0))
])


# --- hyper-parameter search space --------------------------------------------
param_grid = {
    'preprocessor__num__scaler__with_mean': [True, False],
    'preprocessor__num__scaler__with_std': [True, False],
    "preprocessor__num__imputer__n_neighbors": list(range(5, 20, 5)),
    "preprocessor__num__imputer__weights": ["uniform", "distance"],

    'preprocessor__ord__imputer__strategy': ['most_frequent', 'constant'],
    'preprocessor__nom__imputer__strategy': ['most_frequent', 'constant'],
    'var_threshold__threshold': [0, 0.01, 0.02],
    # 'auto' is no longer accepted by recent scikit-learn tree regressors and
    # meant the same as None (use all features), so it is not listed.
    'mod_feature_sel__estimator__max_features': [None, 'sqrt', 'log2'],
    'mod_feature_sel__estimator__min_samples_leaf': [1, 2, 4],

    "model__n_estimators": list(range(100, 500, 50)),
    # regression criteria (gini / entropy / log_loss are classifier-only);
    # NOTE: absolute_error is noticeably slower to fit than squared_error
    "model__criterion": ["squared_error", "absolute_error"],
    "model__max_depth": list(range(4, 10, 2)),
    "model__min_samples_split": list(range(4, 10, 2)),
    "model__min_samples_leaf": list(range(2, 10, 2)),
}

# --- search + cross validation ------------------------------------------------
# The space above contains more than a million combinations, so an exhaustive
# grid search (x5 folds) cannot finish in practice. A fixed number of candidates
# is sampled instead; raise N_SEARCH_ITER for a more thorough search.
# The variable keeps the name `grid_search` because later cells call
# `grid_search.predict(...)`.
N_SEARCH_ITER = 50
grid_search = RandomizedSearchCV(
    model_pipe,
    param_distributions=param_grid,
    n_iter=N_SEARCH_ITER,
    cv=5,
    n_jobs=-1,
    random_state=0,
    verbose=0,
)
grid_search.fit(X_train, y_train)



In [None]:
# Predict on the rows the search was fitted on
predictions = grid_search.predict(X_train)

# Report each regression metric for the training split, one per line
for label, metric in (
    ("R2 score", r2_score),
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
):
    print(f"{label}: {metric(y_train, predictions)}")

In [None]:
# Predict on the held-out split created by train_test_split
predictions = grid_search.predict(X_test)

# Report each regression metric for the held-out split, one per line
for label, metric in (
    ("R2 score", r2_score),
    ("MSE", mean_squared_error),
    ("MAE", mean_absolute_error),
):
    print(f"{label}: {metric(y_test, predictions)}")

## Preparing the submission file to upload to the competition

In [None]:
# Load the competition test set. This read must run in this cell: without it
# `test` is undefined on a fresh kernel and the prediction below fails.
test = pd.read_csv("https://raw.githubusercontent.com/icaromisquita/archives/main/test.csv")

# Add the predicted price, keep only the two columns that are written out,
# and save the result as a CSV without the index.
(test
    .assign(SalePrice = grid_search.predict(test))
    .filter(['Id','SalePrice'])
    .to_csv('./Kaggle_submission1.csv', index=False)
    )

# Optional, Google Colab only: download the generated file.
#from google.colab import files
#files.download('Kaggle_submission1.csv')

In [None]:
# get predictions for the competition rows held in `test`
predictions = grid_search.predict(test)

# These rows cannot be scored against y_train: y_train holds the prices of the
# training split, i.e. different houses (and a different number of rows), so
# R2 / MSE / MAE computed that way would be meaningless or raise.
# NOTE(review): assumes the competition file ships without SalePrice labels -
# confirm; if labels are available, score against those instead.
# Show a summary of the predicted prices as a sanity check.
pd.Series(predictions, name='SalePrice').describe()