In [None]:
#Try to compete on https://www.kaggle.com/c/home-data-for-ml-course

# Nov 3: 17877 score
# Nov 4: 14804 score (rank: 2512)
# Nov 5: 14749 score (rank: 1960)

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer,mean_absolute_error, mean_squared_error,r2_score

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Load the dataset into a pandas DataFrame
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate the features (X) and target variable (y)
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

# X_test = test_data.copy()

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X.columns if
                    X[cname].nunique() < 10 and
                    X[cname].dtype == "object"]

# # Select numerical columns
numerical_cols = [cname for cname in X.columns if
                  X[cname].dtype in ['int64', 'float64']]

# categorical_cols = ['MSZoning','Street']
# Keep selected columns only
my_cols = categorical_cols + numerical_cols

X = X[my_cols]


In [5]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [6]:
# Define a custom scoring function using mean_absolute_error
mae_scorer = make_scorer(mean_absolute_error)

In [7]:
def cross_validation(pipeline):
    # Use cross_validate to perform cross-validation and obtain the evaluation scores
    cv_results = cross_validate(pipeline, X, y, scoring=mae_scorer, cv=StratifiedKFold(n_splits=5))
    
    # Print the mean and standard deviation of the cross-validation scores
    print(f"Mean MAE: {np.mean(cv_results['test_score'])}")
    print(f"Standard Deviation of MAE: {np.std(cv_results['test_score'])}")

In [31]:
# Algo 1: Random Forest 

from sklearn.ensemble import RandomForestRegressor

# Define the regression model
model1 = RandomForestRegressor(n_estimators=500, random_state=0)

# Create the pipeline by combining the preprocessing and modeling steps
pipeline_RFR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model1)
])

cross_validation(pipeline_RFR)




Mean MAE: 17396.10108767123
Standard Deviation of MAE: 834.7824502643747


In [54]:
pipeline_RFR.fit(X,y)
preds1 = pipeline_RFR.predict(test_data)

In [9]:
# Algo 2: XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
model2= GradientBoostingRegressor(n_estimators=500, random_state=0)
my_pipeline_gbm = Pipeline(steps=[('preprocessor', preprocessor),
                               ('gbm', model2)
                               ])

cross_validation(my_pipeline_gbm)




Mean MAE: 15754.382848537376
Standard Deviation of MAE: 843.341079855781


In [10]:
my_pipeline_gbm.fit(X,y)
preds2 =my_pipeline_gbm.predict(test_data)


In [2]:
# Algo 3: XGboost

from xgboost import XGBRegressor

In [8]:
# Define model
model3 = XGBRegressor(n_estimators=1000, learning_rate=0.03, random_state=0)

my_pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('xgb', model3)
                                  ])

cross_validation(my_pipeline_xgb)




Mean MAE: 16719.483085402397
Standard Deviation of MAE: 1168.5578799968175


********************** Output submission file ********************

In [65]:
preds_final = (preds1 + preds2) / 2

# Save test predictions to file
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': preds_final})
output.to_csv('submission_11-5.csv',index=False)

In [11]:

# Save test predictions to file
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': preds2})
output.to_csv('submission_11-5_gb.csv',index=False)