In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import pandas as pd
import xgboost as xgb
import math

data = pd.read_csv('train_split.csv')
X, y = data.drop(columns=['Unnamed: 0','Id','SalePrice']), data['SalePrice'].apply(math.log10)

X['CentralAir'] = X['CentralAir'].apply(lambda x: 1 if x=='Y' else 0)
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Add age features
X['HouseAge'] = 2023 - X['YearBuilt']
X['RemodelAge'] = 2023 - X['YearRemodAdd']

# List of categorical columns
categorical_columns = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
    'Electrical', 'Functional', 'GarageType', 'MiscFeature', 'SaleType',
    'SaleCondition',  'MSSubClass'
]

# List of ordinal columns (ranked data)
ordinal_columns = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'CentralAir'
]

# List of numerical columns
numerical_columns = [
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
]

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder

# Sample ordinal mapping (you can adjust based on the actual dataset)
ordinal_mapping = [
    ('ExterQual', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('ExterCond', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('BsmtQual', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('BsmtCond', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('BsmtExposure', ['No', 'Mn', 'Av', 'Gd', 'NaN']),
    ('BsmtFinType1', ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ', 'NaN']),
    ('HeatingQC', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('KitchenQual', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('FireplaceQu', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('GarageFinish', ['Unf', 'RFn', 'Fin', 'NaN']),
    ('GarageQual', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('GarageCond', ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('PavedDrive', ['N', 'P', 'Y', 'NaN']),
    ('PoolQC', ['Fa', 'TA', 'Gd', 'Ex', 'NaN']),
    ('Fence', ['MnWw', 'GdWo', 'MnPrv', 'GdPrv', 'NaN'])
]


# Create an OrdinalEncoder with predefined mapping
ordinal_encoder = OrdinalEncoder(categories=[m[1] for m in ordinal_mapping], handle_unknown='ignore')

# Define pipelines for each type of column
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[m[1] for m in ordinal_mapping]))
])

numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

binary_pipeline = Pipeline(steps=[
    ('label', LabelEncoder())  # CentralAir: Yes/No to 1/0
])

special_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Special case like MSSubClass
])

# Define the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_pipeline, categorical_columns),
    ('ord', ordinal_pipeline, [m[0] for m in ordinal_mapping]),
    ('num', numerical_pipeline, numerical_columns),
    # ('bin', 'passthrough', binary_columns),  # LabelEncoder will be applied separately for binary
])


# Fill NAs

X_train[ordinal_columns] = X_train[ordinal_columns].fillna('NaN')
X_train[categorical_columns] = X_train[categorical_columns].fillna('-1')
X_train[numerical_columns] = X_train[numerical_columns].fillna(X_train[numerical_columns].mean())

# Apply transformations
X_preprocessed = preprocessor.fit_transform(X_train)

reg = xgb.XGBRegressor()
# Fit the model, test sets are used for early stopping.
reg.fit(X_preprocessed, y_train)

# Fill NAs for X_val

X_val[ordinal_columns] = X_val[ordinal_columns].fillna('NaN')
X_val[categorical_columns] = X_val[categorical_columns].fillna('-1')
X_val[numerical_columns] = X_val[numerical_columns].fillna(X_train[numerical_columns].mean()) # Note that X_val shall be inputed with X_train's mean, since this X_val's own mean is not known on inference time

X_val_preprocessed = preprocessor.transform(X_val)

y_pred = reg.predict(X_val_preprocessed)


from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score

## Score

mae = mean_absolute_error(y_val.to_numpy(), y_pred)
mse = mean_squared_error(y_val.to_numpy(), y_pred)
rmse = root_mean_squared_error(y_val.to_numpy(), y_pred)
r2 = r2_score(y_val.to_numpy(), y_pred)

print('ERROR value:', mae, mse, rmse, r2)

### Inference

test_data = pd.read_csv('test_data.csv')

test_data = test_data.drop(columns=['Unnamed: 0','Id'])
test_data['CentralAir'] = test_data['CentralAir'].apply(lambda x: 1 if x=='Y' else 0)

# Fill NAs

test_data[ordinal_columns] = test_data[ordinal_columns].fillna('NaN')
test_data[categorical_columns] = test_data[categorical_columns].fillna('-1')
test_data[numerical_columns] = test_data[numerical_columns].fillna(X_train[numerical_columns].mean())

# Apply transformations
X_test_preprocessed = preprocessor.transform(test_data)

y_test_pred = reg.predict(X_test_preprocessed)
y_test_pred = 10**y_test_pred # Correcting the log training

Y_test_pred = pd.DataFrame(y_test_pred, columns=['SalePrice']).reset_index().rename(columns={'index': 'Id'})
Y_test_pred['Id'] = Y_test_pred['Id'] + 1100
Y_test_pred.to_csv('predictions.csv', index=False)

ValueError: could not convert string to float: 'RL'

In [12]:
# Feature selection using RandomForest
selector = SelectFromModel(RandomForestRegressor(n_estimators=100))
X_selected = selector.fit_transform(X, y)

print(X_selected)

ValueError: could not convert string to float: 'RL'

In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load your data into a DataFrame
# data = pd.read_csv('train_split.csv')

# Step 2: Calculate the correlation matrix
correlation_matrix = data[numerical_columns].corr()

# Step 3: Print the correlation matrix
print(correlation_matrix)

# Step 4 (Optional): Visualize the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

KeyError: "['HouseAge', 'RemodelAge'] not in index"

In [17]:
print(X_val_preprocessed)

[[ 0.          0.          0.         ... -0.08672377 -0.48380383
   0.11566043]
 [ 0.          0.          0.         ... -0.08672377  0.2629467
   0.11566043]
 [ 0.          0.          0.         ... -0.08672377  1.38307249
   0.86699789]
 ...
 [ 0.          0.          0.         ... -0.08672377  0.2629467
   0.86699789]
 [ 0.          0.          0.         ... -0.08672377 -0.48380383
   0.86699789]
 [ 0.          0.          0.         ... -0.08672377 -0.11042856
  -1.38701447]]
