In [10]:
import pandas as pd  

# Load training and test data
train = pd.read_csv("data/train.csv",index_col = 'Id')  
test = pd.read_csv("data/test.csv", index_col = 'Id')  

# Define target variable (y)
y = train["SalePrice"]

# Define features (X) - Exclude 'SalePrice'
X = train.drop(columns=["SalePrice"])
X_test = test.copy()  # Make a copy for consistency

# Display the first few rows
print(train.head())  

    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                    
1           60       RL         65.0     8450   Pave   NaN      Reg   
2           20       RL         80.0     9600   Pave   NaN      Reg   
3           60       RL         68.0    11250   Pave   NaN      IR1   
4           70       RL         60.0     9550   Pave   NaN      IR1   
5           60       RL         84.0    14260   Pave   NaN      IR1   

   LandContour Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature  \
Id                                  ...                                     
1          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
2          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   
3          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
4          Lvl    AllPub    Corner  ...        0    NaN   NaN         NaN   
5          Lvl    AllPub       FR2  ... 

In [11]:
print(train.isnull().sum())
test.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64


MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 79, dtype: int64

In [12]:
null = train.columns[train.isnull().any()]
null_v = train[null].isnull().sum()
null_v

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [13]:
from sklearn.impute import SimpleImputer

# Separate numerical and categorical features
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Imputer for numerical features (median)
num_imputer = SimpleImputer(strategy="median")
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# Imputer for categorical features (most frequent)
cat_imputer = SimpleImputer(strategy="most_frequent")
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Apply One-Hot Encoding to categorical features
X = pd.get_dummies(X, columns=cat_cols, dtype=int)
X_test = pd.get_dummies(X_test, columns=cat_cols, dtype=int)

# Align train and test data (handle missing dummy columns)
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Split the training data for validation
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_valid)

# Evaluate performance
mae = mean_absolute_error(y_valid, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

Mean Absolute Error: 17606.81


In [18]:
# Get predictions with test data
test_predictions = model.predict(X_test)

# Save test predictions 

output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': test_predictions})
output.to_csv('submission.csv', index=False)
output.shape

(1459, 2)