In [41]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [42]:
# Load the training dataset
train_data = pd.read_csv('hpd/train.csv')

# Load the testing dataset
test_data = pd.read_csv('hpd/test.csv')

In [43]:
# Display the first few rows of the training dataset
train_data.head()

# Check for missing values in the training dataset
train_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [44]:
# Drop columns with a large number of missing values or those not relevant for prediction in the training dataset
train_data.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'], inplace=True)

In [45]:
# Fill missing values with the mean for numerical columns and mode for categorical columns in the training dataset
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column].fillna(train_data[column].mode()[0], inplace=True)
    else:
        train_data[column].fillna(train_data[column].mean(), inplace=True)

# One-hot encode categorical variables in the training dataset
train_data = pd.get_dummies(train_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[column].fillna(train_data[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[column].fillna(train_data[column].mode()[0], inplace=True)


In [46]:
# Separate features (X_train) and target variable (y_train) in the training dataset
X_train = train_data.drop(columns=['SalePrice'])
y_train = train_data['SalePrice']

In [47]:
# Drop columns not relevant for prediction in the testing dataset
test_data.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'], inplace=True)

In [48]:
# Fill missing values with the mean for numerical columns and mode for categorical columns in the testing dataset
for column in test_data.columns:
    if test_data[column].dtype == 'object':
        test_data[column].fillna(test_data[column].mode()[0], inplace=True)
    else:
        test_data[column].fillna(test_data[column].mean(), inplace=True)
# One-hot encode categorical variables in the testing dataset
test_data = pd.get_dummies(test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[column].fillna(test_data[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[column].fillna(test_data[column].mode()[0], inplace=True)


In [49]:
# Initialize the linear regression model
model = LinearRegression()

In [50]:
# Train the model using the training dataset
model.fit(X_train, y_train)

In [51]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the testing dataset
test_data = pd.read_csv('hpd/test.csv')

# Print the columns of the testing dataset to check for 'SalePrice'
print(test_data.columns)

# Drop the 'SalePrice' column from the testing dataset if it exists
if 'SalePrice' in test_data.columns:
    test_data.drop(columns=['SalePrice'], inplace=True)

# Ensure the testing dataset has the same columns as the training dataset after one-hot encoding
test_data_aligned = pd.get_dummies(test_data).reindex(columns=X_train.columns, fill_value=0)

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data_aligned), columns=test_data_aligned.columns)

# Make predictions on the aligned testing dataset with imputed missing values
y_pred = model.predict(test_data_imputed)

# Create a DataFrame with the predicted SalePrice
predictions = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})

# Save the predictions to a CSV file
predictions.to_csv('hpd/predictions.csv', index=False)


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive