In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw training and test splits from the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
# Tag every row with the split it came from, then stack both frames
# row-wise (pd.concat's default axis) so they can be preprocessed together.
train['DataType'], test['DataType'] = 'train', 'test'

combined = pd.concat([train, test])


In [4]:
# Columns removed from the combined frame before modelling.
# NOTE(review): the reason each column is excluded is not recorded here — TODO document.
drop_cols = [
    'Id',
    'Alley',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'Utilities',
    'Condition2',
    'MiscVal',
    'MoSold',
    'YrSold',
]

In [5]:
# Remove the unwanted columns by rebinding `combined` instead of mutating it
# in place. errors='ignore' makes the cell idempotent: re-running it after the
# columns are already gone is a no-op rather than a KeyError.
combined = combined.drop(columns=drop_cols, errors='ignore')


In [6]:
# Preview the first five rows of the combined frame after the column drop.
combined.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,SaleType,SaleCondition,SalePrice,DataType
0,60,RL,65.0,8450,Pave,Reg,Lvl,Inside,Gtl,CollgCr,...,0,61,0,0,0,0,WD,Normal,208500.0,train
1,20,RL,80.0,9600,Pave,Reg,Lvl,FR2,Gtl,Veenker,...,298,0,0,0,0,0,WD,Normal,181500.0,train
2,60,RL,68.0,11250,Pave,IR1,Lvl,Inside,Gtl,CollgCr,...,0,42,0,0,0,0,WD,Normal,223500.0,train
3,70,RL,60.0,9550,Pave,IR1,Lvl,Corner,Gtl,Crawfor,...,0,35,272,0,0,0,WD,Abnorml,140000.0,train
4,60,RL,84.0,14260,Pave,IR1,Lvl,FR2,Gtl,NoRidge,...,192,84,0,0,0,0,WD,Normal,250000.0,train


In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Tag each row with its origin so the two splits can be separated again later
train['DataType'] = 'train'
test['DataType'] = 'test'

# Combine for preprocessing so both splits share the same encodings.
# The test file has no 'SalePrice' column, so the concat fills it with NaN
# for every test row.
combined = pd.concat([train, test], axis=0)

# Drop unnecessary columns (rebinding rather than mutating in place)
drop_cols = [
    'Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature',
    'Utilities', 'Condition2', 'MiscVal', 'MoSold', 'YrSold'
]
combined = combined.drop(columns=drop_cols)

# Separate numerical and categorical columns.
# 'SalePrice' is the target: it is excluded from the numeric feature list so
# the missing test-row targets are NOT replaced by the training mean.
num_cols = combined.select_dtypes(include=['float64', 'int64']).columns.drop('SalePrice')
cat_cols = combined.select_dtypes(include=['object']).columns

# Convert categorical to numerical using Label Encoding.
# One encoder per column is kept so that no column's mapping is lost when the
# next column is fitted.
encoders = {}
for col in cat_cols:
    encoders[col] = LabelEncoder()
    # astype(str) turns missing values into the literal label 'nan'
    combined[col] = encoders[col].fit_transform(combined[col].astype(str))

# Later cells call label_encoder.transform(['train'/'test']) to split the
# frame again, so bind that name explicitly to the 'DataType' encoder instead
# of relying on 'DataType' happening to be the last column encoded.
label_encoder = encoders['DataType']

# Impute missing values in the numeric features with the column mean
imputer = SimpleImputer(strategy='mean')
combined[num_cols] = imputer.fit_transform(combined[num_cols])

print("Categorical columns converted to numeric and missing values handled.")


Categorical columns converted to numeric and missing values handled.


In [13]:
# Recover the two splits from the combined frame and discard the marker column.
# NOTE(review): this assumes label_encoder's most recent fit was on the
# 'DataType' column — confirm against the preprocessing cell.
test_code, train_code = label_encoder.transform(['test', 'train'])

train_processed = combined.loc[combined['DataType'] == train_code].drop(columns=['DataType'])
test_processed = combined.loc[combined['DataType'] == test_code].drop(columns=['DataType'])


In [14]:
# Target vector first, then the feature matrix (everything except the target).
y_train = train_processed['SalePrice']
X_train = train_processed.drop(columns='SalePrice')


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline regressor with a fixed seed so repeated runs give the same forest.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)


In [16]:
# Build the test feature matrix with the same columns the model was trained on.
# The combined frame carries a 'SalePrice' column for the test rows as well
# (introduced by the concat), so it is dropped here; errors='ignore' keeps the
# cell working if that column is ever absent.
X_test = test_processed.drop(columns='SalePrice', errors='ignore')
predictions = model.predict(X=X_test)


In [17]:
# Pair each prediction with the Id read from the original test file and
# write the result without the DataFrame index.
submission = pd.DataFrame(
    data={'Id': test['Id'], 'SalePrice': predictions},
    columns=['Id', 'SalePrice'],
)
submission.to_csv(path_or_buf='submission.csv', index=False)


In [23]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# greater_is_better=False marks this as a loss, so the scorer reports the
# sign-flipped RMSE; the cross-validation loop multiplies by -1 to undo that.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Tag each row with its origin so the splits can be separated again later
train['DataType'] = 'train'
test['DataType'] = 'test'
test['SalePrice'] = np.nan  # Add dummy target to align columns

# Combine train and test so encoding and imputation are applied to both alike
combined = pd.concat([train, test], axis=0)

# Remember which rows are training rows BEFORE 'DataType' is label-encoded.
# Splitting on this mask avoids hard-coding the integer codes (0/1) that
# LabelEncoder happens to assign to 'test'/'train'. A plain NumPy array is
# used so the mask is applied positionally (the concatenated index contains
# duplicate labels).
is_train = (combined['DataType'] == 'train').to_numpy()

# Drop unnecessary columns (rebinding rather than mutating in place)
drop_cols = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature',
             'Utilities', 'Condition2', 'MiscVal', 'MoSold', 'YrSold']
combined = combined.drop(columns=drop_cols)

# Encode categorical features; astype(str) turns missing values into the
# literal label 'nan' so every column can be encoded without special-casing.
cat_cols = combined.select_dtypes(include=['object']).columns
for col in cat_cols:
    combined[col] = LabelEncoder().fit_transform(combined[col].astype(str))

# Impute missing values (excluding SalePrice)
num_cols = combined.select_dtypes(include=['float64', 'int64']).columns
num_cols = num_cols.drop('SalePrice')  # Don't touch the target
imputer = SimpleImputer(strategy='mean')
combined[num_cols] = imputer.fit_transform(combined[num_cols])

# Split combined data using the mask captured above
train_data = combined[is_train].copy()
test_data = combined[~is_train].drop(['SalePrice', 'DataType'], axis=1)

# Prepare final train set
X = train_data.drop(['SalePrice', 'DataType'], axis=1)
y = train_data['SalePrice']
y_log = np.log1p(y)  # log(1 + y); models below are evaluated on this scale

# Candidate regressors, keyed by the label printed in the results table.
# The three tree ensembles share the same size and seed.
ensemble_kwargs = {'n_estimators': 100, 'random_state': 42}

models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor(**ensemble_kwargs),
    'GradientBoosting': GradientBoostingRegressor(**ensemble_kwargs),
    'XGBoost': XGBRegressor(verbosity=0, **ensemble_kwargs),
}

# Cross-validation: 5 shuffled folds with a fixed seed, shared by every model
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("Model Performance (Cross-Validated RMSE):\n")
# The loop variable is deliberately NOT called `model`: that name is already
# bound to the fitted RandomForestRegressor from an earlier cell, and
# cross_val_score only fits clones, so reusing the name would leave `model`
# pointing at an unfitted estimator and break a re-run of the prediction cell.
for name, estimator in models.items():
    scores = cross_val_score(estimator, X, y_log, cv=kf, scoring=rmse_scorer)
    # rmse_scorer yields negated RMSE values, so flip the sign back
    mean_rmse = -1 * np.mean(scores)
    print(f"{name}: RMSE = {mean_rmse:.5f}")


Model Performance (Cross-Validated RMSE):

LinearRegression: RMSE = 0.15844
Ridge: RMSE = 0.15828
Lasso: RMSE = 0.19818
RandomForest: RMSE = 0.14457
GradientBoosting: RMSE = 0.13413
XGBoost: RMSE = 0.14432
