In [9]:
# Install lightgbm if not installed
# !pip install lightgbm

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb

# 1. Load data
train = pd.read_csv(r'C:\Users\HP\Downloads\train.csv')
test = pd.read_csv(r'C:\Users\HP\Downloads\test (1).csv')

# Save test IDs for submission file
test_ids = test['Id']

# 2. Target and features
y = train['SalePrice']
train.drop(['SalePrice', 'Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# 3. Combine train and test for consistent preprocessing
all_data = pd.concat([train, test], sort=False).reset_index(drop=True)

# 4. Fill missing values (simple but effective)
for col in all_data.columns:
    if all_data[col].dtype == 'object':
        all_data[col] = all_data[col].fillna('Missing')
    else:
        all_data[col] = all_data[col].fillna(all_data[col].median())

# 5. Feature engineering: Create new features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['Age'] = 2025 - all_data['YearBuilt']  # Use current year or dataset year
all_data['RemodAge'] = 2025 - all_data['YearRemodAdd']

# Drop columns that might be redundant or leak information if needed
# (Optional, you can try with/without)
# all_data.drop(['YearBuilt', 'YearRemodAdd'], axis=1, inplace=True)

# 6. Split back into train/test
X_train = all_data.iloc[:len(y), :]
X_test = all_data.iloc[len(y):, :]

# 7. Identify categorical and numerical features
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# 8. One-hot encode categorical variables using ColumnTransformer and Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'  # keep numerical features as is
)

# 9. Define LightGBM Regressor
model = lgb.LGBMRegressor(random_state=42)

# 10. Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
])

# 11. Cross-validation (5-fold) with RMSE scoring
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def rmse_cv(model):
    rmse = -cross_val_score(model, X_train, y, scoring="neg_root_mean_squared_error", cv=kf)
    return rmse

scores = rmse_cv(pipeline)
print(f"CV RMSE scores: {scores}")
print(f"Mean CV RMSE: {scores.mean()}")

# 12. Hyperparameter tuning with RandomizedSearchCV (basic example)
param_dist = {
    'regressor__n_estimators': [100, 300, 500, 1000],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__num_leaves': [31, 50, 100],
    'regressor__max_depth': [-1, 10, 20],
    'regressor__min_child_samples': [10, 20, 30],
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y)
print(f"Best RMSE: {-random_search.best_score_}")
print(f"Best params: {random_search.best_params_}")

# 13. Train final model on full train data with best parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y)

# 14. Predict on test data
predictions = best_model.predict(X_test)

# 15. Prepare submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")




ModuleNotFoundError: No module named 'lightgbm'

In [11]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   --------------------- ------------------ 0.8/1.5 MB 2.6 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 2.9 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.2 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [13]:
# Install lightgbm if not installed
# !pip install lightgbm

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb

# 1. Load data
train = pd.read_csv(r'C:\Users\HP\Downloads\train.csv')
test = pd.read_csv(r'C:\Users\HP\Downloads\test (1).csv')
# Save test IDs for submission file
test_ids = test['Id']

# 2. Target and features
y = train['SalePrice']
train.drop(['SalePrice', 'Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# 3. Combine train and test for consistent preprocessing
all_data = pd.concat([train, test], sort=False).reset_index(drop=True)

# 4. Fill missing values (simple but effective)
for col in all_data.columns:
    if all_data[col].dtype == 'object':
        all_data[col] = all_data[col].fillna('Missing')
    else:
        all_data[col] = all_data[col].fillna(all_data[col].median())

# 5. Feature engineering: Create new features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['Age'] = 2025 - all_data['YearBuilt']  # Use current year or dataset year
all_data['RemodAge'] = 2025 - all_data['YearRemodAdd']

# Drop columns that might be redundant or leak information if needed
# (Optional, you can try with/without)
# all_data.drop(['YearBuilt', 'YearRemodAdd'], axis=1, inplace=True)

# 6. Split back into train/test
X_train = all_data.iloc[:len(y), :]
X_test = all_data.iloc[len(y):, :]

# 7. Identify categorical and numerical features
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# 8. One-hot encode categorical variables using ColumnTransformer and Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'  # keep numerical features as is
)

# 9. Define LightGBM Regressor
model = lgb.LGBMRegressor(random_state=42)

# 10. Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
])

# 11. Cross-validation (5-fold) with RMSE scoring
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def rmse_cv(model):
    rmse = -cross_val_score(model, X_train, y, scoring="neg_root_mean_squared_error", cv=kf)
    return rmse

scores = rmse_cv(pipeline)
print(f"CV RMSE scores: {scores}")
print(f"Mean CV RMSE: {scores.mean()}")

# 12. Hyperparameter tuning with RandomizedSearchCV (basic example)
param_dist = {
    'regressor__n_estimators': [100, 300, 500, 1000],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__num_leaves': [31, 50, 100],
    'regressor__max_depth': [-1, 10, 20],
    'regressor__min_child_samples': [10, 20, 30],
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y)
print(f"Best RMSE: {-random_search.best_score_}")
print(f"Best params: {random_search.best_params_}")

# 13. Train final model on full train data with best parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y)

# 14. Predict on test data
predictions = best_model.predict(X_test)

# 15. Prepare submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3630
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 196
[LightGBM] [Info] Start training from score 181441.541952
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3631
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 199
[LightGBM] [Info] Start training from score 179651.292808
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

In [15]:
import os
print("Current working directory:", os.getcwd())
print("Full path to submission.csv:", os.path.abspath("submission.csv"))



Current working directory: C:\Users\HP
Full path to submission.csv: C:\Users\HP\submission.csv
