In [None]:
import pandas as pd
import numpy as np


# Read the training CSV into the notebook-wide `df` frame used by the cells below.
df = pd.read_csv(filepath_or_buffer="/content/data/train.csv")

In [None]:
# Print a summary of the freshly loaded frame (columns, dtypes, non-null counts)
# so that columns with missing values can be spotted before feature selection.
df.info()

In [None]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see; an empty list means
# the runtime is CPU-only.
gpu_available = tf.config.list_physical_devices('GPU')
if not gpu_available:
    print("GPU is not available. Make sure you have selected a GPU runtime.")
else:
    print("GPU is available:", gpu_available)

In [None]:
# Target column plus the numeric predictors retained for modelling.
# NOTE(review): "seals" in the variable name looks like a typo for "sales";
# the name is kept as-is because other cells may refer to it.
seals_price_high_correlation_features = [
    "SalePrice",
    "1stFlrSF",
    "GarageArea",
    "GarageYrBlt",
    "GrLivArea",
    "OverallQual",
    "TotRmsAbvGrd",
    "TotalBsmtSF",
    "YearBuilt",
    "YearRemodAdd",
]

# Narrow the frame to just those columns, in the listed order. The explicit
# copy yields an independent frame, so later column assignments on `df` do
# not trigger chained-assignment warnings.
df = df[seals_price_high_correlation_features].copy()
df.info()

In [None]:
# Impute missing garage build years with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df["GarageYrBlt"] selection: that selection is a
# temporary Series, so mutating it in place emits a FutureWarning on recent
# pandas 2.x releases and leaves `df` unchanged once Copy-on-Write is enabled
# (the default from pandas 3.0).
df["GarageYrBlt"] = df["GarageYrBlt"].fillna(df["GarageYrBlt"].mean())

In [None]:
# Print the summary again after the GarageYrBlt fill in the previous cell so
# the non-null counts of the selected columns can be re-checked.
df.info()

In [None]:
# Import necessary libraries for model training and evaluation
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd # Ensure pandas is imported here as it's used for X and y

# Target is the sale price; every remaining column of `df` is a predictor.
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Two-step pipeline: standardize the features, then fit an XGBoost regressor.
# Seeding the regressor keeps repeated runs comparable.
xgb_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("xgb_model", xgb.XGBRegressor(random_state=123)),
    ]
)

# Candidate values for the search. Keys follow the "<step name>__<parameter>"
# convention so each value is routed to the "xgb_model" pipeline step.
xgb_params_grid = {
    "xgb_model__subsample": np.arange(0.05, 1, 0.05),
    "xgb_model__max_depth": np.arange(3, 20, 1),
    "xgb_model__colsample_bytree": np.arange(0.1, 1.05, 0.05),
}

# Randomized search: 25 sampled candidates, each scored with 5-fold
# cross-validation on negative mean squared error (closer to zero is better).
xgb_randomCV = RandomizedSearchCV(
    xgb_pipeline,
    xgb_params_grid,
    n_iter=25,
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=1,
    random_state=123,
)

# Run the search on the training split only; the held-out split is not touched here.
xgb_randomCV.fit(X_train, y_train)

# Report the winning hyperparameters and their cross-validated score.
print("Best parameters found by RandomizedSearchCV:", xgb_randomCV.best_params_, sep="\n")
print("Best cross-validation score (negative MSE):", xgb_randomCV.best_score_, sep="\n")

In [None]:
# Show again the hyperparameter combination selected by the randomized search.
print(xgb_randomCV.best_params_)

In [None]:
# Show again the best cross-validated score. The search was configured with
# scoring="neg_mean_squared_error", so this is a negative MSE and values
# closer to zero are better.
print(xgb_randomCV.best_score_)

In [None]:
# Import pandas for data manipulation
import pandas as pd

# Load the data that predictions will be generated for.
df_test = pd.read_csv("/content/test.csv")

# Use exactly the columns, in the same order, that the pipeline was fitted on,
# so the fitted estimator does not see a feature mismatch.
features_for_prediction = X_train.columns.tolist()

# Take an explicit copy of the selected columns: the imputation below then
# modifies this frame itself rather than a temporary derived from df_test,
# which avoids chained-assignment warnings and silent no-ops under
# pandas Copy-on-Write.
df_test_features = df_test[features_for_prediction].copy()

# Impute missing 'GarageYrBlt' values with the mean of the *training* split,
# not the mean of the test set: preprocessing statistics must come from the
# data the model was fitted on so train and test are transformed consistently.
# The result is assigned back instead of using fillna(inplace=True) on a
# column selection, which does not reliably update the parent frame.
garage_year_fill_value = X_train["GarageYrBlt"].mean()
df_test_features["GarageYrBlt"] = df_test_features["GarageYrBlt"].fillna(
    garage_year_fill_value
)

# The final test data prepared for prediction.
X_final_test = df_test_features

# best_estimator_ is the whole pipeline (scaler + XGBoost model) with the best
# sampled hyperparameters, so scaling is applied automatically before predicting.
y_pred_final = xgb_randomCV.best_estimator_.predict(X_final_test)

# Load the sample submission file to get the required output format.
submission = pd.read_csv("/content/sample_submission.csv")

# Overwrite the placeholder 'SalePrice' column with the predictions.
# NOTE(review): this assumes the sample submission rows are in the same order
# as test.csv -- confirm against the data files.
submission['SalePrice'] = y_pred_final

# Write the submission without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")