In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e8/sample_submission.csv
/kaggle/input/playground-series-s3e8/train.csv
/kaggle/input/playground-series-s3e8/test.csv


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from scipy.stats import uniform, randint

# Load the data
# Both competition files are read from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/playground-series-s3e8/train.csv") # Replace with your actual file path
test = pd.read_csv("/kaggle/input/playground-series-s3e8/test.csv")   # Replace with your actual file path

# Basic data exploration
print(train.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
train.info()
print(train.describe())
print(train.isnull().sum())  # Check for missing values

# Separate the prediction target from the predictor columns.
# 'id' and 'price' are removed from the feature frame; 'price' is the target.
y = train['price']
X = train.drop(columns=['id', 'price'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Partition the feature columns by dtype so each group gets its own preprocessing.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

# Numeric columns: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Non-numeric columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Gradient-boosted regressor with a fixed seed (any other regressor could be swapped in).
model = lgb.LGBMRegressor(random_state=42)

# Chain preprocessing and the model so both are fitted together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training split and score on the held-out validation split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

# Report the baseline error as RMSE (square root of the mean squared error).
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Initial RMSE: {rmse}")

# Feature Engineering Loop (Example: Add a squared feature)
# This part should be adapted based on your dataset
# Derive the squared carat column on both splits so they keep identical columns.
X_train = X_train.assign(carat_squared=X_train['carat'] ** 2)  # Replace 'carat' with an actual feature name
X_val = X_val.assign(carat_squared=X_val['carat'] ** 2)

# The new column is numeric, so re-derive the numeric column list from X_train.
numerical_features = list(X_train.select_dtypes(include=np.number).columns)

# Rebuild the preprocessor and pipeline around the extended feature list, then refit.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])
pipeline.fit(X_train, y_train)

# Score the refitted pipeline on the validation split.
y_pred = pipeline.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE with carat_squared: {rmse}")

# Exhaustive hyperparameter search over a small grid (3 x 3 x 3 = 27 candidates).
# The 'model__' prefix routes each parameter to the pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7],
}

# 3-fold cross-validation scored by negated MSE (scikit-learn maximizes scores).
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so flip the sign and take the root to report RMSE.
print("Best parameters (GridSearch):", grid_search.best_params_)
print("Best score (GridSearch):", np.sqrt(-grid_search.best_score_))

# Hyperparameter Optimization using RandomizedSearchCV
# scipy distributions are parameterized as follows:
#   randint(low, high)  -> integers in [low, high - 1]
#   uniform(loc, scale) -> floats in [loc, loc + scale]
# The learning-rate scale is therefore 0.09 so that samples stay within
# [0.01, 0.1], the same range the grid search covers; uniform(0.01, 0.1)
# would have sampled up to 0.11.
param_dist = {
    'model__n_estimators': randint(100, 500),
    'model__learning_rate': uniform(0.01, 0.09),
    'model__max_depth': randint(3, 10),
    'model__reg_alpha': uniform(0, 1),
    'model__reg_lambda': uniform(0, 1)
}

# Sample 10 candidates and evaluate each with 3-fold CV on negated MSE;
# random_state fixes the sampled candidates so the search is reproducible.
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist,
                                  n_iter=10, scoring='neg_mean_squared_error', cv=3,
                                  verbose=1, random_state=42)

random_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so flip the sign and take the root to report RMSE.
print("Best parameters (RandomSearch):", random_search.best_params_)
print("Best score (RandomSearch):", np.sqrt(-random_search.best_score_))

# Before training the final model, ensure X includes the new feature
# (the tuned pipelines were fitted on frames that contain 'carat_squared').
X['carat_squared'] = X['carat']**2

# Pick whichever search reached the better cross-validated score. Both searches
# use scoring='neg_mean_squared_error' with cv=3 on the same training split, so
# their best_score_ values are directly comparable (higher means lower MSE).
# On a tie, max() keeps the first candidate, i.e. the grid search.
best_search = max((grid_search, random_search), key=lambda search: search.best_score_)

# best_estimator_ is the full preprocessing + model pipeline already configured
# with the winning hyperparameters, so no new pipeline has to be assembled here.
best_model = best_search.best_estimator_

# Train the final model with the best parameters on the entire training dataset
best_model.fit(X, y)

# Make predictions on the test set
# Apply the same feature engineering that was applied to the training features.
test['carat_squared'] = test['carat']**2

# Feed the model exactly the columns it was trained on. 'id' is not part of X,
# so it stays out of the prediction input and is only used for the submission
# file. A KeyError here means the test set is missing a training feature.
X_test = test[X.columns]

# Use the best model directly without refitting: best_model is a fitted pipeline
# that already contains the preprocessing learned from the training data, so no
# separate test-time preprocessor is built or fitted.
test_predictions = best_model.predict(X_test)

# Create a submission file
submission = pd.DataFrame({'id': test['id'], 'price': test_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


   id  carat        cut color clarity  depth  table     x     y     z  price
0   0   1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55  13619
1   1   2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05  13387
2   2   0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50   2772
3   3   0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71    666
4   4   1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77  14453
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float6