In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import mean_squared_error 
import xgboost as xgb

In [2]:
df = pd.read_csv('gemstone.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float64
 9   z        193573 non-null  float64
 10  price    193573 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 16.2+ MB


In [None]:
scaler = StandardScaler()
df["x_scaled"] = scaler.fit_transform(df["x"].values.reshape(-1, 1))
df['y_scaled'] = scaler.fit_transform(df['y'].values.reshape(-1, 1))
df['z_scaled'] = scaler.fit_transform(df['z'].values.reshape(-1, 1))

In [5]:
df["carat_mean_imputed"] = df["carat"].fillna(df["carat"].mean())

In [6]:
df["carat_with_clarity_mean"] = df["carat"].fillna(
    df.groupby("clarity")["carat"].transform("mean")    
)

In [7]:
num_bins = int(np.floor(np.log2(df.shape[0]) + 1))

df["carat_categories"] = pd.cut(df["carat"], num_bins, labels=False)

In [14]:
cut_label_map = {
    "Fair": 0,
    "Good": 1,
    "Ideal": 2,
    "Very Good": 3,
    "Premium": 4
}
df["cut"] = df["cut"].map(cut_label_map)

In [17]:
df['carat'] = df['carat'].astype(float)

In [19]:
df = pd.get_dummies(df, columns=["color","clarity"], drop_first=True)

In [25]:
X = df.drop(['price'],axis=1)
y = df[['price']]

X = scaler.fit_transform(X)
y = scaler.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train XGBoost model
model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [26]:
mean_squared_error(y_test, y_pred)

0.023233153653638847