In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the raw diamonds table from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='diamonds.csv')

In [3]:
# Display basic information: shape, a preview, column dtypes and summary stats
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()
print("\nStatistical summary:")
print(df.describe())

Dataset shape: (53940, 11)

First few rows:
   Unnamed: 0  carat      cut color clarity  depth  table  price     x     y  \
0           1   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98   
1           2   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84   
2           3   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07   
3           4   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23   
4           5   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35   

      z  
0  2.43  
1  2.31  
2  2.31  
3  2.63  
4  2.75  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5  

In [4]:
# Per-column count of missing (NaN) entries in the raw frame
print("\nMissing values:", df.isna().sum(), sep="\n")


Missing values:
Unnamed: 0    0
carat         0
cut           0
color         0
clarity       0
depth         0
table         0
price         0
x             0
y             0
z             0
dtype: int64


In [5]:
# Discard the leftover 'Unnamed: 0' index column when the CSV contains it
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

In [6]:
# Columns kept for modelling: nine predictors followed by the target ('price').
selected_features = [
    'carat', 'cut', 'color', 'clarity',
    'depth', 'table',
    'x', 'y', 'z',
    'price',
]
# Work on an independent copy so later in-place edits never touch `df`.
diamond_data = df.loc[:, selected_features].copy()

In [7]:
# Handle missing values or invalid entries
# Replace any '?' placeholder with NaN so it is treated as missing downstream.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
# (referencing it there raises AttributeError).
col_names = diamond_data.columns
for c in col_names:
    diamond_data[c] = diamond_data[c].replace("?", np.nan)

In [8]:
# Fill missing values: median for numeric columns, mode for everything else.
# Branching on "is numeric" rather than `dtype == 'object'` keeps text columns
# stored with a dedicated string/category dtype out of the median branch,
# where computing a median would fail.
for col in diamond_data.columns:
    if pd.api.types.is_numeric_dtype(diamond_data[col]):
        diamond_data[col] = diamond_data[col].fillna(diamond_data[col].median())
    else:
        # mode() returns a Series; its first entry is the most frequent value.
        diamond_data[col] = diamond_data[col].fillna(diamond_data[col].mode()[0])

print("\nMissing values after treatment:")
print(diamond_data.isnull().sum())


Missing values after treatment:
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64


In [9]:
# Integer-encode the categorical columns and remember each label -> code mapping.
# LabelEncoder numbers the classes in sorted (alphabetical) order, so the codes
# do not follow the grades' quality ranking.
# NOTE: this cell overwrites the columns in place; re-running it would encode
# the already-encoded integers and replace the stored mappings.
cat_col = ['cut', 'color', 'clarity']
labelEncoder = preprocessing.LabelEncoder()
mapping_dict = {}

for col in cat_col:
    encoded_values = labelEncoder.fit_transform(diamond_data[col])
    known_labels = labelEncoder.classes_
    # Capture the mapping before the shared encoder is refit on the next column.
    mapping_dict[col] = dict(zip(known_labels, labelEncoder.transform(known_labels)))
    diamond_data[col] = encoded_values

print("\nEncoding mappings:")
print(mapping_dict)


Encoding mappings:
{'cut': {'Fair': 0, 'Good': 1, 'Ideal': 2, 'Premium': 3, 'Very Good': 4}, 'color': {'D': 0, 'E': 1, 'F': 2, 'G': 3, 'H': 4, 'I': 5, 'J': 6}, 'clarity': {'I1': 0, 'IF': 1, 'SI1': 2, 'SI2': 3, 'VS1': 4, 'VS2': 5, 'VVS1': 6, 'VVS2': 7}}


In [10]:
# Persist the label -> code mappings so the Flask app can encode its inputs the same way
with open('label_mappings.pkl', mode='wb') as mapping_file:
    pickle.dump(mapping_dict, mapping_file)

In [11]:
# Separate predictors from the target column
X = diamond_data.drop(columns=['price'])  # features
y = diamond_data.loc[:, 'price']  # target

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 37758
Test set size: 16182


In [12]:
# ==========================================
# Model 1: Linear Regression
# ==========================================
# LinearRegression is imported in the notebook's top import cell.

print("\n" + "="*50)
print("LINEAR REGRESSION")
print("="*50)

LRregressor = LinearRegression()
LRregressor.fit(X_train, y_train)

# Calculate scores (.score() on a regressor returns R²)
lr_train_score = LRregressor.score(X_train, y_train)
lr_test_score = LRregressor.score(X_test, y_test)

print(f"Train R² score: {lr_train_score:.4f}")
print(f"Test R² score: {lr_test_score:.4f}")

# Make predictions on the held-out set and measure the error in price units
ypred_LR = LRregressor.predict(X_test)
lr_rmse = np.sqrt(mean_squared_error(y_test, ypred_LR))
lr_mae = mean_absolute_error(y_test, ypred_LR)

print(f"RMSE: {lr_rmse:.2f}")
print(f"MAE: {lr_mae:.2f}")


LINEAR REGRESSION
Train R¬≤ score: 0.8845
Test R¬≤ score: 0.8864
RMSE: 1331.26
MAE: 856.41


In [13]:
# ==========================================
# Model 2: Decision Tree Regressor
# ==========================================
# DecisionTreeRegressor is imported in the notebook's top import cell.

print("\n" + "="*50)
print("DECISION TREE REGRESSOR")
print("="*50)

# Depth is capped at 10; the seed makes the fitted tree reproducible.
DTregressor = DecisionTreeRegressor(max_depth=10, random_state=42)
DTregressor.fit(X_train, y_train)

# Calculate scores (.score() on a regressor returns R²)
dt_train_score = DTregressor.score(X_train, y_train)
dt_test_score = DTregressor.score(X_test, y_test)

print(f"Train R² score: {dt_train_score:.4f}")
print(f"Test R² score: {dt_test_score:.4f}")

# Make predictions on the held-out set and measure the error in price units
ypred_DTr = DTregressor.predict(X_test)
dt_rmse = np.sqrt(mean_squared_error(y_test, ypred_DTr))
dt_mae = mean_absolute_error(y_test, ypred_DTr)

print(f"RMSE: {dt_rmse:.2f}")
print(f"MAE: {dt_mae:.2f}")


DECISION TREE REGRESSOR
Train R¬≤ score: 0.9812
Test R¬≤ score: 0.9736
RMSE: 641.51
MAE: 344.23


In [14]:
# ==========================================
# Model 3: Random Forest Regressor
# ==========================================
# RandomForestRegressor is imported in the notebook's top import cell.

print("\n" + "="*50)
print("RANDOM FOREST REGRESSOR")
print("="*50)

# 100 trees of depth <= 15; n_jobs=-1 trains on all available CPU cores.
RFregressor = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
RFregressor.fit(X_train, y_train)

# Calculate scores (.score() on a regressor returns R²)
rf_train_score = RFregressor.score(X_train, y_train)
rf_test_score = RFregressor.score(X_test, y_test)

print(f"Train R² score: {rf_train_score:.4f}")
print(f"Test R² score: {rf_test_score:.4f}")
print(f"Number of features: {RFregressor.n_features_in_}")
# Importances are listed in the same order as the columns of X_train.
print(f"Feature importances: {RFregressor.feature_importances_}")

# Make predictions on the held-out set and measure the error in price units
ypred_RFr = RFregressor.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, ypred_RFr))
rf_mae = mean_absolute_error(y_test, ypred_RFr)

print(f"RMSE: {rf_rmse:.2f}")
print(f"MAE: {rf_mae:.2f}")


RANDOM FOREST REGRESSOR
Train R¬≤ score: 0.9944
Test R¬≤ score: 0.9814
Number of features: 9
Feature importances: [0.62565245 0.00120135 0.02848111 0.06547832 0.00258115 0.0019082
 0.00642964 0.26327585 0.00499193]
RMSE: 538.36
MAE: 271.67


In [15]:
# ==========================================
# Model 4: Gradient Boosting Regressor
# ==========================================
# GradientBoostingRegressor is imported in the notebook's top import cell.

print("\n" + "="*50)
print("GRADIENT BOOSTING REGRESSOR")
print("="*50)

# 100 boosting stages of depth-5 trees; the seed makes the fit reproducible.
GBregressor = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
GBregressor.fit(X_train, y_train)

# Calculate scores (.score() on a regressor returns R²)
gb_train_score = GBregressor.score(X_train, y_train)
gb_test_score = GBregressor.score(X_test, y_test)

print(f"Train R² score: {gb_train_score:.4f}")
print(f"Test R² score: {gb_test_score:.4f}")

# Make predictions on the held-out set and measure the error in price units
ypred_GBr = GBregressor.predict(X_test)
gb_rmse = np.sqrt(mean_squared_error(y_test, ypred_GBr))
gb_mae = mean_absolute_error(y_test, ypred_GBr)

print(f"RMSE: {gb_rmse:.2f}")
print(f"MAE: {gb_mae:.2f}")


GRADIENT BOOSTING REGRESSOR
Train R¬≤ score: 0.9850
Test R¬≤ score: 0.9810
RMSE: 544.47
MAE: 288.10


In [16]:
# ==========================================
# Model 5: Support Vector Regressor
# ==========================================
# SVR is imported in the notebook's top import cell.

print("\n" + "="*50)
print("SUPPORT VECTOR REGRESSOR")
print("="*50)

# NOTE(review): the features are passed to SVR without any scaling step in this
# notebook; RBF kernels are usually sensitive to feature scale — consider
# standardizing the inputs before fitting.
SVMregressor = SVR(kernel='rbf', C=1000, gamma=0.1)
SVMregressor.fit(X_train, y_train)

# Calculate scores (.score() on a regressor returns R²)
svm_train_score = SVMregressor.score(X_train, y_train)
svm_test_score = SVMregressor.score(X_test, y_test)

print(f"Train R² score: {svm_train_score:.4f}")
print(f"Test R² score: {svm_test_score:.4f}")

# Make predictions on the held-out set and measure the error in price units
ypred_SVMr = SVMregressor.predict(X_test)
svm_rmse = np.sqrt(mean_squared_error(y_test, ypred_SVMr))
svm_mae = mean_absolute_error(y_test, ypred_SVMr)

print(f"RMSE: {svm_rmse:.2f}")
print(f"MAE: {svm_mae:.2f}")



SUPPORT VECTOR REGRESSOR
Train R¬≤ score: 0.9508
Test R¬≤ score: 0.9512
RMSE: 872.14
MAE: 427.80


In [17]:
# ==========================================
# Model Comparison
# ==========================================
# One row per fitted model, collecting the metrics computed in the cells above.
print("\n" + "="*50)
print("MODEL COMPARISON SUMMARY")
print("="*50)

# Column labels are kept exactly as they are, because the model-selection
# cell looks the table up by these labels.
summary_columns = ['Model', 'Train R¬≤', 'Test R¬≤', 'RMSE', 'MAE']
summary_rows = [
    ('Linear Regression', lr_train_score, lr_test_score, lr_rmse, lr_mae),
    ('Decision Tree', dt_train_score, dt_test_score, dt_rmse, dt_mae),
    ('Random Forest', rf_train_score, rf_test_score, rf_rmse, rf_mae),
    ('Gradient Boosting', gb_train_score, gb_test_score, gb_rmse, gb_mae),
    ('SVM', svm_train_score, svm_test_score, svm_rmse, svm_mae),
]
comparison_df = pd.DataFrame(summary_rows, columns=summary_columns)

print(comparison_df.to_string(index=False))


MODEL COMPARISON SUMMARY
            Model  Train R¬≤  Test R¬≤        RMSE        MAE
Linear Regression  0.884512 0.886364 1331.262679 856.408322
    Decision Tree  0.981248 0.973613  641.505390 344.229692
    Random Forest  0.994356 0.981416  538.360204 271.674813
Gradient Boosting  0.984971 0.980992  544.466692 288.095801
              SVM  0.950786 0.951229  872.142171 427.804797


In [18]:
# Select the best model based on test R² score
# The lookup key must match the column label used when comparison_df was built,
# so it is kept exactly as it appears there; only the printed text is repaired.
r2_column = 'Test R¬≤'
best_model_idx = comparison_df[r2_column].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"   Test R²: {comparison_df.loc[best_model_idx, r2_column]:.4f}")
print(f"   RMSE: {comparison_df.loc[best_model_idx, 'RMSE']:.2f}")


üèÜ Best Model: Random Forest
   Test R¬≤: 0.9814
   RMSE: 538.36


In [19]:
# Save the best model
# Map the display names used in the comparison table to the fitted estimators.
models = {
    'Linear Regression': LRregressor,
    'Decision Tree': DTregressor,
    'Random Forest': RFregressor,
    'Gradient Boosting': GBregressor,
    'SVM': SVMregressor
}

best_model = models[best_model_name]

# Write the pickle inside a context manager so the file is flushed and closed
# even if serialization fails (a bare open() would leak the handle).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("\n✓ Model saved as 'model.pkl'")


‚úì Model saved as 'model.pkl'


In [20]:
# Test loading the model: round-trip the pickle and predict a few held-out rows.
# The file is opened in a context manager so the handle is closed after reading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('model.pkl', 'rb') as model_file:
    model_loaded = pickle.load(model_file)

# Use positional slicing on both frames so predictions and actuals cover the same rows.
test_prediction = model_loaded.predict(X_test.iloc[:5])
print(f"\nTest prediction (first 5 samples): {test_prediction}")
print(f"Actual values (first 5 samples): {y_test.iloc[:5].values}")


Test prediction (first 5 samples): [ 545.51279452 2414.97875036 1183.39392735 1243.05241328 9317.64827014]
Actual values (first 5 samples): [ 559 2201 1238 1304 6901]
