In [None]:
#DIAMOND PRICE ANALYSIS WITH MACHINE LEARNING
#Importing our tools
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder #It converts categorical text values into numeric values
from sklearn.linear_model import LinearRegression
#Predicts output using a straight-line relationship between input and output.
from sklearn.ensemble import RandomForestRegressor
#Predicts output by combining predictions from multiple decision trees to improve accuracy.
#To compare simple and complex models and choose the best-performing one
from sklearn. metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Make the plots look pretty: white background with grid lines.
sns.set_style("whitegrid")
# One default figure size keeps every plot consistent and readable.
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
#STEP-1
#LOAD DATA
# The CSV location can be overridden with the DIAMONDS_CSV environment variable so the
# notebook can run on another machine; when the variable is unset the original local
# path is used, so existing behaviour is unchanged.
DIAMONDS_CSV = os.environ.get(
    "DIAMONDS_CSV",
    "C:/Users/krish/vscode/ML projects/Diamond Price Analysis Prediction/diamonds.csv",
)
df = pd.read_csv(DIAMONDS_CSV)

#show the first few diamonds
print("First 5 diamonds in our dataset")
print(df.head())

#check the shape (rows = diamonds, columns = features)
n_rows, n_cols = df.shape
print(f"We have {n_rows} diamonds with {n_cols} features!")

In [None]:
#STEP 2: EXPLORE THE DATA
# describe() summarises each numeric column: count, mean, spread and quartiles.
summary_stats = df.describe()
print("Statistical summary:")
print(summary_stats)

# Count of missing entries in every column.
missing_counts = df.isna().sum()
print("\n Missing values:")
print(missing_counts)

# dtype pandas inferred for every column.
column_types = df.dtypes
print("\n Data types:")
print(column_types)

In [None]:
#STEP 3: VISUALIZE THE PATTERNS

#1. How does carat (size) affect the price?
# Low alpha keeps dense regions readable when many points overlap.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['carat'], df['price'], alpha=0.3, c='steelblue')
ax.set_xlabel('Carat (Size)', fontsize=12)
ax.set_ylabel('Price($)', fontsize=12)
ax.set_title('Effect of Diamond Size on Prices', fontsize=14, fontweight='bold')
plt.show()

In [None]:
#The scatter plot shows a strong positive relationship between diamond carat and price. However, the relationship is non-linear, with significant price variation at the same carat value due to other quality factors like cut, color, and clarity.

In [None]:
#2. Price distribution across the different cuts
# One box per cut category, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='cut', y='price', data=df, palette='Set2', ax=ax)
ax.set_xlabel('Cut Quality', fontsize=12)
ax.set_ylabel('Price($)', fontsize=12)
ax.set_title('How Cut Quality Affects the Price', fontsize=14, fontweight='bold')
# Tilt the category labels so they do not collide.
plt.xticks(rotation=45)
plt.show()

In [None]:
#This boxplot shows that better cut diamonds usually have higher prices, but there is a wide price range in every cut because other factors also affect the price.

In [None]:
#3. Price by color grade
# One box per color grade, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='color', y='price', data=df, palette='viridis', ax=ax)
ax.set_xlabel('Color Grade', fontsize=12)
ax.set_ylabel('Price($)', fontsize=12)
ax.set_title('Diamond color VS Price($)', fontsize=14, fontweight='bold')
# Tilt the category labels so they do not collide.
plt.xticks(rotation=45)
plt.show()

In [None]:
#Middle line higher ↑ → more expensive
#Tall box → price varies a lot
#Dots → rare expensive diamonds (outliers)
#Color affects price, but it does not decide price alone.

In [None]:
#4. CORRELATION HEATMAP
plt.figure(figsize=(10,8))
# Keep only the numeric columns. The leftover CSV row-index column ('Unnamed: 0',
# removed later in the data-preparation step if present) is skipped here as well:
# it is a row counter, not a diamond feature, and would add a meaningless row and
# column to the heatmap.
numeric_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != 'Unnamed: 0'
]
# Pairwise correlation between the numeric features.
correlation_matrix = df[numeric_cols].corr()
# center=0 puts the neutral colour of the diverging palette at zero correlation.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Map', fontsize=14, fontweight='bold')
plt.show()

In [None]:
#The correlation heatmap shows that carat has the strongest positive relationship with price, while depth and table have very weak influence. It also reveals multicollinearity among carat and the physical dimensions, which justifies using tree-based models like Random Forest.

In [None]:
#STEP 4: PREPARE THE DATA
# Drop the unnamed index column if the CSV carried one along.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')

# Encode the categorical variables (cut, color, clarity) as integers the models can use.
# Every fitted encoder is stored under its column name so the same mapping can be
# applied again later.
label_encoder = {}
categorical_columns = ['cut', 'color', 'clarity']

for col in categorical_columns:
    encoded_name = col + '_encoded'
    encoder = LabelEncoder()
    df[encoded_name] = encoder.fit_transform(df[col])
    label_encoder[col] = encoder
    print(f"Encoded {col}: {df[col].unique()} -> {df[encoded_name].unique()}")
    


In [None]:
# Columns the models learn from, in the order used for training.
feature_columns = [
    'carat',
    'cut_encoded', 'color_encoded', 'clarity_encoded',
    'depth', 'table',
    'x', 'y', 'z',
]
X = df.loc[:, feature_columns]  # input features
y = df.loc[:, 'price']          # target (what we predict)
print(f"\n Ready ! we have {len(feature_columns)} features to predict price")

In [None]:
#SPLIT THE DATA
#80% for training, 20% for testing; random_state pins the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# len(...) has to sit inside the braces: the previous f-strings printed the literal
# text "len" followed by the entire DataFrame instead of the row count.
print(f"Training set: {len(X_train)} diamonds")
print(f"Testing set: {len(X_test)} diamonds")


In [None]:
#STEP 6: TRAIN MODEL #1 LINEAR REGRESSION
#create and train the model
print("Training Linear regression model...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

#make predictions on the held-out test set
y_pred_lr = lr_model.predict(X_test)

#evaluate
mae_lr = mean_absolute_error(y_test, y_pred_lr)           # average absolute error, in dollars
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))  # large errors are punished more here
r2_lr = r2_score(y_test, y_pred_lr)                       # share of price variation the model explains

print("\n Linear Regression Results:")
print(f" MAE (average error): ${mae_lr:.2f}")
print(f" RMSE (root mean squared error): ${rmse_lr:.2f}")
print(f" R² Score: {r2_lr:.4f} ({r2_lr*100:.2f}% variance explained) ")

#show which features matter most
# The 'Importance' column holds the raw regression coefficients (the features are not
# scaled before fitting, so magnitudes are only a rough guide). Rows are ranked by
# absolute value so a strong negative effect is not pushed to the bottom of the table.
lr_coefficients = pd.DataFrame(
    {
        'Feature': feature_columns,
        'Importance': lr_model.coef_,
    }
)
ranking = lr_coefficients['Importance'].abs().sort_values(ascending=False).index
feature_importance = lr_coefficients.loc[ranking]
print("\n Most Important Features:")
print(feature_importance)

In [None]:
# Horizontal bar chart of the linear-regression 'Importance' values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color='forestgreen',
)
ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Feature Matter most', fontsize=14, fontweight='bold')
# Flip the y-axis so the first row of the table is drawn at the top.
ax.invert_yaxis()
plt.show()

In [None]:
#TRAIN MODEL #2 RANDOM FOREST
#Create and train the model
print("Training Random Forest model...")
rf_model = RandomForestRegressor(
    n_estimators=100,   # number of trees in the forest
    random_state=42,    # fixed seed so results are repeatable
    n_jobs=-1           # train trees in parallel on all available cores
)

rf_model.fit(X_train, y_train)
#make predictions on the held-out test set
y_pred_rf = rf_model.predict(X_test)

#evaluate with the same metrics as the linear model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("\n Random Forest Results:")
print(f" MAE avg error: $ {mae_rf:.2f}")
print(f" RMSE root ${rmse_rf:.2f}")
# ".4f" (precision), not "4f" (minimum width); the percent sign belongs inside the parentheses.
print(f" R² score:{r2_rf:.4f} ({r2_rf*100:.2f}% variance explained) ")

#feature importance from random forest
feature_importance_rf = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\n Feature Importance(random forest):")
print(feature_importance_rf)

In [None]:
# Horizontal bar chart of the random-forest feature importances.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance_rf['Feature'],
    feature_importance_rf['Importance'],
    color='forestgreen',
)
ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Feature Matter most', fontsize=14, fontweight='bold')
# Flip the y-axis so the most important feature is drawn at the top.
ax.invert_yaxis()
plt.show()

In [None]:
import joblib

# Persist both fitted models and the dictionary of fitted label encoders to the
# current working directory, one file per object.
artifacts = {
    "linear_regression_model.pkl": lr_model,
    "random_forest_model.pkl": rf_model,
    "label_encoder.pkl": label_encoder,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Models saved successfully!")


In [None]:
#STEP 8: COMPARE MODELS
# Side-by-side table of the test-set metrics for both models.
# The last column header is spelled with a real superscript two ("R²"); the previous
# header was a mis-encoded byte sequence.
comparison = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest'],
    'MAE': [mae_lr, mae_rf],
    'RMSE': [rmse_lr, rmse_rf],
    'R² Score': [r2_lr, r2_rf]
})

print("\n Model Comparison")
# to_string(index=False) hides the 0/1 row labels.
print(comparison.to_string(index=False))


In [None]:
# Visualise actual vs predicted prices: two side-by-side panels, one per model.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# End points of the diagonal "perfect prediction" reference line.
price_range = [y_test.min(), y_test.max()]

# (axes, predictions, point colour, x-axis label, panel title) for each model.
panels = [
    (axes[0], y_pred_lr, 'steelblue', 'Actual price($)', 'Linear Regression Predictions'),
    (axes[1], y_pred_rf, 'forestgreen', 'Actual Price($)', 'Random Forest Predictions'),
]

for panel_ax, predictions, point_color, x_label, panel_title in panels:
    # Actual price on x, predicted price on y.
    panel_ax.scatter(y_test, predictions, alpha=0.5, c=point_color)
    # Dashed red diagonal: where predicted equals actual.
    panel_ax.plot(price_range, price_range, 'r--', lw=2)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Predicted Price($)', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')

# Adjust spacing so the panels don't overlap.
plt.tight_layout()
plt.show()

print("\n The red line = perfect predictions.closer points = better model!")

In [None]:
%%writefile app.py
#STEP 9: MAKE PREDICTIONS ON NEW DIAMONDS 

import streamlit as st
import joblib
import numpy as np

@st.cache_resource
def load_model():
    """Load the saved artifacts from the working directory.

    Returns:
        A 3-tuple ``(lr_model, rf_model, label_encoder)`` read from
        ``linear_regression_model.pkl``, ``random_forest_model.pkl`` and
        ``label_encoder.pkl`` respectively.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only load files you created yourself.
    artifact_files = (
        "linear_regression_model.pkl",
        "random_forest_model.pkl",
        "label_encoder.pkl",
    )
    linear_model, forest_model, encoders = (
        joblib.load(filename) for filename in artifact_files
    )
    return linear_model, forest_model, encoders

lr_model, rf_model, label_encoder = load_model()

def predict_diamond_price(carat,cut,color,clarity,depth,table,x,y,z):
    """
    Predict the price of a diamond and render the result in the Streamlit page.

    Args:
        carat, depth, table, x, y, z: numeric characteristics of the diamond.
        cut, color, clarity: category labels; each must be a label the matching
            fitted encoder in ``label_encoder`` already knows.

    Returns:
        The mean of the linear-regression and random-forest price predictions.

    Raises:
        ValueError: if a category label was not seen when the encoder was fitted
            (raised by ``LabelEncoder.transform``).
    """
    # Local import so the function works even though the script header does not import pandas.
    import pandas as pd

    # Build a one-row frame with the training column names and order. The models were
    # fitted on a named DataFrame, so a bare array triggers scikit-learn's
    # "X does not have valid feature names" warning.
    features = pd.DataFrame([{
        'carat': carat,
        'cut_encoded': label_encoder['cut'].transform([cut])[0],
        'color_encoded': label_encoder['color'].transform([color])[0],
        'clarity_encoded': label_encoder['clarity'].transform([clarity])[0],
        'depth': depth,
        'table': table,
        'x': x,
        'y': y,
        'z': z,
    }])

    #predict with both models and average the two estimates
    price_lr = lr_model.predict(features)[0]
    price_rf = rf_model.predict(features)[0]
    avg_price = (price_lr + price_rf)/2

    st.subheader("\n Diamond characteristics:")
    st.write(f" \nCarat : {carat}, Cut:{cut},Color:{color},Clarity:{clarity},Depth:{depth},Table:{table},X:{x},Y:{y},Z:{z}")
    st.subheader("\n Predicted Prices:")
    st.write(f"  Linear Regression: $ {price_lr:.2f}")
    st.write(f"  Random Forest:     $ {price_rf:.2f}")
    st.write(f"  Average Prediction:$ {avg_price:.2f}")

    return avg_price

st.title("💎 Diamond Price Predictor")

# Input widgets. The category choices come from the fitted encoders, so only labels
# the models were trained on can be selected.
carat = st.number_input("Carat", min_value=0.01, step=0.01)
cut = st.selectbox("Cut", label_encoder['cut'].classes_)
color = st.selectbox("Color", label_encoder['color'].classes_)
clarity = st.selectbox("Clarity", label_encoder['clarity'].classes_)
depth = st.number_input("Depth", min_value=0.0, step=0.1)
table = st.number_input("Table", min_value=0.0, step=0.1)
# min_value is now a real keyword argument: it used to be part of the label text,
# so negative dimensions were accepted and the label showed "min_value=0.0".
# Labels follow the diamonds dataset convention: x = length, y = width, z = depth.
x = st.number_input("X (length in mm)", min_value=0.0, step=0.01)
y = st.number_input("Y (width in mm)", min_value=0.0, step=0.01)
z = st.number_input("Z (depth in mm)", min_value=0.0, step=0.01)

if st.button("Predict Price"):
    predict_diamond_price(carat, cut, color, clarity, depth, table, x, y, z)

In [None]:
# Launch the Streamlit app from app.py through a shell command.
# NOTE(review): `streamlit run` starts a server process, so this cell presumably keeps
# running until that process is stopped - interrupt it before executing later cells.
!streamlit run app.py


In [None]:
# Test it out!
# predict_diamond_price is only written into app.py by the %%writefile cell; it is never
# defined in this kernel, so calling it here raised NameError. The same prediction is
# made directly with the in-kernel models and encoders instead.
sample_diamond = {
    'carat': 1.0,
    'cut': 'Ideal',
    'color': 'E',
    'clarity': 'VS1',
    'depth': 61.5,
    'table': 55,
    'x': 6.4,
    'y': 6.5,
    'z': 4.0,
}

# Numeric values are copied as-is; the categories go through the fitted encoders.
sample_row = {
    name: sample_diamond[name]
    for name in ('carat', 'depth', 'table', 'x', 'y', 'z')
}
for category in ('cut', 'color', 'clarity'):
    sample_row[category + '_encoded'] = (
        label_encoder[category].transform([sample_diamond[category]])[0]
    )

# Re-order the columns to match the training layout.
sample_features = pd.DataFrame([sample_row])[feature_columns]

sample_price_lr = lr_model.predict(sample_features)[0]
sample_price_rf = rf_model.predict(sample_features)[0]
predicted_price = (sample_price_lr + sample_price_rf) / 2

print(f"Linear Regression: ${sample_price_lr:.2f}")
print(f"Random Forest:     ${sample_price_rf:.2f}")
print(f"Average Prediction: ${predicted_price:.2f}")