<a href="https://colab.research.google.com/github/hafsaaah/Diamond-Price-predcition/blob/main/model_trainingg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [2]:
## Independent and dependent features
df = pd.read_csv('/content/drive/MyDrive/training_datanew.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.52,Premium,F,VS2,62.2,58.0,13619
1,2.03,Very Good,J,SI2,62.0,58.0,13387
2,0.7,Ideal,G,VS1,61.2,57.0,2772
3,0.32,Ideal,G,VS1,61.6,56.0,666
4,1.7,Premium,G,VS2,62.6,59.0,14453


In [3]:

X = df.drop(['price'],axis=1)
Y = df[['price']]

In [4]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [5]:
print(numerical_cols)
print(categorical_cols)

Index(['carat', 'depth', 'table'], dtype='object')
Index(['cut', 'color', 'clarity'], dtype='object')


In [6]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [14]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())
    ]
)


preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [15]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [16]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=[['carat', 'depth', 'table', 'cut', 'color', 'clarity']])
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=[['carat', 'depth', 'table', 'cut', 'color', 'clarity']])

In [17]:
X_train.head()

Unnamed: 0,carat,depth,table,cut,color,clarity
0,1.532871,-0.016993,0.404099,1.23423,1.53002,0.680797
1,1.792135,-1.498069,-0.117341,1.23423,0.91371,0.015564
2,0.47421,0.075574,-0.117341,0.099182,-0.318911,1.346029
3,-1.038162,0.53841,0.404099,-1.035866,2.14633,0.680797
4,0.495816,-2.79401,-0.63878,1.23423,-1.551531,-0.649668


Model **Training**

In [18]:

from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [19]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [20]:
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor()
}

trained_model_list = []
model_list = []
r2_list = []

# Train and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score for test:", r2_square * 100)
    print("R2 score for train:", r2_score(y_train, model.predict(X_train)) * 100)

    r2_list.append(r2_square)

    print('=' * 35)
    print('\n')

  return fit_method(estimator, *args, **kwargs)


RandomForest
Model Training Performance
RMSE: 642.1533428053226
MAE: 331.31593535676586
R2 score for test: 97.45793682496992
R2 score for train: 99.54633991203467


XGBoost
Model Training Performance
RMSE: 590.3509004411378
MAE: 301.3726208792088
R2 score for test: 97.85153269767761
R2 score for train: 98.30867648124695




In [21]:

# Load your unseen data
# Replace 'path_to_unseen_data.csv' with your actual unseen data file path
unseen_data = pd.read_csv('/content/drive/MyDrive/unseen_datanew.csv')

# Preprocess the unseen data (same preprocessing steps as training)
# Assuming 'preprocessor' is defined and includes the transformations
X_unseen = unseen_data[numerical_cols + categorical_cols]  # Adjust this to your actual feature columns
X_unseen_transformed = preprocessor.transform(X_unseen)

# Make predictions on the unseen dataset using the trained XGBoost model
xgbr_model = models['XGBoost']  # Assuming you have trained this model already
y_unseen_pred = xgbr_model.predict(X_unseen_transformed)

# Create a DataFrame with actual and predicted prices
results = pd.DataFrame({
    'Actual Price': unseen_data['price'],  # Replace 'price' with the actual price column name
    'Predicted Price': y_unseen_pred
})

# Save the results to a CSV file
results.to_csv('/content/drive/My Drive/predicted_prices.csv', index=False)

print("Predicted prices saved to Google Drive.")


Predicted prices saved to Google Drive.
