In [None]:
pip install dataprep

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.preprocessing import OrdinalEncoder
from dataprep.eda import create_report
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
sns.set()

In [None]:
df=pd.read_excel("/content/Copper_Set.xlsx")

# Finding Outliers

In [None]:
create_report(df)

In [None]:
# Report how many distinct values each column holds (one "name:count" line per column).
for col_name, n_unique in df.nunique().items():
    print(f"{col_name}:{n_unique}")

In [None]:
# check any null values in data
df.isnull().sum()

In [None]:
# verify datatypes of all column
df.dtypes

# Pre processing / Data Cleaning

In [None]:
# material_ref entries whose text starts with '00000' are placeholders: turn them into NaN.
df['material_ref'] = df['material_ref'].mask(
    df['material_ref'].astype(str).str.startswith('00000')
)
df.head()

In [None]:
# 'quantity tons' is meant to be numeric but holds stray text (the original
# cleanup targeted the string 'e'). Coerce every non-numeric entry to NaN.
# The result is assigned back to df: calling replace(..., inplace=True) on a
# selected column is chained in-place mutation, which newer pandas versions do
# not guarantee to propagate to df. A regex replace on 'e' would also wipe any
# valid number written in scientific notation (e.g. '1e3').
df['quantity tons'] = pd.to_numeric(df['quantity tons'], errors='coerce')

In [None]:
df['quantity tons']=df['quantity tons'].astype(float)

In [None]:
# Impute missing values in the numeric columns with each column's median.
# The filled Series is assigned back to df instead of using
# df[col].fillna(..., inplace=True): that form mutates a temporary selection
# and is not guaranteed to update df on newer pandas versions.
for col in ['quantity tons', 'customer', 'application', 'thickness', 'width', 'selling_price']:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Impute missing values in the categorical/date columns with the most frequent
# value (first mode when there are ties). Assign back rather than using
# fillna(inplace=True) on a column selection, which newer pandas versions do
# not guarantee to propagate to df.
for col in ['country', 'status', 'item_date', 'delivery date']:
    df[col] = df[col].fillna(df[col].mode().iloc[0])

In [None]:
# Replace zero/negative selling prices with the mean of the strictly positive ones.
mean_selling_price = df['selling_price'][df['selling_price'] > 0].mean()
df['selling_price'] = df['selling_price'].mask(df['selling_price'].le(0), mean_selling_price)

In [None]:
# checking is -ve values in quantity tons
df.loc[df['quantity tons'] <=0]

In [None]:
# Replace zero/negative quantities with the mean of the strictly positive ones.
mean_quantity_tons = df['quantity tons'][df['quantity tons'] > 0].mean()
df['quantity tons'] = df['quantity tons'].mask(df['quantity tons'].le(0), mean_quantity_tons)

In [None]:
df.isnull().sum()

In [None]:
# Drop two columns that are not used for modelling:
#   material_ref - more than 50% of its values are null
#   id           - an identifier (2 null rows) with no impact on the model
df.drop(columns=["material_ref", "id"], inplace=True)

In [None]:
df['item type'].unique()

array(['W', 'WI', 'S', 'Others', 'PL', 'IPL', 'SLAWR'], dtype=object)

In [None]:
# Encode the item type categories as integer codes 0..6 (in the order listed).
item_type_codes = dict(zip(['W', 'WI', 'S', 'PL', 'IPL', 'SLAWR', 'Others'], range(7)))
df['item type'] = df['item type'].map(item_type_codes)


# Box, Distribution, Violin plots

In [None]:
# box plot, hist plot and violin plot
def plot(df, column):
    """Show a box plot, a 50-bin histogram with KDE and a violin plot of one column.

    The three charts are drawn side by side in a single 20x5 figure.

    Parameters
    ----------
    df : DataFrame holding the data.
    column : name of the column in ``df`` to visualise.
    """
    fig, (ax_box, ax_hist, ax_violin) = plt.subplots(1, 3, figsize=(20, 5))

    sns.boxplot(data=df, x=column, ax=ax_box)
    ax_box.set_title(f'Box Plot for {column}')

    sns.histplot(data=df, x=column, kde=True, bins=50, ax=ax_hist)
    ax_hist.set_title(f'Distribution Plot for {column}')

    sns.violinplot(data=df, x=column, ax=ax_violin)
    ax_violin.set_title(f'Violin Plot for {column}')

    plt.show()

In [None]:
# These 4 columns taken for prediction.

for i in ['quantity tons', 'thickness', 'width', 'selling_price']:
    plot(df, i)

# Log Transformation

In [None]:
# Reduce skewness: build df_log as a copy of df with natural-log versions of
# three columns appended as new *_log columns (df itself is left unchanged).
df_log = df.assign(**{
    'quantity tons_log': np.log(df['quantity tons']),
    'thickness_log': np.log(df['thickness']),
    'selling_price_log': np.log(df['selling_price']),
})
df_log

In [None]:
df_log.isnull().sum()

In [None]:
# after log transformation the data are normally distributed and reduced the skewness. [hist plot and violin plot]
for i in ['quantity tons_log', 'thickness_log', 'width', 'selling_price_log']:
    plot(df_log, i)

In [None]:
# checking any -ve values in log transformed data
# A notebook cell only renders its LAST bare expression, so each filtered frame
# is passed to display() explicitly; otherwise the first two checks are
# evaluated and silently thrown away.
from IPython.display import display

display(df_log[df_log['selling_price_log'] <= 0])
display(df_log[df_log['quantity tons_log'] < 0])
display(df_log[df_log['thickness_log'] < 0])

In [None]:
# Repair only INVALID log values.
# A log value <= 0 simply means the raw value was <= 1 (e.g. a thickness of
# 0.5), which is legitimate data. Overwriting those with the mean would distort
# the features, and would also disagree with app.py, which feeds np.log(input)
# to the model unchanged. The only values that cannot be used are +/-inf
# (log of 0), so just those are replaced, using the mean of the finite values.
mean_selling_price_log = df_log.loc[np.isfinite(df_log['selling_price_log']), 'selling_price_log'].mean()
df_log['selling_price_log'] = df_log['selling_price_log'].mask(
    np.isinf(df_log['selling_price_log']), mean_selling_price_log)

mean_quantity_tons_log = df_log.loc[np.isfinite(df_log['quantity tons_log']), 'quantity tons_log'].mean()
df_log['quantity tons_log'] = df_log['quantity tons_log'].mask(
    np.isinf(df_log['quantity tons_log']), mean_quantity_tons_log)

mean_thickness_log = df_log.loc[np.isfinite(df_log['thickness_log']), 'thickness_log'].mean()
df_log['thickness_log'] = df_log['thickness_log'].mask(
    np.isinf(df_log['thickness_log']), mean_thickness_log)


# IQR Method

In [None]:
df_iqr = df_log.copy()
df_iqr

In [None]:
# Use the IQR rule with clip() to cap outliers; the column is overwritten in place with the clipped values

def outlier(df, column):
    """Cap the outliers of ``df[column]`` in place using the 1.5*IQR rule.

    Values below ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` are clipped to
    those bounds; values inside the range are left untouched.

    Parameters
    ----------
    df : DataFrame to modify (the column is overwritten).
    column : name of the numeric column to clip.

    Returns
    -------
    None
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    whisker = 1.5 * (q3 - q1)
    df[column] = df[column].clip(lower=q1 - whisker, upper=q3 + whisker)

In [None]:
# Cap outliers in every continuous column, then show the result.
for col_name in ('quantity tons_log', 'thickness_log', 'selling_price_log', 'width'):
    outlier(df_iqr, col_name)
df_iqr

In [None]:
df_iqr.isnull().sum()

In [None]:

# transform the outliers to within range using IQR and clip() methods - box plot

for i in ['quantity tons_log', 'thickness_log', 'width', 'selling_price_log']:
    plot(df_iqr, i)

In [None]:
# after add the new column of 'quantity tons_log', 'thickness_log', 'selling_price_log', drop the existing columns
df_iqr.drop(columns=['quantity tons', 'thickness', 'selling_price'], inplace=True)
df_iqr

In [None]:
# Use a heatmap to check whether any columns are highly correlated. If any pair has an absolute correlation >= 0.7, drop one of the columns.

col = ['quantity tons_log','customer','country','status','application','width','product_ref','thickness_log','selling_price_log']
df_heatmap = df_iqr[col].corr()
sns.heatmap(df_heatmap, annot=True, linewidths=0.5, fmt='.2f')

In [None]:
# The strongest correlations are only about 0.4 and -0.42, so no columns are highly correlated and none need to be dropped.

# Selling Price Prediction - Regression Model

In [None]:
df_regrsn=df_iqr.copy()
df_regrsn.drop(['item_date','customer','country','status','item type','application','product_ref','delivery date'],axis=1, inplace=True)
df_regrsn

In [None]:
create_report(df_regrsn)

In [None]:
y= df_regrsn['selling_price_log']
x = df_regrsn.drop('selling_price_log', axis =1)

In [None]:
# check any null values in data
df_regrsn.isnull().sum()


In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((145338, 3), (36335, 3), (145338,), (36335,))

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics

In [None]:
from sklearn.ensemble import RandomForestRegressor
# random_state pins the bootstrap samples and feature draws so the reported
# errors are reproducible from one run to the next.
model_rfr = RandomForestRegressor(max_depth=40, random_state=42).fit(x_train, y_train)
y_pred = model_rfr.predict(x_test)
# Errors are on the log scale, because the target is selling_price_log.
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

In [None]:
df_regrsn

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# random_state pins the resampling done at each boosting round so the reported
# errors are reproducible from one run to the next.
model_abr = AdaBoostRegressor(random_state=42).fit(x_train, y_train)
y_pred = model_abr.predict(x_test)
# Errors are on the log scale, because the target is selling_price_log.
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# random_state makes the fitted model (the one pickled further down) and its
# reported errors reproducible from one run to the next.
# NOTE(review): max_depth=40 is very deep for boosted trees and makes training
# slow; consider tuning it.
model_gbr = GradientBoostingRegressor(max_depth=40, random_state=42).fit(x_train, y_train)
y_pred = model_gbr.predict(x_test)
# Errors are on the log scale, because the target is selling_price_log.
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Sample data: a single hand-written row pushed through the gradient boosting model.
# NOTE(review): the values are presumably [width, quantity tons_log, thickness_log],
# the same order app.py uses to build its input - verify against x.columns.
test_data=np.array([[1220,4.6296,1.1006]])
y_pred_gbr=model_gbr.predict(test_data)

# Show the raw prediction (log scale, since the target is selling_price_log)
# next to its exponential, which undoes the log transform.
y_pred_gbr[0]  , np.exp(y_pred_gbr[0])

In [None]:
# Sample data
test_data=np.array([[1220,4.6296,1.1006]])
y_pred_abr=model_abr.predict(test_data)
y_pred_rfr=model_rfr.predict(test_data)
y_pred_abr[0]  , y_pred_rfr[0] , np.exp(y_pred_abr[0]),np.exp(y_pred_rfr[0])

In [None]:
with open('/content/regression_model.pkl', 'wb') as f:
    pickle.dump(model_gbr, f)

In [None]:
# load the pickle model to predict selling price

with open('/content/regression_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Sample test data 2
y_pred = model.predict(np.array([[1240,6.0080,1.1006]]))
np.exp(y_pred[0])

# Status Prediction - Classification Model

In [None]:
# Keep only the Won / Lost rows for status prediction.
# .copy() makes df_clssfctn an independent frame, so the later in-place drop and
# column assignment act on it unambiguously instead of on a selection of df
# (which triggers SettingWithCopyWarning).
df_clssfctn = df.query('status == "Won" or status == "Lost"').copy()

In [None]:
# Remove the date and customer columns, which are not used as features.
# Rebinding to the frame returned by drop() avoids mutating a selection of df
# in place (the source of SettingWithCopyWarning here and on later assignments).
df_clssfctn = df_clssfctn.drop(columns=['item_date', 'customer', 'delivery date'])
df_clssfctn

In [None]:
df_clssfctn['status'].unique()

array(['Won', 'Lost'], dtype=object)

In [None]:
df_clssfctn['status'].value_counts()

In [None]:
# Replace the 'status' text labels with numeric codes so they can be used as the target.
# NOTE(review): with OrdinalEncoder's default category ordering this should give
# Lost -> 0.0 and Won -> 1.0, which is the meaning app.py assumes - confirm.
# The fitted encoder is not kept, so the mapping cannot be inverted later.
df_clssfctn['status'] = OrdinalEncoder().fit_transform(df_clssfctn[['status']])
df_clssfctn

In [None]:
create_report(df_clssfctn)

In [None]:
# assign target and features
y= df_clssfctn['status']
x = df_clssfctn.drop('status', axis =1)

In [None]:
df_clssfctn.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=32)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((120360, 8), (30090, 8), (120360,), (30090,))

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# General for checking different models
def classification_model_selection(ModelName, x_train, y_train, x_test, y_test,test_data):
    """Fit one classifier with default settings and summarise its performance.

    Parameters
    ----------
    ModelName : estimator class (not an instance); it is created with no arguments.
    x_train, y_train : training features and labels.
    x_test, y_test : hold-out features and labels used for scoring.
    test_data : 2-D array of sample rows; only the first row's prediction is reported.

    Returns
    -------
    str
        "Accuracy: <acc> MSE: <mse> MAE: <mae>   <prediction for test_data[0]>"
    """
    fitted = ModelName().fit(x_train, y_train)
    holdout_pred = fitted.predict(x_test)
    scores = (
        accuracy_score(y_test, holdout_pred),
        metrics.mean_squared_error(y_test, holdout_pred),
        metrics.mean_absolute_error(y_test, holdout_pred),
    )
    sample_pred = fitted.predict(test_data)
    return "Accuracy: {} MSE: {} MAE: {}   {}".format(*map(str, scores), str(sample_pred[0]))

In [None]:
# Score each candidate classifier on the same split and the same sample row.
test_data = np.array([[102.4824,25.0,0,41.0,0.96,1220,164141591,591]])
for candidate in (RandomForestClassifier,
                  AdaBoostClassifier,
                  GradientBoostingClassifier,
                  ExtraTreesClassifier,
                  DecisionTreeClassifier):
    print(classification_model_selection(candidate, x_train, y_train, x_test, y_test, test_data))

In [None]:
  # Final status model: a random forest, chosen for its processing time and accuracy.
  # random_state makes the fitted model (the one pickled below) reproducible.
  # The earlier predict(x_test) call was removed: its result was overwritten
  # before it was ever used.
  model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
  test_data = np.array([[102.4824,25.0,0,41.0,0.96,1220,164141591,591]])
  y_pred = model.predict(test_data)
  y_pred

In [None]:
df_clssfctn

In [None]:
# write the classification model to a pickle file
with open('/content/classification_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# load the pickle model to predict status

with open('/content/classification_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
y_pred = model.predict(np.array([[406.6865,25.0,0,41.0,0.71,1240,164141591,607]]))
y_pred[0]

In [None]:
pip install streamlit

In [None]:
# Streamlit file for deployment
%%writefile app.py
import numpy as np
import pickle
import streamlit as st
from PIL import Image


#streamlit  page setting
icon = Image.open("ml.jpg")
st.set_page_config(page_title= "Copper EDA - Kavitha",
                page_icon= icon,
                layout= "wide",
                initial_sidebar_state= "expanded",
                )

st.subheader(":blue[Industrial Copper Modeling]")
tab1,tab2,tab3=st.tabs([":blue[Selling Price Prediction]",":blue[Status Prediction]",":blue[About]"])

with tab1:
  col1,col2,col3=st.columns(3)
  with col1:
    txt_width=st.number_input("Enter the width")
    #res=checkempty(txt_width,"width")
  with col2:
    txt_quantity_tons=st.number_input("Enter the quantity in tons")
  with col3:
    txt_thickness=st.number_input("Enter the thickness")
  if st.button("Predict Selling Price", key="predict"):
      # load the regression pickle model
      with open('/content/regression_model.pkl', 'rb') as f:
          model = pickle.load(f)

      # make array for all user input values in required order for model prediction
      user_data = np.array([[txt_width,
                          np.log(float(txt_quantity_tons)),
                          np.log(float(txt_thickness))]])

      # model predict the selling price based on user input
      y_pred = model.predict(user_data)

      # inverse transformation for log transformation data
      selling_price = np.exp(y_pred[0])

      # round the value with 2 decimal point
      selling_price = round(selling_price, 2)
      st.write("Predicted Selling Price: ", selling_price)

with tab2:
  col4,col5,col6=st.columns(3)
  with col4:
    txt_quantity_tons=st.number_input("Enter the quantity tons")
    txt_country=st.number_input("Enter country")
    txt_item_type=st.number_input("Enter item type")
  with col5:
    txt_application=st.number_input("Enter application")
    txt_thickness=st.number_input("Enter thickness")
    txt_width=st.number_input("Enter width")
  with col6:
    txt_product_ref=st.number_input("Enter product ref")
    txt_selling_price=st.number_input("Enter selling price")

  if st.button("Predict Status", key="Predict Status"):
      # load the classification pickle model
      with open('/content/classification_model.pkl', 'rb') as f:
          model = pickle.load(f)

      user_data = np.array([[txt_quantity_tons, txt_country, txt_item_type, txt_application,
                            txt_thickness, txt_width, txt_product_ref, txt_selling_price]])

      # model predict status based on user input
      y_pred = model.predict(user_data)

      status = y_pred[0]
      if status==0:
        st.Success("Status: Lost - Failure")
      elif status==1:
        st.Success("Won - Success")
with tab3:
    st.caption(":blue[Overview:]")
    st.caption(":blue[Original Copper dataset has null and zero values]")
    st.caption(":blue[Data Cleaning has done for the all the null values and negative values]")
    st.caption(":blue[log transformation is applied for selling price prediction]")
    st.caption(":blue[3 features(quantity tons, width and thickness) has been taken for selling price prediction]")
    st.caption(":blue[Won and lost status has been taken for prediction other status data has been removed]")
    st.caption(":blue[all features has been taken for status prediction except item date and delivery date]")

Writing app.py
