<a href="https://colab.research.google.com/github/kavitharamanrk/CopperModel/blob/main/Copper_EDA_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Industrial Copper Modeling**

# **Selling Price and Customer Status Prediction**

In [None]:
# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics
import openpyxl
sns.set()

# **Data Loading**

In [None]:
# data load from excel file
copper_original_ds=pd.read_excel("/content/Copper_Set.xlsx" )

In [None]:
copper_ds=copper_original_ds.copy()

# **Data Cleaning**

**Finding Empty**

In [None]:
# To check the column names
copper_ds.columns

In [None]:
# column information
copper_ds.info()

In [None]:
# no of rows and columns
copper_ds.shape

In [None]:
# total no of null values in each column
copper_ds.isna().sum()

In [None]:
# Share of missing entries per column, listed from the most to the least incomplete.
null_data = copper_ds.isnull().mean()
(
    null_data
    .rename_axis("column_name")
    .reset_index(name="Null values")
    .sort_values(by="Null values", ascending=False)
)

In [None]:
# description of the column
copper_ds.describe()

**Strip/replace**

In [None]:
# 'quantity tons' is treated as numeric, but the raw column can contain stray
# text (the original cleanup targeted values containing 'e').
# Coerce the column to a numeric dtype: anything that cannot be parsed becomes
# NaN and is picked up by the imputation step further down.
# The result is assigned back to the frame because calling
# replace(..., inplace=True) on copper_ds['quantity tons'] acts on an
# intermediate Series and is not guaranteed to update copper_ds.
copper_ds['quantity tons'] = pd.to_numeric(copper_ds['quantity tons'], errors='coerce')

In [None]:
# Cast material_ref to text and trim leading '0' characters.
# NOTE: astype(str) also turns missing values into the literal string 'nan'.
material_ref_text = copper_ds['material_ref'].astype(str)
copper_ds['material_ref'] = material_ref_text.str.lstrip('0')

**imputation**

In [None]:
# Impute missing values in the numeric columns.
# Every result is assigned back to the frame: copper_ds[col].fillna(..., inplace=True)
# works on an intermediate Series and does not reliably update copper_ds
# (it is a no-op when pandas Copy-on-Write is active).

# Continuous measurements: fill with the column mean.
for col in ['quantity tons', 'thickness', 'width', 'selling_price']:
    copper_ds[col] = copper_ds[col].fillna(copper_ds[col].mean())

# country / customer / application appear to be identifier codes rather than
# measurements, so a mean would invent a code that does not exist in the data;
# fill them with the most frequent code instead.
for col in ['country', 'customer', 'application']:
    copper_ds[col] = copper_ds[col].fillna(copper_ds[col].mode().iloc[0])

In [None]:
# Fill missing values in the categorical / date-like columns with the most
# frequent value of each column.
# The filled column is assigned back to the frame; fillna(..., inplace=True) on
# copper_ds[col] acts on an intermediate Series and does not reliably update
# copper_ds (it is a no-op when pandas Copy-on-Write is active).
for col in ['status', 'item_date', 'delivery date', 'product_ref']:
    copper_ds[col] = copper_ds[col].fillna(copper_ds[col].mode().iloc[0])

In [None]:
# drop null rows as 42% of the data is missing
# NOTE(review): dropna(inplace=True) is called on the Series returned by
# copper_ds['material_ref'], so it does not remove any rows from copper_ds.
# In addition, the earlier astype(str) on material_ref has already turned
# missing values into the string 'nan', so this column holds no nulls here.
# TODO: decide whether the intent is to drop the rows, drop the column, or
# remove this step.
copper_ds['material_ref'].dropna(inplace=True)

In [None]:
# 'id' is a unique row identifier (2 of its rows are null) and has no effect
# on the models, so the column is removed.
copper_ds.drop(columns=["id"], inplace=True)

In [None]:
# Keep only rows whose status is 'Won' or 'Lost'; every other status is
# treated as still in progress and is removed.
condition = ~copper_ds['status'].isin(['Won', 'Lost'])
copper_ds.drop(index=copper_ds.index[condition], inplace=True)

**duplication**

In [None]:
print("Total Duplicated datapoints: ",copper_ds.duplicated().sum())

**Number of "0" data availability - sparsity**

In [None]:
# Percentage of entries equal to 0 in every column, highest first.
col_with_zero = [(i, (copper_ds[i] == 0).mean() * 100) for i in copper_ds.columns]

zero_percent = pd.DataFrame(col_with_zero, columns=['column_name', 'zero_percentage'])
zero_percent = zero_percent.sort_values("zero_percentage", ascending=False)
zero_percent

In [None]:
# Replace zero / negative selling prices with the mean of the strictly
# positive prices.
price_col = copper_ds['selling_price']
mean_selling_price = price_col[price_col > 0].mean()
copper_ds['selling_price'] = price_col.mask(price_col <= 0, mean_selling_price)

In [None]:
# Replace zero / negative quantities with the mean of the strictly positive
# quantities.
tons_col = copper_ds['quantity tons']
mean_quantity_tons = tons_col[tons_col > 0].mean()
copper_ds['quantity tons'] = tons_col.map(lambda qty: mean_quantity_tons if qty <= 0 else qty)

**Datatype Change**

In [None]:
copper_ds['quantity tons']=copper_ds['quantity tons'].astype(float)

In [None]:
# Turn both date columns into text and slice out day, month and year.
# The slice positions assume a yyyymmdd layout.
for prefix, date_col in (('delivery', 'delivery date'), ('item', 'item_date')):
    copper_ds[date_col] = copper_ds[date_col].astype(str)
    date_text = copper_ds[date_col]
    copper_ds[f'{prefix}_day'] = date_text.str[6:8]
    copper_ds[f'{prefix}_month'] = date_text.str[4:6]
    copper_ds[f'{prefix}_year'] = date_text.str[:4]

In [None]:
# The extracted date parts are still text; convert each one to an integer.
date_part_cols = [
    'delivery_day', 'delivery_month', 'delivery_year',
    'item_day', 'item_month', 'item_year',
]
for part_col in date_part_cols:
    copper_ds[part_col] = copper_ds[part_col].astype(int)

In [None]:
# The raw date columns are replaced by their day / month / year parts.
copper_ds.drop(columns=['delivery date', 'item_date'], inplace=True)

**Encoding**

In [None]:
# Collapse item-type variants onto four base categories.
# Values that are not listed here become NaN after .map().
item_type_groups = {
    'W': ['WI', 'W'],
    'S': ['S', 'SLAWR'],
    'PL': ['IPL', 'PL'],
    'Others': ['Others'],
}
mapping = {variant: base for base, variants in item_type_groups.items() for variant in variants}
copper_ds['item type'] = copper_ds['item type'].map(mapping)

In [None]:
# Numeric target labels: Won -> 1, Lost -> 2.
status_mapping = dict(Won=1, Lost=2)
copper_ds['status'] = copper_ds['status'].map(status_mapping)

In [None]:
# Integer codes for the grouped item types: W=1, S=2, PL=3, Others=4.
item_mapping = {item: code for code, item in enumerate(['W', 'S', 'PL', 'Others'], start=1)}
copper_ds['item type'] = copper_ds['item type'].map(item_mapping)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode every distinct material_ref value as an integer label.
le = LabelEncoder()
le.fit(copper_ds['material_ref'])
copper_ds['material_ref'] = le.transform(copper_ds['material_ref'])

# **EDA**

# **Statistical Analysis**

In [None]:
copper_ds.describe()

In [None]:
copper_ds.kurtosis()

In [None]:
copper_ds.skew()

In [None]:
copper_ds.boxplot(figsize=(20,8),rot=90,grid=True)

In [None]:
copper_ds.hist(bins=50, figsize=(20,15))

In [None]:
def plot(df, column):
    """Draw a violin plot of ``column`` from ``df`` and show the figure.

    Parameters
    ----------
    df : data passed to seaborn as ``data``.
    column : name of the column plotted on the x axis; also used in the title.
    """
    axes = sns.violinplot(x=column, data=df)
    axes.set_title(f'Violin Plot for {column}')
    plt.show()

In [None]:
for i in copper_ds.columns:
    plot(copper_ds, i)

# Finding Outliers

**IQR (Interquartile Range Method)**

In [None]:
def out_iqr(df , column):
    """Print how many values of ``df[column]`` lie outside the 1.5 * IQR fences.

    The fences are stored in the module-level names ``lower`` and ``upper``.
    The count of values strictly below ``lower`` or strictly above ``upper``
    is printed; the function itself returns ``None``.
    """
    global lower,upper
    first_quartile = np.quantile(df[column], 0.25)
    third_quartile = np.quantile(df[column], 0.75)
    # Fences sit 1.5 interquartile ranges beyond the quartiles.
    fence_width = (third_quartile - first_quartile) * 1.5
    lower = first_quartile - fence_width
    upper = third_quartile + fence_width
    n_above = len(df[df[column] > upper])
    n_below = len(df[df[column] < lower])
    print(f'Total number of outliers in {column} :', n_above + n_below)
    return None

In [None]:
iqr_ds=copper_ds.copy()
for i in iqr_ds.columns:
  out_iqr(iqr_ds,i)

# **Feature Engineering**

# Log Transformation

In [None]:
# Log-transform the skewed numeric columns; each result is stored in a new
# '<column>_log' field of a copy of the cleaned frame.
copper_log_df = copper_ds.copy()
for skewed_col in ('quantity tons', 'thickness', 'width', 'selling_price'):
    copper_log_df[f'{skewed_col}_log'] = np.log(copper_log_df[skewed_col])
copper_log_df

In [None]:
# Replace non-positive log values with the mean of the positive log values of
# the same column (selling price, quantity and thickness).
# NOTE(review): a log value <= 0 corresponds to a raw value <= 1, so raw
# entries in (0, 1] are overwritten here as well - confirm this is intended.
log_price = copper_log_df['selling_price_log']
mean_selling_price_log = log_price[log_price > 0].mean()
copper_log_df['selling_price_log'] = log_price.mask(log_price <= 0, mean_selling_price_log)

log_tons = copper_log_df['quantity tons_log']
mean_quantity_tons_log = log_tons[log_tons > 0].mean()
copper_log_df['quantity tons_log'] = log_tons.mask(log_tons <= 0, mean_quantity_tons_log)

log_thickness = copper_log_df['thickness_log']
mean_thickness_log = log_thickness[log_thickness > 0].mean()
copper_log_df['thickness_log'] = log_thickness.mask(log_thickness <= 0, mean_thickness_log)

In [None]:
copper_log_df.drop(['quantity tons','thickness','width','selling_price'],axis=1,inplace=True)

In [None]:
copper_log_df.hist(bins=50, figsize=(20,15))

# **Feature Selection**

**Correlation Analysis**

In [None]:
corr_data=copper_ds.corr()
plt.figure(figsize=(16,6))
sns.heatmap(corr_data,annot=True,cmap="coolwarm",fmt=".2f")

# Selling Price Prediction - Regression Model

In [None]:
# Regression frame: the log-transformed frame without the columns listed below.
regrsn_unused_cols = [
    'customer', 'item type', 'country', 'status', 'application',
    'product_ref', 'material_ref',
    'delivery_day', 'delivery_month', 'delivery_year',
    'item_day', 'item_month', 'item_year',
]
regrsn_df = copper_log_df.drop(columns=regrsn_unused_cols)
regrsn_df

In [None]:
y= regrsn_df['selling_price_log']
x = regrsn_df.drop('selling_price_log', axis =1)

# Model selection

In [None]:
# check any null values in data
regrsn_df.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=40)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline for the log selling price.
# random_state pins the bootstrap / feature sampling so the fitted model and
# the metrics below are identical on every "Restart & Run All".
model_rfr = RandomForestRegressor(random_state=40).fit(x_train, y_train)
y_pred = model_rfr.predict(x_test)

# Errors are measured on the log scale, because the target is selling_price_log.
print('MAE:',metrics.mean_absolute_error(y_test, y_pred))
print('MSE:',metrics.mean_squared_error(y_test, y_pred))
print('R2:',metrics.r2_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor for the log selling price.
# random_state pins the boosting resampling so the fitted model and the metrics
# below are identical on every "Restart & Run All".
model_abr = AdaBoostRegressor(random_state=40).fit(x_train, y_train)
y_pred = model_abr.predict(x_test)

# Errors are measured on the log scale, because the target is selling_price_log.
print('MAE:',metrics.mean_absolute_error(y_test, y_pred))
print('MSE:',metrics.mean_squared_error(y_test, y_pred))
print('R2:',metrics.r2_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor for the log selling price (trees up to depth 40).
# random_state makes the fitted model and the metrics below reproducible on
# every "Restart & Run All".
model_gbr = GradientBoostingRegressor(max_depth=40, random_state=40).fit(x_train, y_train)
y_pred = model_gbr.predict(x_test)

# Errors are measured on the log scale, because the target is selling_price_log.
print('MAE:',metrics.mean_absolute_error(y_test, y_pred))
print('MSE:',metrics.mean_squared_error(y_test, y_pred))
print('R2:',metrics.r2_score(y_test, y_pred))

In [None]:
# Sample data
test_data=np.array([[3.99,0.69,7.31]])
y_pred=model_rfr.predict(test_data)
y_pred[0]

In [None]:
# Sample data
test_data=np.array([[3.99,0.69,7.31]])
y_pred=model_abr.predict(test_data)
y_pred[0]

In [None]:
# Sample data
test_data=np.array([[3.99,0.69,7.31]])
y_pred=model_gbr.predict(test_data)
y_pred[0]

In [None]:
regrsn_df.head(2)

In [None]:
with open('/content/regression_model.pkl', 'wb') as f:
    pickle.dump(model_abr, f)

In [None]:
# load the pickle model to predict selling price

with open('/content/regression_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Status Prediction - Classification Model

In [None]:
df_clssfctn=copper_log_df[['quantity tons_log','country','item type','application','thickness_log','width_log','product_ref','selling_price_log','status']].copy()
df_clssfctn

In [None]:
# assign target and features
y= df_clssfctn['status']
x = df_clssfctn.drop('status', axis =1)

In [None]:
df_clssfctn.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=32)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# General for checking different models
def classification_model_selection(ModelName, x_train, y_train, x_test, y_test,test_data):
  """Fit ``ModelName()`` with default settings and summarise it in one string.

  The model is trained on ``x_train`` / ``y_train`` and scored on ``x_test``
  against ``y_test``. The returned text contains the accuracy, the mean
  squared error and the mean absolute error of those predictions, followed by
  the label predicted for the first row of ``test_data``.
  """
  fitted_model = ModelName().fit(x_train, y_train)
  holdout_pred = fitted_model.predict(x_test)

  accuracy = accuracy_score(y_test, holdout_pred)
  mse = metrics.mean_squared_error(y_test, holdout_pred)
  aberr = metrics.mean_absolute_error(y_test, holdout_pred)

  sample_label = fitted_model.predict(test_data)[0]
  return f"Accuracy: {accuracy!s} MSE: {mse!s} MAE: {aberr!s}   {sample_label!s}"

In [None]:
df_clssfctn

In [None]:
# Fit every candidate classifier on the same split and print its one-line summary.
test_data = np.array([[6.9,25,1,41,1.1,7.12,164141591,6.40]])
candidate_models = (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    DecisionTreeClassifier,
)
for candidate in candidate_models:
    print(classification_model_selection(candidate, x_train, y_train, x_test, y_test, test_data))

In [None]:
  # Final status classifier: a random forest, chosen based on its processing
  # time and accuracy.
  # random_state pins the model so that re-running the notebook (and
  # re-creating the pickle) yields the same classifier.
  model = RandomForestClassifier(random_state=32).fit(x_train, y_train)

  # Sanity check on one hand-written sample; values follow the column order of x_train.
  test_data=np.array([[6.9,25,1,41,1.1,7.12,164141591,6.40]])
  y_pred=model.predict(test_data)
  y_pred[0]

In [None]:
# write pickle for classification
with open('/content/classification_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# load the pickle model to predict status
with open('/content/classification_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
y_pred = model.predict(np.array([[6.9,25,1,41,1.1,7.12,164141591,6.40]]))
y_pred[0]

In [None]:
pip install streamlit

In [None]:
# Streamlit file for deployment
%%writefile app.py
import numpy as np
import pickle
import streamlit as st
from PIL import Image


#streamlit  page setting
icon = Image.open("ml.jpg")
st.set_page_config(page_title= "Copper EDA - Kavitha",
                page_icon= icon,
                layout= "wide",
                initial_sidebar_state= "expanded",
                )

st.subheader(":blue[Industrial Copper Modeling]")
tab1,tab2,tab3=st.tabs([":blue[Selling Price Prediction]",":blue[Status Prediction]",":blue[About]"])

with tab1:
  col1,col2,col3=st.columns(3)
  with col1:
    txt_width=st.number_input("Enter the width")
    #res=checkempty(txt_width,"width")
  with col2:
    txt_quantity_tons=st.number_input("Enter the quantity in tons")
  with col3:
    txt_thickness=st.number_input("Enter the thickness")
  if st.button("Predict Selling Price", key="predict"):
      # load the regression pickle model
      with open('/content/regression_model.pkl', 'rb') as f:
          model = pickle.load(f)

      # make array for all user input values in required order for model prediction
      user_data = np.array([[txt_width,
                          np.log(float(txt_quantity_tons)),
                          np.log(float(txt_thickness))]])

      # model predict the selling price based on user input
      y_pred = model.predict(user_data)

      # inverse transformation for log transformation data
      selling_price = np.exp(y_pred[0])

      # round the value with 2 decimal point
      selling_price = round(selling_price, 2)
      st.write("Predicted Selling Price: ", selling_price)

with tab2:
  col4,col5,col6=st.columns(3)
  with col4:
    txt_quantity_tons=st.number_input("Enter the quantity tons")
    txt_country=st.number_input("Enter country")
    txt_item_type=st.number_input("Enter item type")
  with col5:
    txt_application=st.number_input("Enter application")
    txt_thickness=st.number_input("Enter thickness")
    txt_width=st.number_input("Enter width")
  with col6:
    txt_product_ref=st.number_input("Enter product ref")
    txt_selling_price=st.number_input("Enter selling price")

  if st.button("Predict Status", key="Predict Status"):
      # load the classification pickle model
      with open('/content/classification_model.pkl', 'rb') as f:
          model = pickle.load(f)

      user_data = np.array([[txt_quantity_tons, txt_country, txt_item_type, txt_application,
                            txt_thickness, txt_width, txt_product_ref, txt_selling_price]])

      # model predict status based on user input
      y_pred = model.predict(user_data)

      status = y_pred[0]
      if status==0:
        st.success("Status: Lost - Failure")
      elif status==1:
        st.success("Won - Success")
with tab3:
    st.caption(":blue[Overview:]")
    st.caption(":blue[Original Copper dataset has null and zero values]")
    st.caption(":blue[Data Cleaning has done for the all the null values and negative values]")
    st.caption(":blue[log transformation is applied for selling price prediction]")
    st.caption(":blue[3 features(quantity tons, width and thickness) has been taken for selling price prediction]")
    st.caption(":blue[Won and lost status has been taken for prediction other status data has been removed]")
    st.caption(":blue[all features has been taken for status prediction except item date and delivery date]")

In [None]:
!npm install localtunnel

In [None]:
!streamlit run /content/app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com