## MACHINE LEARNING MODEL FOR FOOD ADULTERATION ANALYSIS

In [None]:
import warnings
warnings.filterwarnings('ignore')
import joblib
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score,recall_score,f1_score

In [None]:
# Load the raw food-adulteration records from the CSV next to the notebook.
DATASET_PATH = 'D-13 FOOD ADULTERATION.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
df.head()

## Data preprocessing:

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.nunique()

## Data cleaning:

In [None]:
# Number of missing values in each column:
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row:
df.duplicated().sum()

In [None]:
# Bootstrap-resample the frame (sampling with replacement) to 16500 rows.
# NOTE(review): this happens before train_test_split further down, so copies
# of the same original row can end up in both the training and the test
# partition, which would inflate the reported test metrics. Consider
# splitting first and resampling only the training portion.
df = resample(df, replace=True, n_samples=16500, random_state=42)


## Exploratory data analysis:

In [None]:
# Count plot of the target column (action_taken), with the number of rows
# written above each bar.
sns.set(style="darkgrid")
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='action_taken', data=df)
plt.title('count plot')
# Label the axis with the column that is actually plotted.
plt.xlabel('action_taken')
plt.ylabel('count')
for p in ax.patches:
    # Bar heights are floats; show them as whole-number counts.
    ax.annotate(f'{p.get_height():.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(0, 5), textcoords='offset points')
plt.show()


In [None]:
df['action_taken'].unique()

In [None]:
# Human-readable class names, indexed by encoded class id.
# LabelEncoder assigns integer codes in sorted (alphabetical) order of the
# original strings, so this list must be alphabetical for labels[i] to be the
# name of class i in reports and confusion matrices.
labels = ['Fine Imposed', 'Investigation Launched', 'Product Recall',
          'Warning Issued']

## Label encoding:

In [None]:
df.head()

In [None]:
# Encoder used to turn object-typed (string) columns into integer codes:
le=LabelEncoder()

In [None]:
# Encode every object-typed column as integer codes.
# Each column gets its own fitted encoder, kept in `encoders`, so the codes
# can later be mapped back to the original strings
# (encoders[col].inverse_transform) or applied consistently to new data
# (encoders[col].transform). Refitting one shared encoder per column would
# discard every mapping except the last one.
encoders = {}
for column in df.select_dtypes(include='object').columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Feature matrix: every column except the target.
x = df.drop('action_taken', axis=1)

In [None]:
x

In [None]:
# Target vector: the action_taken column.
y = df['action_taken']

In [None]:
y

In [None]:
# Scaler used to standardize the feature columns.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the features and transform them in one step.
# NOTE(review): the scaler is fit on the full feature matrix before
# train_test_split, so statistics of the rows that later become the test set
# influence the scaling. Consider fitting on the training split only.
x_scaled = scaler.fit_transform(x)

In [None]:
# Split the data into training and test sets:

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

## Applying SMOTE:

In [None]:
# SMOTE over-sampler; the fixed random_state makes the synthetic samples
# reproducible between runs.
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [None]:
# Over-sample the training split only; x_test / y_test are not passed in.
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

In [None]:
# Report the training-set dimensions before and after over-sampling.
for tag, features, target in (
    ("Original", x_train, y_train),
    ("Resampled", x_resampled, y_resampled),
):
    print(f"{tag} training set shape:", features.shape, target.shape)

In [None]:
# Rebuild a labelled frame from the SMOTE output to inspect the class balance.
# Take the column names from the feature frame itself instead of assuming the
# target is the last column of df.
df_resampled = pd.DataFrame(x_resampled, columns=x.columns)
# Assign positionally so that any index carried by y_resampled cannot
# misalign rows against the fresh default index of df_resampled.
df_resampled['target'] = np.asarray(y_resampled)

# Count the occurrences of each class
class_counts = df_resampled['target'].value_counts()
print("Class distribution after applying SMOTE:\n", class_counts)

In [None]:
# Bar chart of the per-class row counts after over-sampling.
balance_fig, balance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=class_counts.index, y=class_counts.values, ax=balance_ax)
balance_ax.set_xlabel('Class')
balance_ax.set_ylabel('Frequency')
balance_ax.set_title('Class Distribution After Applying SMOTE')
plt.show()


## Performance Metrics Evaluation:

In [None]:
# Module-level accumulators: calculateMetrics appends one entry per evaluated
# model to each of these lists.
precision, recall, fscore, accuracy = [], [], [], []

In [None]:
#function to calculate various metrics such as accuracy, precision etc
def calculateMetrics(algorithm, testY, predict):
    """Print, record and plot evaluation metrics for one classifier.

    Macro-averaged precision, recall and F1 plus accuracy (all expressed as
    percentages) are appended to the module-level ``precision``, ``recall``,
    ``fscore`` and ``accuracy`` lists and printed. A classification report is
    printed and a confusion-matrix heatmap is shown.

    Parameters
    ----------
    algorithm : str
        Display name used in the printed lines and in the plot title.
    testY : array-like supporting ``astype``
        Ground-truth class labels (integer-encoded).
    predict : array-like supporting ``astype``
        Predicted class labels (integer-encoded).

    Notes
    -----
    Reads the module-level ``labels`` list for the class display names;
    ``labels[i]`` is used as the name of class code ``i``.
    """
    testY = testY.astype('int')
    predict = predict.astype('int')
    p = precision_score(testY, predict, average='macro') * 100
    r = recall_score(testY, predict, average='macro') * 100
    f = f1_score(testY, predict, average='macro') * 100
    a = accuracy_score(testY, predict) * 100
    accuracy.append(a)
    precision.append(p)
    recall.append(r)
    fscore.append(f)
    print(algorithm+' Accuracy    : '+str(a))
    print(algorithm+' Precision   : '+str(p))
    print(algorithm+' Recall      : '+str(r))
    print(algorithm+' FSCORE      : '+str(f))
    # Pin the class ids explicitly so the report and the matrix always have
    # one row per entry in `labels`, even when a class is missing from the
    # predictions (otherwise target_names / tick labels would not line up).
    class_ids = list(range(len(labels)))
    # Ground truth first, predictions second: the same order as every other
    # metric above, so per-class precision and recall are not exchanged.
    report = classification_report(testY, predict, labels=class_ids,
                                   target_names=labels)
    print('\n', algorithm+" classification report\n", report)
    conf_matrix = confusion_matrix(testY, predict, labels=class_ids)
    plt.figure(figsize=(5, 5))
    ax = sns.heatmap(conf_matrix, xticklabels=labels, yticklabels=labels,
                     annot=True, cmap="Blues", fmt="g")
    ax.set_ylim([0, len(labels)])
    plt.title(algorithm+" Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

## Gradient Boost Classifier:

In [None]:
# Gradient boosting model: reuse the cached model file when it exists,
# otherwise train a new model and cache it. Evaluation is done once, after
# either path, so both paths report under the same algorithm name.
GBC_MODEL_PATH = 'GradientBoostClassifier.pkl'
if os.path.exists(GBC_MODEL_PATH):
    # NOTE: joblib.load unpickles the file; only load model files that were
    # produced by this notebook.
    gbc = joblib.load(GBC_MODEL_PATH)
    print('model loaded successfully')
else:
    # max_depth must be an integer >= 1 (or None); a fractional depth is
    # rejected by scikit-learn's parameter validation. 1 (decision stumps) is
    # the shallowest valid setting.
    gbc = GradientBoostingClassifier(n_estimators=1, learning_rate=0.74,
                                     max_depth=1, random_state=42)
    gbc.fit(x_resampled, y_resampled)
    joblib.dump(gbc, GBC_MODEL_PATH)
    print('model saved successfully')

predict = gbc.predict(x_test)
# calculateMetrics(algorithm, testY, predict): ground truth goes first.
calculateMetrics("GradientBoostClassifier", y_test, predict)

## Knn classifier:

In [None]:
# k-nearest-neighbours model: reuse the cached model file when it exists,
# otherwise train a new model and cache it. Evaluation is done once, after
# either path.
KNN_MODEL_PATH = 'KNeighborsClassifier.pkl'
if os.path.exists(KNN_MODEL_PATH):
    # NOTE: joblib.load unpickles the file; only load model files that were
    # produced by this notebook.
    knn = joblib.load(KNN_MODEL_PATH)
    print('model loaded successfully')
else:
    knn = KNeighborsClassifier()
    knn.fit(x_resampled, y_resampled)
    joblib.dump(knn, KNN_MODEL_PATH)
    print('model saved successfully')

predict = knn.predict(x_test)
# calculateMetrics(algorithm, testY, predict): ground truth goes first.
calculateMetrics("KNeighborsClassifier", y_test, predict)

In [None]:
# Summary table with one row per evaluated algorithm.
# The metric lists are filled by calculateMetrics in evaluation order, so the
# names below must be listed in the order the models were evaluated.
columns = ["Algorithm Name", "Precision", "Recall", "FScore", "Accuracy"]
algorithm_names = ["Gradient Boost Classifier", 'KNeighbor Classifier']
values = [
    list(row)
    for row in zip(algorithm_names, precision, recall, fscore, accuracy)
]

temp = pd.DataFrame(values, columns=columns)
temp

In [None]:
# Load the separate file of records to run predictions on.
TEST_DATA_PATH = "test.csv"
test = pd.read_csv(TEST_DATA_PATH)

In [None]:
# Remove the target column so only feature columns remain.
test = test.drop(columns=['action_taken'])

In [None]:
test.head()

In [None]:
test.info()

In [None]:
labels

## Label encoding

In [None]:
# Encode the object-typed columns of the test frame as integer codes.
# NOTE(review): fit_transform refits the encoder on the test values, so the
# integer a category receives here is not guaranteed to equal the integer it
# received when the training frame was encoded (e.g. when the test file
# contains only a subset of the categories). Verify, or reuse encoders that
# were fitted on the training data.
for column in test.columns:
    if test[column].dtype == 'object':
        test[column] = le.fit_transform(test[column])

In [None]:
# Predict an action for every row of the test frame and print each row
# followed by its predicted class name.

# Class names indexed by encoded class id. LabelEncoder assigns codes in
# sorted (alphabetical) order of the original strings.
class_names = ['Fine Imposed', 'Investigation Launched', 'Product Recall',
               'Warning Issued']

# The model was trained on standardized features, so the same fitted scaler
# has to be applied before predicting on raw feature values.
predict = knn.predict(scaler.transform(test))

for i, p in enumerate(predict):
    # Codes outside the known classes are skipped without printing.
    if 0 <= p < len(class_names):
        print(test.iloc[i])
        print("Row {}:************************************************** {}".format(i, class_names[p]))
   
   

In [None]:
# Attach the predicted class codes to the test frame as a new column.
test['predict']=predict

In [None]:
test

In [None]:
# Encoded class id -> human-readable action name.
# LabelEncoder assigns integer codes in sorted (alphabetical) order of the
# original strings, so the names must appear here in alphabetical order.
mapping = {
    0: 'Fine Imposed',
    1: 'Investigation Launched',
    2: 'Product Recall',
    3: 'Warning Issued',
}

In [None]:
# Replace the numeric class codes in the predict column with their names;
# codes that are not keys of `mapping` become NaN.
test['predict'] = test['predict'].map(mapping)

In [None]:
test