<a href="https://colab.research.google.com/github/klaxman23/August_pratice/blob/main/Module_14_Case_Study_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Case Study – 1
Domain – Chemical Industry
Focus – Classify chemicals
Business challenge/requirement
FuPont is a leading chemical company across the globe. The Company is on a CSR (Corporate Social Responsibility) mission. It wants to identify biodegradable products based on a study of the relationships between chemical structure and biodegradation of molecules.
As an ML expert, you have to create an ML model that classifies each chemical structure as 'Ready Biodegradable' (RB) or 'Not Ready Biodegradable' (NRB).
Key issues
The data has a large number of attributes, so classification could be tricky.
Considerations
NONE
Data volume
- Approx 1055 records – file bio-degradabale-data.csv
Fields in Data
• Details in .ipynb notebook
Additional information
- NA
Business benefits
This research can help FuPont create truly unique biodegradable packaging material, which could lead to massive profits in the future.

In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the biodegradation dataset from the working directory, falling back to
# a reproducible synthetic stand-in when no data file is present.
FILE_NAME = "bio-degradable-data.csv"

# The case-study brief spells the file "bio-degradabale-data.csv". Accept both
# spellings so a correctly supplied dataset is not silently replaced by the
# random synthetic data below.
CANDIDATE_FILES = (FILE_NAME, "bio-degradabale-data.csv")

# os.path.isfile (rather than a membership test on os.listdir()) makes sure
# the match is a regular file and not a directory that happens to share the name.
data_path = next((name for name in CANDIDATE_FILES if os.path.isfile(name)), None)

if data_path is not None:
    df = pd.read_csv(data_path)
    print("✅ Dataset loaded successfully")
else:
    print("⚠ Dataset not found. Creating synthetic dataset...")

    # NOTE: the synthetic features and labels are independent random draws, so
    # any model trained on them can only perform at chance level.
    np.random.seed(42)
    X_dummy = np.random.rand(1055, 41)   # 41 chemical features
    y_dummy = np.random.choice(["RB", "NRB"], size=1055)

    columns = [f"feature_{i}" for i in range(1, 42)]
    df = pd.DataFrame(X_dummy, columns=columns)
    df["target"] = y_dummy

    print("✅ Synthetic dataset created")

print("Dataset Shape:", df.shape)
df.head()


In [None]:
# Quick structural overview of the loaded frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing values:\n", df.isnull().sum())
# The last column is treated as the class label throughout the notebook.
print("\nTarget distribution:\n", df.iloc[:, -1].value_counts())


In [None]:
# Every column except the last is a feature; the last column is the class label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]


In [None]:
# Encode the class labels as integers for the classifiers.
# The encoder is always fitted on the original target column (the last column
# of df) instead of on `y` itself: re-running this cell would otherwise re-fit
# on already-encoded integers and lose the original label names in le.classes_.
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, -1])

# Show which integer code was assigned to each original label.
print("Class Mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so no test-set information is used in fitting.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Baseline model: logistic regression trained on the standardized features.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Report held-out accuracy plus the per-class precision/recall breakdown.
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


In [None]:
# Reduce dimensionality of the standardized features with PCA.
# A float n_components asks scikit-learn to keep as many components as are
# needed to explain that fraction (here 95%) of the variance.
pca = PCA(n_components=0.95)

# Fit the projection on the training split only, then apply it to the test split.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Number of components actually retained by the fitted PCA.
print("PCA Components:", pca.n_components_)


In [None]:
# Random forest trained on the PCA-reduced features (200 trees, fixed seed,
# class_weight="balanced").
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
rf.fit(X_train_pca, y_train)

y_pred_rf = rf.predict(X_test_pca)

# Report held-out accuracy plus the per-class precision/recall breakdown.
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


In [None]:
# RBF-kernel support vector classifier trained on the PCA-reduced features.
svm = SVC(kernel="rbf", C=1, gamma="scale").fit(X_train_pca, y_train)
y_pred_svm = svm.predict(X_test_pca)

# Report held-out accuracy plus the per-class precision/recall breakdown.
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


In [None]:
# Confusion matrix for the Random Forest predictions on the held-out test set.
# The row/column order is pinned to the label encoder's class order so that the
# class-name tick labels below are guaranteed to line up with the matrix cells.
class_ids = list(range(len(le.classes_)))
cm = confusion_matrix(y_test, y_pred_rf, labels=class_ids)

# Explicit figure/axes instead of the implicit pyplot state, and class names
# instead of bare integer codes on both axes so the plot stands on its own.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix – Random Forest")
plt.show()


In [None]:
# Side-by-side comparison of held-out accuracy for the three fitted models.
predictions = {
    "Logistic Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "SVM": y_pred_svm,
}

results = pd.DataFrame({
    "Model": list(predictions),
    "Accuracy": [accuracy_score(y_test, y_hat) for y_hat in predictions.values()],
})

results
