# importing libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# loading Data

In [2]:
df = pd.read_csv("datasets/fabric_full_sustainability_dataset_5000.csv")
df.head(9)

Unnamed: 0,Fabric_Type,Recyclability,Biodegradability,Water_Usage,Carbon_Emissions,Toxicity,Microplastic_Shedding,Durability,Region_Suitability
0,Nylon,7.251198,1.659177,1454.12667,12.838951,5.696168,8.785652,8.199509,Desert
1,Nylon,7.623716,0.741226,4552.053956,11.354955,5.950722,6.870238,9.143675,High Altitude
2,Bamboo Fiber,7.212781,8.011012,4784.081969,3.71435,2.33864,0.0,7.214075,Jungle
3,Kevlar,4.624922,0.0,4322.029713,21.394027,8.769083,2.949565,10.0,War Zones
4,Wool,7.810887,7.086195,11274.876762,6.520993,3.971152,0.981826,6.404292,High Altitude
5,Cotton,8.115782,8.339274,16012.685312,9.780339,3.881174,0.0,4.986087,Hot Desert
6,Wool,8.914461,8.37686,8424.011771,9.968177,2.433226,0.595106,7.452728,Cold Desert
7,Bamboo Fiber,9.533208,7.52284,2235.78267,4.69975,2.355899,0.0,5.616052,Coastal
8,Wool,6.152941,7.021627,5400.989621,9.868856,3.453043,0.524052,6.732101,Cold Desert


# data Cleaning

In [7]:
df.info()
df.describe()
df.isna().sum()
df.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Fabric_Type            5000 non-null   object 
 1   Recyclability          5000 non-null   float64
 2   Biodegradability       5000 non-null   float64
 3   Water_Usage            5000 non-null   float64
 4   Carbon_Emissions       5000 non-null   float64
 5   Toxicity               5000 non-null   float64
 6   Microplastic_Shedding  5000 non-null   float64
 7   Durability             5000 non-null   float64
 8   Region_Suitability     5000 non-null   object 
dtypes: float64(7), object(2)
memory usage: 351.7+ KB


np.int64(0)

# Normalize Values

In [19]:
scaler = MinMaxScaler()
df[['Water_Usage', 'Carbon_Emissions']] = scaler.fit_transform(df[['Water_Usage', 'Carbon_Emissions']])
df.head(9)

Unnamed: 0,Fabric_Type,Recyclability,Biodegradability,Water_Usage,Carbon_Emissions,Toxicity,Microplastic_Shedding,Durability,Region_Suitability
0,Nylon,7.251198,1.659177,0.048943,0.471173,5.696168,8.785652,8.199509,Desert
1,Nylon,7.623716,0.741226,0.207865,0.406619,5.950722,6.870238,9.143675,High Altitude
2,Bamboo Fiber,7.212781,8.011012,0.219768,0.074253,2.33864,0.0,7.214075,Jungle
3,Kevlar,4.624922,0.0,0.196065,0.843319,8.769083,2.949565,10.0,War Zones
4,Wool,7.810887,7.086195,0.552742,0.196342,3.971152,0.981826,6.404292,High Altitude
5,Cotton,8.115782,8.339274,0.795789,0.338123,3.881174,0.0,4.986087,Hot Desert
6,Wool,8.914461,8.37686,0.406494,0.346294,2.433226,0.595106,7.452728,Cold Desert
7,Bamboo Fiber,9.533208,7.52284,0.089041,0.117117,2.355899,0.0,5.616052,Coastal
8,Wool,6.152941,7.021627,0.251415,0.341974,3.453043,0.524052,6.732101,Cold Desert


# Main Formula

In [9]:
df['Sustainability_Score'] = (
    0.25 * df['Recyclability'] +
    0.25 * df['Biodegradability'] +
    0.15 * (1 - df['Toxicity'] / 10) +
    0.10 * (1 - df['Microplastic_Shedding'] / 10) +
    0.15 * (1 - df['Carbon_Emissions']) +
    0.10 * (1 - df['Water_Usage'])
) * 10  # Scale to 0-100

# Train a Machine Learning Model

In [None]:
X = df.drop(columns=['Sustainability_Score', 'Fabric_Type', 'Region_Suitability'])
y = df['Sustainability_Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=200)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R2 Score: 0.9997077361982352
MSE: 0.062289271892968304


In [11]:
joblib.dump(model, "sustainability_model.pkl")
print("Model saved successfully!")


Model saved successfully!


# test cases

In [35]:
# Test suite for Sustainability model
# Run in same folder as:
# - fabric_full_sustainability_dataset_5000.csv
# - sustainability_model.pkl

import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_absolute_error

# --------- Config / Tolerance ---------
MODEL_PATH = "pkl_file/sustainability_model.pkl"
DATASET_PATH = "datasets/fabric_full_sustainability_dataset_5000.csv"
ABS_TOL = 1.0   # acceptable absolute error per sample (adjustable)
MAE_TOL = 0.5   # acceptable mean absolute error on full-sample test

# --------- Load model and dataset ---------
model = joblib.load(MODEL_PATH)
# df = pd.read_csv(DATASET_PATH)

# Compute mins/maxes for normalization (same procedure used when labeling)
water_min, water_max = df["Water_Usage"].min(), df["Water_Usage"].max()
carbon_min, carbon_max = df["Carbon_Emissions"].min(), df["Carbon_Emissions"].max()

def normalize_val(x, mn, mx):
    if mx == mn:
        return 0.0
    return (x - mn) / (mx - mn)

# --------- Labeling function (replicates the formula used to create the label) ---------
def expected_sustainability_score(row):
    # Inputs expected in their raw units (same as dataset)
    rec = row["Recyclability"]       # 0-10
    bio = row["Biodegradability"]    # 0-10
    tox = row["Toxicity"]            # 0-10
    shed = row["Microplastic_Shedding"]  # 0-10
    # normalize carbon & water using dataset min/max
    carbon_norm = normalize_val(row["Carbon_Emissions"], carbon_min, carbon_max)
    water_norm = normalize_val(row["Water_Usage"], water_min, water_max)

    score = (
        0.25 * rec +
        0.25 * bio +
        0.15 * (1 - tox / 10.0) +
        0.10 * (1 - shed / 10.0) +
        0.15 * (1 - carbon_norm) +
        0.10 * (1 - water_norm)
    ) * 10.0  # scale to 0-100

    return float(np.round(score, 4))


# --------- Helper: run model prediction for a row dict ---------
def model_predict_from_row(row):
    # model expects features in the same order/name as training X
    # Build the feature vector - drop any non-feature columns if present
    # Here we assume model was trained on columns:
    # ['Recyclability','Biodegradability','Toxicity','Microplastic_Shedding','Carbon_Emissions','Water_Usage','Durability']
    # Adjust if your model used a different set/order.
    features = [
        row.get("Recyclability"),
        row.get("Biodegradability"),
        row.get("Toxicity"),
        row.get("Microplastic_Shedding"),
        row.get("Carbon_Emissions"),
        row.get("Water_Usage"),
        row.get("Durability", 5.0)  # durability is included if used; default 5
    ]
    X = np.array(features).reshape(1, -1)
    pred = model.predict(X)
    # if model was a regressor returning a scalar:
    return float(np.round(pred[0], 4)) if np.isscalar(pred[0]) else float(np.round(pred[0][0],4))


# --------- TEST CASES: edge / typical samples ---------
test_rows = [
    # Best possible (high recyclability & biodegradability, low toxicity, low carbon/water)
    {
        "Recyclability": 10.0, "Biodegradability": 10.0, "Toxicity": 0.0,
        "Microplastic_Shedding": 0.0, "Carbon_Emissions": carbon_min, "Water_Usage": water_min, "Durability": 9.0
    },
    # Worst possible
    {
        "Recyclability": 0.0, "Biodegradability": 0.0, "Toxicity": 10.0,
        "Microplastic_Shedding": 10.0, "Carbon_Emissions": carbon_max, "Water_Usage": water_max, "Durability": 1.0
    },
    # Typical Cotton-like profile
    {
        "Recyclability": 8.0, "Biodegradability": 9.0, "Toxicity": 3.0,
        "Microplastic_Shedding": 0.0, "Carbon_Emissions": 15.0, "Water_Usage": 12000.0, "Durability": 6.0
    },
    # Typical Polyester-like profile
    {
        "Recyclability": 6.0, "Biodegradability": 1.0, "Toxicity": 7.0,
        "Microplastic_Shedding": 8.0, "Carbon_Emissions": 50.0, "Water_Usage": 2000.0, "Durability": 8.0
    },
    # Kevlar-like (high carbon, high durability)
    {
        "Recyclability": 4.0, "Biodegradability": 0.0, "Toxicity": 8.0,
        "Microplastic_Shedding": 3.0, "Carbon_Emissions": 70.0, "Water_Usage": 3500.0, "Durability": 10.0
    }
]

print("Running individual test cases...\n")
for i, r in enumerate(test_rows, 1):
    exp = expected_sustainability_score(r)
    pred = model_predict_from_row(r)
    diff = abs(exp - pred)
    status = "PASS" if diff <= ABS_TOL else "FAIL"
    print(f"Test {i}: expected={exp:.4f}, model={pred:.4f}, diff={diff:.4f} -> {status}")
print("\nIndividual tests complete.\n")


# --------- BULK TEST: compare model vs formula on a random sample of dataset ---------
sample = df.sample(n=500, random_state=42)  # 500-row sample
# compute expected labels via formula
expected = sample.apply(expected_sustainability_score, axis=1).values

# prepare feature matrix for model predictions:
# make sure your feature order matches the model's expected columns (modify if needed)
X_cols = ["Recyclability","Biodegradability","Toxicity","Microplastic_Shedding","Carbon_Emissions","Water_Usage","Durability"]
# If Durability doesn't exist in dataset, create default
if "Durability" not in sample.columns:
    sample["Durability"] = 5.0

X = sample[X_cols].values
preds = model.predict(X)
# if model returns shape (n,1)
if preds.ndim == 2 and preds.shape[1] == 1:
    preds = preds.ravel()

mae = mean_absolute_error(expected, preds)
print(f"Bulk sample MAE = {mae:.6f}")
if mae <= MAE_TOL:
    print("Bulk test: PASS (MAE within tolerance)")
else:
    print("Bulk test: FAIL (MAE above tolerance)")

# --------- ADDITIONAL ROBUSTNESS CHECKS ---------
print("\nRobustness checks:")

# 1) Missing values: expect the model or pipeline to handle/fail gracefully
row_missing = test_rows[0].copy()
row_missing.pop("Recyclability")  # remove required field
try:
    _ = model_predict_from_row(row_missing)
    print("- Missing features: model returned a prediction (check if intended).")
except Exception as e:
    print("- Missing features: model raised error (expected behavior unless you added imputation).")
    print("  Exception:", e)

# 2) Out-of-range values: extremely large/small inputs
row_out_of_range = test_rows[0].copy()
row_out_of_range["Water_Usage"] = water_max * 10
row_out_of_range["Carbon_Emissions"] = carbon_max * 10
try:
    pred_oor = model_predict_from_row(row_out_of_range)
    print(f"- Out-of-range input: model produced prediction {pred_oor:.4f} (inspect if clipping needed).")
except Exception as e:
    print("- Out-of-range input: model raised error. Exception:", e)

print("\nAll tests finished. Adjust ABS_TOL / MAE_TOL if you want stricter/looser passing criteria.")


Running individual test cases...

Test 1: expected=55.0000, model=41.6436, diff=13.3564 -> FAIL
Test 2: expected=0.0000, model=9.1647, diff=9.1647 -> FAIL
Test 3: expected=45.6123, model=37.7159, diff=7.8964 -> FAIL
Test 4: expected=17.4415, model=19.0437, diff=1.6022 -> FAIL
Test 5: expected=8.9096, model=11.5674, diff=2.6578 -> FAIL

Individual tests complete.

Bulk sample MAE = 4.559100
Bulk test: FAIL (MAE above tolerance)

Robustness checks:
- Missing features: model returned a prediction (check if intended).
- Out-of-range input: model produced prediction 41.4415 (inspect if clipping needed).

All tests finished. Adjust ABS_TOL / MAE_TOL if you want stricter/looser passing criteria.




# Testing is Right

In [44]:
# Let's simulate a scenario:
# "How well does heavy Wool perform in a Hot Desert?" (Should be bad)

# 1. Prepare the input data
test_rows =[
    {
       "Recyclability": 8.0, "Biodegradability": 9.0, "Toxicity": 3.0,
        "Microplastic_Shedding": 0.0, "Carbon_Emissions": 15.0, "Water_Usage": 12000.0, "Durability": 6.00
    }
]
# pandas is already imported earlier in the notebook as pd; reuse it
test_df = pd.DataFrame(test_rows)

# scale water & carbon using the same scaler used during training
test_df[['Water_Usage', 'Carbon_Emissions']] = scaler.transform(test_df[['Water_Usage', 'Carbon_Emissions']])

# Ensure features match the training features (names and order).
# Guard against X being overwritten as a numpy array elsewhere in the notebook.
if hasattr(X, "columns"):
    feature_cols = X.columns.tolist()
elif hasattr(model, "feature_names_in_"):
    # scikit-learn stores feature names after fitting
    feature_cols = list(model.feature_names_in_)
else:
    # fallback: explicit feature order used originally for training
    feature_cols = ['Recyclability', 'Biodegradability', 'Toxicity',
                    'Microplastic_Shedding', 'Carbon_Emissions', 'Water_Usage', 'Durability']

# Add any missing features with a sensible default (e.g. Durability = 5.0)
for c in feature_cols:
    if c not in test_df.columns:
        test_df[c] = 5.0

# Reorder columns to exactly match training
test_df = test_df[feature_cols]

# 2. Predict
prediction = model.predict(test_df)
print(f"{prediction[0]:.2f}")

45.91


2