In [5]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# --- Data loading ---
# Keep the raw table untouched; all feature work happens on a copy so the
# predictions can later be merged back onto the original rows.
df_original = pd.read_csv("honey_well.csv")
df = df_original.copy()

# Only float64 / int64 columns are considered as model inputs.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# --- Target and features ---
# Both stay as pandas objects (not .values) so the row index survives the split.
y = df['Quality']
X = df[numeric_cols].drop(columns=['Quality'])

# --- Train / test split (80 / 20, fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
test_indices = X_test.index  # remembered so test rows can be mapped back to df_original

# --- Scaling: statistics are learned from the training split only ---
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model ---
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train_scaled, y_train)

# --- Evaluation on the held-out split ---
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

# --- Persist the fitted model and scaler ---
for artefact, path in ((clf, "model.pkl"), (scaler, "scaler.pkl")):
    with open(path, "wb") as f:
        pickle.dump(artefact, f)

# --------- Anomaly Detection Section ---------

# Reference statistics: per-feature mean / std, in scaled space, over every row
# labelled Quality == 1 (called the "correct" class in this notebook).
# NOTE(review): the rows come from the full dataset `df`, so test rows are
# included in these statistics — confirm that this is intended.
correct_scaled = scaler.transform(df.loc[df['Quality'] == 1, X.columns])
correct_mean = correct_scaled.mean(axis=0)
correct_std = correct_scaled.std(axis=0)
correct_std[correct_std == 0] = 1e-8  # constant features: avoid dividing by zero below

# Absolute z-score of every test row against the reference statistics,
# re-indexed with the original row labels.
X_test_zscore = np.abs((X_test_scaled - correct_mean) / correct_std)
X_test_zscore_df = pd.DataFrame(X_test_zscore, columns=X.columns, index=test_indices)
X_test_zscore_df['Predicted_Quality'] = y_pred

# Number of features whose z-score exceeds 1.4 (the same cut-off used when
# ranking the top features). The comparison is restricted to the feature
# columns so the Predicted_Quality label can never be counted as a feature.
X_test_zscore_df['Count_gt_1'] = (X_test_zscore_df[X.columns] > 1.4).sum(axis=1)
# Top 7 affecting columns
def top7_columns_separate(row, threshold=1.4, top_n=7):
    """Return the labels of the largest entries of ``row`` above ``threshold``.

    Parameters
    ----------
    row : pandas.Series
        Values indexed by column name (here: absolute z-scores of one sample).
    threshold : float, default 1.4
        Only entries strictly greater than this value are considered.
    top_n : int, default 7
        Length of the returned list.

    Returns
    -------
    list
        Up to ``top_n`` index labels ordered by descending value, padded with
        single-space strings (``' '``) so the list always has ``top_n`` items.
    """
    filtered = row[row > threshold]
    top_cols = filtered.sort_values(ascending=False).head(top_n).index.tolist()
    # Pad so that DataFrame.apply(..., result_type='expand') always yields the
    # same number of columns for every row.
    top_cols += [' '] * (top_n - len(top_cols))
    return top_cols

# For every test row, list the (up to) seven features with the largest
# z-scores; each rank becomes its own column.
top_features = X_test_zscore_df[X.columns].apply(
    top7_columns_separate,
    axis=1,
    result_type='expand',
)
top_features.columns = [f'top_feature_{rank}' for rank in range(1, 8)]

# Append the ranking columns next to the z-scores (aligned on the row index).
X_test_zscore_df = pd.concat([X_test_zscore_df, top_features], axis=1)

# Weight given to the row maximum (versus the row mean) in the anomaly score.
w = 0.85

# Smallest and largest feature z-score over the whole test set; used to
# rescale the raw anomaly score.
feature_zscores = X_test_zscore_df[X.columns]
global_min = feature_zscores.min().min()
global_max = feature_zscores.max().max()

def anomaly_score_weighted(row, feature_cols=None, weight=None,
                           score_min=None, score_max=None, rng=None):
    """Compute a 1-100 anomaly score for one row of z-scores.

    The raw score is ``weight * max(features) + (1 - weight) * mean(features)``,
    rescaled linearly so that ``score_min`` maps to 1 and ``score_max`` to 100.

    Parameters
    ----------
    row : pandas.Series
        One row holding the feature z-scores and a ``'Predicted_Quality'`` entry.
    feature_cols : sequence of labels, optional
        Entries of ``row`` to treat as features. Defaults to the module-level
        ``X.columns``.
    weight : float, optional
        Weight of the maximum versus the mean. Defaults to the module-level ``w``.
    score_min, score_max : float, optional
        Bounds used for rescaling. Default to the module-level ``global_min``
        and ``global_max``.
    rng : numpy.random.Generator, optional
        Source of randomness for the replacement score below. When omitted,
        NumPy's global random state is used (seed it for reproducible output).

    Returns
    -------
    float or int
        The rescaled score. If ``score_min == score_max`` the score is 1.0
        rather than NaN/inf. For rows with ``Predicted_Quality == 1`` whose
        score exceeds 10, the score is replaced by a random integer in [1, 10].
    """
    # Fall back to the notebook-level settings only when not given explicitly,
    # so existing ``df.apply(anomaly_score_weighted, axis=1)`` calls still work.
    if feature_cols is None:
        feature_cols = X.columns
    if weight is None:
        weight = w
    if score_min is None:
        score_min = global_min
    if score_max is None:
        score_max = global_max

    features = row[feature_cols]
    raw_score = weight * features.max() + (1 - weight) * features.mean()

    # Rescale to [1, 100]; a zero span (all z-scores identical) would divide
    # by zero, so map that degenerate case to the lowest score.
    span = score_max - score_min
    if span == 0:
        normalized_score = 1.0
    else:
        normalized_score = ((raw_score - score_min) / span) * 99 + 1

    # Rows predicted as class 1 are capped: a score above 10 is replaced by a
    # random integer between 1 and 10 (inclusive).
    # NOTE(review): this substitutes a random value for the computed score —
    # confirm this is the intended reporting behaviour.
    if row['Predicted_Quality'] == 1 and normalized_score > 10:
        if rng is None:
            normalized_score = np.random.randint(1, 11)
        else:
            normalized_score = int(rng.integers(1, 11))

    return normalized_score

# The scoring function draws random integers for some rows; seed NumPy's
# global random state first so the saved output is identical on every run.
np.random.seed(42)
X_test_zscore_df['Anomaly_score%'] = X_test_zscore_df.apply(anomaly_score_weighted, axis=1)

# Merge with original data: pick the test rows from the untouched table and
# attach the derived columns (assignment aligns on the row index).
final_df = df_original.loc[test_indices].copy()
output_cols = ['Predicted_Quality', 'Anomaly_score%'] + [f'top_feature_{i+1}' for i in range(7)]
for col in output_cols:
    final_df[col] = X_test_zscore_df[col]

# Save final merged output (CSV without the index column, plus a pickle of
# the same DataFrame).
final_df.to_csv("final_anomaly_output.csv", index=False)
with open("final_anomaly_output.pkl", "wb") as f:
    pickle.dump(final_df, f)

print("✅ Model, scaler, and final anomaly-mapped dataset saved.")


              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3831
           1       0.95      0.95      0.95      1449

    accuracy                           0.97      5280
   macro avg       0.97      0.97      0.97      5280
weighted avg       0.97      0.97      0.97      5280

✅ Model, scaler, and final anomaly-mapped dataset saved.


In [None]:

# Persist everything needed to score new data.
# The model and scaler dumps repeat the ones done right after training, so
# this cell can be re-run on its own.
with open("model.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Reference statistics for the z-score computation. The status message below
# also announces "Features", so the feature column order is stored alongside
# the stats; the existing 'mean' and 'std' keys are unchanged.
with open("stats.pkl", "wb") as f:
    pickle.dump(
        {'mean': correct_mean, 'std': correct_std, 'features': list(X.columns)},
        f,
    )

print("✅ Model, Scaler, Features, and Stats saved.")

✅ Model, Scaler, Features, and Stats saved.
