In [1]:
#Data Fetching

In [2]:
print("Created by Kunal")

Created by Kunal


In [3]:
import pandas as pd

In [7]:
df = pd.read_csv("twin_alumania_chatbot_dataset_v2.csv")

In [8]:
df.head

<bound method NDFrame.head of                                              question  \
0            How can I prepare for campus placements?   
1   What is the best way to find internships durin...   
2   Should I go for higher studies in India or abr...   
3        How do I build a strong resume as a fresher?   
4              How to get funding for a startup idea?   
..                                                ...   
95  Sample student question 96: How can I improve ...   
96  Sample student question 97: How can I improve ...   
97  Sample student question 98: How can I improve ...   
98  Sample student question 99: How can I improve ...   
99  Sample student question 100: How can I improve...   

                                               answer        category  \
0   Start preparing with aptitude tests, mock inte...       Placement   
1   Use LinkedIn, Internshala, and your college pl...      Internship   
2   It depends on your career goals. Abroad gives ...  Higher Stud

In [15]:
# ------------------------------------------------------------------
# Twin Aluminai — Full Workflow with Model Export (Pickle)
# ------------------------------------------------------------------

import numpy as np
import pandas as pd
import os
import pickle
from datetime import datetime

# Plotting
import matplotlib.pyplot as plt

# ML
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ------------------------------------------------------------------
# 0) Utility / Settings
# ------------------------------------------------------------------

SAVE_DIR = "/mnt/data"
os.makedirs(SAVE_DIR, exist_ok=True)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ------------------------------------------------------------------
# 1) Generate Synthetic "Twin Aluminai" Dataset
# ------------------------------------------------------------------

N = 1000  # number of samples

# Feature set (10 core numeric + 2 categorical)
data = pd.DataFrame({
    "age": np.random.randint(18, 60, size=N),
    "experience_years": np.random.randint(0, 30, size=N),
    "stress_level": np.random.uniform(0, 1, size=N),
    "time_pressure": np.random.uniform(0, 1, size=N),
    "task_complexity": np.random.uniform(0, 1, size=N),
    "resource_availability": np.random.uniform(0, 1, size=N),
    "risk_tolerance": np.random.uniform(0, 1, size=N),
    "collaboration_level": np.random.uniform(0, 1, size=N),
    "reliability_score": np.random.uniform(0, 1, size=N),
    "preference_score": np.random.uniform(0, 1, size=N),
})

# Categorical columns
scenarios = ["Procurement", "Scheduling", "QualityCheck", "CustomerSupport", "Inventory"]
regions = ["North", "South", "East", "West", "Central"]
data["scenario"] = np.random.choice(scenarios, size=N, replace=True)
data["region"] = np.random.choice(regions, size=N, replace=True)

# Synthetic label (decision 0/1)
logit = (
    0.02 * data["age"] +
    0.08 * data["experience_years"] -
    1.2 * data["stress_level"] -
    0.9 * data["time_pressure"] -
    0.7 * data["task_complexity"] +
    1.1 * data["resource_availability"] +
    0.6 * data["risk_tolerance"] +
    0.9 * data["collaboration_level"] +
    1.0 * data["reliability_score"] +
    0.7 * data["preference_score"] +
    1.5 * (data["resource_availability"] * data["reliability_score"]) -
    1.1 * (data["stress_level"] * data["task_complexity"]) +
    np.random.normal(0, 0.5, size=N)
)
prob = 1 / (1 + np.exp(-logit))
data["decision"] = (prob > 0.5).astype(int)

# Save CSV
csv_path = os.path.join(SAVE_DIR, "twin_aluminai_dataset.csv")
data.to_csv(csv_path, index=False)
print(f"Dataset saved at: {csv_path}")

# ------------------------------------------------------------------
# 2) Preprocessing & Train/Test Split
# ------------------------------------------------------------------

X = data.drop(columns=["decision"])
y = data["decision"]

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# ------------------------------------------------------------------
# 3) Baseline Models
# ------------------------------------------------------------------

# Logistic Regression
log_reg_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=500, random_state=RANDOM_STATE))
])

# Random Forest
rf_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE))
])

# Cross-validated accuracy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
lr_cv_acc = cross_val_score(log_reg_clf, X_train, y_train, cv=cv, scoring="accuracy").mean()
rf_cv_acc = cross_val_score(rf_clf, X_train, y_train, cv=cv, scoring="accuracy").mean()

# Fit models
log_reg_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

# ------------------------------------------------------------------
# 4) Evaluation Helper
# ------------------------------------------------------------------

def evaluate_model(pipe, X_test, y_test, name="model"):
    y_pred = pipe.predict(X_test)
    try:
        y_prob = pipe.predict_proba(X_test)[:, 1]
    except:
        y_prob = None
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f"--- {name} ---")
    print("Accuracy:", acc)
    if roc is not None:
        print("ROC AUC:", roc)
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", cm)
    return y_pred, y_prob

# Evaluate baselines
_ = evaluate_model(log_reg_clf, X_test, y_test, "LogisticRegression")
_ = evaluate_model(rf_clf, X_test, y_test, "RandomForest")

# ------------------------------------------------------------------
# 5) Hyperparameter Tuning for Random Forest
# ------------------------------------------------------------------

param_dist = {
    "clf__n_estimators": [200, 300, 400, 500],
    "clf__max_depth": [None, 5, 10, 15],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", None]
}

rf_tune = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(random_state=RANDOM_STATE))
])

random_search = RandomizedSearchCV(
    rf_tune,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    n_jobs=-1,
    cv=cv,
    random_state=RANDOM_STATE,
    verbose=0
)

random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_
print("Best RF Params:", random_search.best_params_)

_ = evaluate_model(best_rf, X_test, y_test, "RandomForest_Tuned")

# ------------------------------------------------------------------
# 6) Save Trained Model using Pickle
# ------------------------------------------------------------------

model_path = os.path.join(SAVE_DIR, "twin_aluminai_model.pkl")
with open(model_path, "wb") as f:
    pickle.dump(best_rf, f)

print(f"Trained model exported successfully at: {model_path}")

# ------------------------------------------------------------------
# 7) Optional: Generate Charts (Class Balance, ROC Curve)
# ------------------------------------------------------------------

# Class balance
plt.figure()
data["decision"].value_counts().plot(kind="bar")
plt.title("Class Balance")
plt.xlabel("Decision")
plt.ylabel("Count")
plt.savefig(os.path.join(SAVE_DIR, "class_balance.png"))
plt.close()

# ROC Curve for best model
try:
    y_prob_best = best_rf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob_best)
    plt.figure()
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.title("ROC Curve (RandomForest Tuned)")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.savefig(os.path.join(SAVE_DIR, "roc_curve.png"))
    plt.close()
except:
    pass

print("Charts saved successfully.")

Dataset saved at: /mnt/data\twin_aluminai_dataset.csv
--- LogisticRegression ---
Accuracy: 0.965
ROC AUC: 0.9610829103214891
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.98      0.98      0.98       197

    accuracy                           0.96       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.97      0.96      0.97       200

Confusion Matrix:
 [[  0   3]
 [  4 193]]
--- RandomForest ---
Accuracy: 0.985
ROC AUC: 0.9771573604060914
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.98      1.00      0.99       197

    accuracy                           0.98       200
   macro avg       0.49      0.50      0.50       200
weighted avg       0.97      0.98      0.98       200

Confusion Matrix:
 [[  0   3]
 [  0 197]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best RF Params: {'clf__n_estimators': 200, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 4, 'clf__max_features': None, 'clf__max_depth': 15}
--- RandomForest_Tuned ---
Accuracy: 0.985
ROC AUC: 0.9407783417935702
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.98      1.00      0.99       197

    accuracy                           0.98       200
   macro avg       0.49      0.50      0.50       200
weighted avg       0.97      0.98      0.98       200

Confusion Matrix:
 [[  0   3]
 [  0 197]]
Trained model exported successfully at: /mnt/data\twin_aluminai_model.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Charts saved successfully.


In [16]:
# ------------------------------------------------------------------
# Twin Aluminai — Full Workflow with Model Export (Joblib)
# ------------------------------------------------------------------

import numpy as np
import pandas as pd
import os
import joblib
from datetime import datetime

# Plotting
import matplotlib.pyplot as plt

# ML
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ------------------------------------------------------------------
# 0) Utility / Settings
# ------------------------------------------------------------------

SAVE_DIR = "/mnt/data"
os.makedirs(SAVE_DIR, exist_ok=True)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ------------------------------------------------------------------
# 1) Generate Synthetic "Twin Aluminai" Dataset
# ------------------------------------------------------------------

N = 1000  # number of samples

# Feature set (10 core numeric + 2 categorical)
data = pd.DataFrame({
    "age": np.random.randint(18, 60, size=N),
    "experience_years": np.random.randint(0, 30, size=N),
    "stress_level": np.random.uniform(0, 1, size=N),
    "time_pressure": np.random.uniform(0, 1, size=N),
    "task_complexity": np.random.uniform(0, 1, size=N),
    "resource_availability": np.random.uniform(0, 1, size=N),
    "risk_tolerance": np.random.uniform(0, 1, size=N),
    "collaboration_level": np.random.uniform(0, 1, size=N),
    "reliability_score": np.random.uniform(0, 1, size=N),
    "preference_score": np.random.uniform(0, 1, size=N),
})

# Categorical columns
scenarios = ["Procurement", "Scheduling", "QualityCheck", "CustomerSupport", "Inventory"]
regions = ["North", "South", "East", "West", "Central"]
data["scenario"] = np.random.choice(scenarios, size=N, replace=True)
data["region"] = np.random.choice(regions, size=N, replace=True)

# Synthetic label (decision 0/1)
logit = (
    0.02 * data["age"] +
    0.08 * data["experience_years"] -
    1.2 * data["stress_level"] -
    0.9 * data["time_pressure"] -
    0.7 * data["task_complexity"] +
    1.1 * data["resource_availability"] +
    0.6 * data["risk_tolerance"] +
    0.9 * data["collaboration_level"] +
    1.0 * data["reliability_score"] +
    0.7 * data["preference_score"] +
    1.5 * (data["resource_availability"] * data["reliability_score"]) -
    1.1 * (data["stress_level"] * data["task_complexity"]) +
    np.random.normal(0, 0.5, size=N)
)
prob = 1 / (1 + np.exp(-logit))
data["decision"] = (prob > 0.5).astype(int)

# Save CSV
csv_path = os.path.join(SAVE_DIR, "twin_aluminai_dataset.csv")
data.to_csv(csv_path, index=False)
print(f"Dataset saved at: {csv_path}")

# ------------------------------------------------------------------
# 2) Preprocessing & Train/Test Split
# ------------------------------------------------------------------

X = data.drop(columns=["decision"])
y = data["decision"]

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# ------------------------------------------------------------------
# 3) Baseline Models
# ------------------------------------------------------------------

# Logistic Regression
log_reg_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=500, random_state=RANDOM_STATE))
])

# Random Forest
rf_clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE))
])

# Cross-validated accuracy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
lr_cv_acc = cross_val_score(log_reg_clf, X_train, y_train, cv=cv, scoring="accuracy").mean()
rf_cv_acc = cross_val_score(rf_clf, X_train, y_train, cv=cv, scoring="accuracy").mean()

# Fit models
log_reg_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

# -------------------------------------------------------

Dataset saved at: /mnt/data\twin_aluminai_dataset.csv
