In [3]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report


# ==================================================
# 1. LOAD DATASET
# ==================================================
# Lift pandas' display truncation so printed frames show every
# column and every row.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the student survey from the working directory.
data = pd.read_csv("stud.csv")

print("Dataset loaded successfully")
print(f"Shape: {data.shape}")


# ==================================================
# 2. TARGET ENCODING
# ==================================================
# Learn the course-name -> integer mapping, then store the encoded
# target next to the original column.
label_encoder = LabelEncoder().fit(data['Courses'])
data['Courses_label'] = label_encoder.transform(data['Courses'])

# Integer-encoded target used by every model below.
y = data['Courses_label']
print("\nTarget Classes:", data['Courses'].unique())


# ==================================================
# 3. FEATURE ENCODING
# ==================================================
# Survey columns that are label-encoded in place. The names (including
# their spelling) must match the CSV header exactly.
categorical_columns = [
    'Drawing', 'Dancing', 'Singing', 'Sports', 'Video Game',
    'Acting', 'Travelling', 'Gardening', 'Animals', 'Photography',
    'Teaching', 'Exercise', 'Coding', 'Electricity Components',
    'Mechanic Parts', 'Computer Parts', 'Researching', 'Architecture',
    'Historic Collection', 'Botany', 'Zoology', 'Physics', 'Accounting',
    'Economics', 'Sociology', 'Geography', 'Psycology', 'History',
    'Science', 'Bussiness Education', 'Chemistry', 'Mathematics',
    'Biology', 'Makeup', 'Designing', 'Content writing', 'Crafting',
    'Literature', 'Reading', 'Cartooning', 'Debating', 'Asrtology',
    'Hindi', 'French', 'English', 'Urdu', 'Other Language',
    'Solving Puzzles', 'Gymnastics', 'Yoga', 'Engeeniering', 'Doctor',
    'Pharmisist', 'Cycling', 'Knitting', 'Director', 'Journalism',
    'Bussiness', 'Listening Music',
]

# One fitted encoder per column, keyed by column name.
label_encoders = {
    column: LabelEncoder().fit(data[column])
    for column in categorical_columns
}

# Replace each column with its integer codes.
for col, le in label_encoders.items():
    data[col] = le.transform(data[col])


# ==================================================
# 4. FEATURE MATRIX
# ==================================================
# Everything except the raw and the encoded target is a feature.
X = data.drop(columns=['Courses', 'Courses_label'])


# ==================================================
# 5. TRAIN-TEST SPLIT
# ==================================================
# Split the raw feature matrix BEFORE feature selection. Fitting the
# selector on every row would let the held-out rows influence which
# features are kept, leaking test information into the model and making
# the accuracy reported below optimistic.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# ==================================================
# 6. FEATURE SELECTION USING RANDOM FOREST (TRAINING ROWS ONLY)
# ==================================================
rf_selector = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

# Feature importances are learned from the training rows only.
rf_selector.fit(X_train_full, y_train)

# threshold='median' keeps the features whose importance is at or above
# the median importance.
selector = SelectFromModel(
    rf_selector,
    threshold='median',
    prefit=True
)

# Apply the same column mask to both partitions.
X_train = selector.transform(X_train_full.values)
X_test = selector.transform(X_test_full.values)

# Reduced version of the full matrix, kept for the summary below and for
# any later code that still refers to X_selected.
X_selected = selector.transform(X.values)

print("\nOriginal number of features:", X.shape[1])
print("Selected number of features:", X_selected.shape[1])


# ==================================================
# 7. TRAIN FINAL RANDOM FOREST MODEL
# ==================================================
# fit() returns the estimator itself, so construction and training can
# be chained.
rf_model = RandomForestClassifier(
    n_estimators=200, random_state=42
).fit(X_train, y_train)


# ==================================================
# 8. MODEL EVALUATION
# ==================================================
# Score the model on the held-out partition.
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nModel Accuracy:", accuracy)
print("\nClassification Report:\n", report, sep="\n")


# ==================================================
# 9. SAVE MODEL & OBJECTS USING JOBLIB
# ==================================================
# Everything a separate process needs to reproduce a prediction:
#   rf_model.pkl        - the trained classifier
#   selector.pkl        - the feature-selection mask
#   label_encoder.pkl   - maps predicted integers back to course names
#   label_encoders.pkl  - per-feature encoders, so raw survey answers can
#                         be encoded the same way as the training data
#   feature_columns.pkl - column order expected by the selector
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(selector, "selector.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

print("\nModel and supporting files saved successfully!")


# ==================================================
# 10. PREDICTION FUNCTION (CALLED BY MCQ FILE)
# ==================================================
def predict_from_mcq(mcq_careers, student_features=None):
    """
    Recommend courses whose names match the careers chosen in the MCQ.

    mcq_careers: list like ["Dancing", "Electrician"]. A single string is
        treated as a one-item list; None and blank entries are ignored.
        Matching is a case-insensitive substring test against course names.
    student_features: optional dict mapping a training column name to its
        numeric value, encoded the same way as the training data. Columns
        not present in the dict default to 0. When omitted, a random 0/1
        answer sheet is generated (the original behaviour), so the
        confidences vary from call to call.
    returns: list of (course, confidence), highest confidence first, with
        each course listed at most once. If no course matches any keyword,
        the five most probable courses are returned instead.
    """

    # Normalise the keyword input: iterating a bare string would yield
    # single characters, and an empty keyword would match every course.
    if mcq_careers is None:
        mcq_careers = []
    elif isinstance(mcq_careers, str):
        mcq_careers = [mcq_careers]
    cleaned = (str(career).strip().lower() for career in mcq_careers)
    keywords = [keyword for keyword in cleaned if keyword]

    # Build a single-row input in the training column order.
    if student_features is None:
        # NOTE(review): placeholder input. These predictions do not reflect
        # any real student; pass student_features for meaningful output.
        mcq_student = np.random.randint(0, 2, size=(1, len(X.columns)))
    else:
        mcq_student = np.array(
            [[student_features.get(column, 0) for column in X.columns]]
        )

    # Apply feature selection
    mcq_student_selected = selector.transform(mcq_student)

    # Predict probabilities
    probs = rf_model.predict_proba(mcq_student_selected)[0]
    all_courses = label_encoder.inverse_transform(rf_model.classes_)

    # Filter ML predictions using MCQ careers. any() keeps a course from
    # being listed once per matching keyword.
    final_recommendations = [
        (course, prob)
        for course, prob in zip(all_courses, probs)
        if any(keyword in course.lower() for keyword in keywords)
    ]

    if final_recommendations:
        # Same ordering as the fallback below: most confident first.
        final_recommendations.sort(key=lambda pair: pair[1], reverse=True)
        return final_recommendations

    # Fallback: show top ML predictions
    top_indices = probs.argsort()[-5:][::-1]
    return [(all_courses[i], probs[i]) for i in top_indices]


Dataset loaded successfully
Shape: (3535, 60)

Target Classes: ['BBA- Bachelor of Business Administration'
 'BEM- Bachelor of Event Management' 'Integrated Law Course- BA + LL.B'
 'BJMC- Bachelor of Journalism and Mass Communication'
 'BFD- Bachelor of Fashion Designing' 'BBS- Bachelor of Business Studies'
 'BTTM- Bachelor of Travel and Tourism Management'
 'BVA- Bachelor of Visual Arts' 'BA in History'
 'B.Arch- Bachelor of Architecture'
 'BCA- Bachelor of Computer Applications' 'B.Sc.- Information Technology'
 'B.Sc- Nursing' 'BPharma- Bachelor of Pharmacy'
 'BDS- Bachelor of Dental Surgery' 'Animation, Graphics and Multimedia'
 'B.Sc- Applied Geology' 'B.Sc.- Physics' 'B.Sc. Chemistry'
 'B.Sc. Mathematics' 'B.Tech.-Civil Engineering'
 'B.Tech.-Computer Science and Engineering'
 'B.Tech.-Electronics and Communication Engineering'
 'B.Tech.-Electrical and Electronics Engineering'
 'B.Tech.-Mechanical Engineering' 'B.Com- Bachelor of Commerce'
 'BA in Economics' 'CA- Chartered Accounta