In [11]:

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report


data = pd.read_csv("stud.csv")

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

data.head()

data.shape
data.info()
data.isnull().sum()


label_encoder = LabelEncoder()
data['Courses_label'] = label_encoder.fit_transform(data['Courses'])

y = data['Courses_label']
data['Courses_label'].value_counts()


categorical_columns = [
    'Drawing','Dancing','Singing','Sports','Video Game','Acting','Travelling',
    'Gardening','Animals','Photography','Teaching','Exercise','Coding',
    'Electricity Components','Mechanic Parts','Computer Parts','Researching',
    'Architecture','Historic Collection','Botany','Zoology','Physics',
    'Accounting','Economics','Sociology','Geography','Psycology','History',
    'Science','Bussiness Education','Chemistry','Mathematics','Biology',
    'Makeup','Designing','Content writing','Crafting','Literature','Reading',
    'Cartooning','Debating','Asrtology','Hindi','French','English','Urdu',
    'Other Language','Solving Puzzles','Gymnastics','Yoga','Engeeniering',
    'Doctor','Pharmisist','Cycling','Knitting','Director','Journalism',
    'Bussiness','Listening Music'
]

label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


X = data.drop(['Courses', 'Courses_label'], axis=1)


rf_selector = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

rf_selector.fit(X, y)

selector = SelectFromModel(
    rf_selector,
    threshold='median',
    prefit=True
)

X_selected = selector.transform(X.values)


print("Original number of features:", X.shape[1])
print("Selected number of features:", X_selected.shape[1])


X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3535 entries, 0 to 3534
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Drawing                 3535 non-null   int64 
 1   Dancing                 3535 non-null   int64 
 2   Singing                 3535 non-null   int64 
 3   Sports                  3535 non-null   int64 
 4   Video Game              3535 non-null   int64 
 5   Acting                  3535 non-null   int64 
 6   Travelling              3535 non-null   int64 
 7   Gardening               3535 non-null   int64 
 8   Animals                 3535 non-null   int64 
 9   Photography             3535 non-null   int64 
 10  Teaching                3535 non-null   int64 
 11  Exercise                3535 non-null   int64 
 12  Coding                  3535 non-null   int64 
 13  Electricity Components  3535 non-null   int64 
 14  Mechanic Parts          3535 non-null   int64 
 15  Comp

In [12]:

rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:

y_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9915134370579916
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        20
           2       1.00      0.90      0.95        20
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        20
           5       1.00      0.90      0.95        20
           6       1.00      1.00      1.00        21
           7       1.00      1.00      1.00        21
           8       1.00      1.00      1.00        20
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        21
          11       1.00      1.00      1.00        20
          12       1.00      1.00      1.00        20
          13       1.00      1.00      1.00        20
          14       1.00      1.00      1.00        21
          15       1.00      1.00      1.00        20
          16       1.00      1.00      1.00        2

In [20]:


mcq_careers = [ "Dancing", "Electrician"]

print("MCQ RESULT")
print("Personality Type: Realistic")
print("MCQ Career Suggestions:", mcq_careers)

mcq_student = pd.DataFrame(
    np.random.randint(0, 2, size=(1, X.shape[1])),
    columns=X.columns
)


mcq_student_selected = selector.transform(mcq_student.values)


probs = rf_model.predict_proba(mcq_student_selected)[0]
all_courses = label_encoder.inverse_transform(rf_model.classes_)


final_recommendations = []

for course, prob in zip(all_courses, probs):
    for mcq in mcq_careers:
        if mcq.lower() in course.lower():
            final_recommendations.append((course, prob))


print("\nFINAL CAREER RECOMMENDATIONS (ML + MCQ):")

if final_recommendations:
    final_recommendations = sorted(
        final_recommendations, key=lambda x: x[1], reverse=True
    )
    for course, prob in final_recommendations:
        print(f"{course}  (Confidence: {prob:.2f})")
else:
    print("No direct MCQ match found.")
    print("Showing top ML-based career predictions instead:\n")

    top_indices = probs.argsort()[-5:][::-1]
    for i in top_indices:
        print(f"{all_courses[i]}  (Confidence: {probs[i]:.2f})")


MCQ RESULT
Personality Type: Realistic
MCQ Career Suggestions: ['Dancing', 'Electrician']

FINAL CAREER RECOMMENDATIONS (ML + MCQ):
No direct MCQ match found.
Showing top ML-based career predictions instead:

B.Sc.- Physics  (Confidence: 0.20)
Integrated Law Course- BA + LL.B  (Confidence: 0.17)
B.Tech.-Computer Science and Engineering  (Confidence: 0.16)
B.Tech.-Civil Engineering  (Confidence: 0.13)
B.Tech.-Electrical and Electronics Engineering  (Confidence: 0.13)
