In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Load the dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/Intern_Performance_Dataset.csv")

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Intern_ID,Attendance_%,Tasks_Submitted_%,Feedback_Score,Mentor_Interaction,Performance
0,INT0001,78,55,4,Medium,Medium
1,INT0002,91,79,2,Medium,Medium
2,INT0003,68,54,5,Low,Low
3,INT0004,54,53,3,High,Medium
4,INT0005,82,42,1,Low,Low


In [4]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0,0
Intern_ID,0
Attendance_%,0
Tasks_Submitted_%,0
Feedback_Score,0
Mentor_Interaction,0
Performance,0


In [5]:
# Split the frame into model inputs (X) and the target column (y).
# Intern_ID is an identifier and Performance is the label, so neither belongs in X.
# "Unnamed: 0" appears to be a row index that was saved into the CSV; drop it as
# well so it can never reach a model as a feature. errors="ignore" keeps this
# cell working if that column is absent (e.g. the file is read with index_col=0).
X = (
    df.drop(columns=["Intern_ID", "Performance"])
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
y = df["Performance"]

In [6]:
# Encode the string class labels as integers for the classifiers.
# Fit on the original column rather than on `y` itself so that re-running this
# cell does not re-encode already-encoded integers and overwrite
# label_enc.classes_ (which the classification reports use for class names).
label_enc = LabelEncoder()
y = label_enc.fit_transform(df["Performance"])

In [7]:
# Column groups consumed by the preprocessing step: the numeric columns are
# scaled and the categorical column is one-hot encoded.
numeric = [
    "Attendance_%",
    "Tasks_Submitted_%",
    "Feedback_Score",
]
categorical = ["Mentor_Interaction"]

In [8]:
# Per-column preprocessing: one-hot encode the categorical columns and
# standardize the numeric ones. Columns not listed in either group are dropped
# (ColumnTransformer's default remainder).
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising at transform/predict time.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", StandardScaler(), numeric),
])



In [9]:
# Display the transformer as the cell output to confirm how it is configured.
preprocessor

In [10]:
# Random-forest model: the preprocessing step followed by the classifier,
# wrapped in one pipeline so both are fitted together. random_state pins the
# forest's randomness so results are repeatable.
model_RF = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", RandomForestClassifier(random_state=42)),
    ]
)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the random-forest pipeline on the training rows, then predict the held-out rows.
y_pred = model_RF.fit(X_train, y_train).predict(X_test)

# Per-class precision/recall/F1, labelled with the original class names.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=label_enc.classes_,
    )
)

              precision    recall  f1-score   support

        High       1.00      0.88      0.94        17
         Low       1.00      0.91      0.95        32
      Medium       0.91      1.00      0.95        51

    accuracy                           0.95       100
   macro avg       0.97      0.93      0.95       100
weighted avg       0.95      0.95      0.95       100



In [12]:
# Logistic-regression model. The preprocessor is cloned so this pipeline owns
# its own transformer instance: Pipeline fits its steps in place, so passing the
# same `preprocessor` object here would refit the transformer that model_RF
# relies on every time model_LR is fitted.
# max_iter=1000 gives the solver extra iterations to converge.
model_LR = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("clf", LogisticRegression(max_iter=1000)),
])

In [13]:
# Reuse the train/test partition created for the random-forest run so both
# models are fitted and scored on exactly the same rows. The split that used to
# be repeated here was a copy of that call with identical arguments.
model_LR.fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics with the original class names.
y_pred = model_LR.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_enc.classes_))

              precision    recall  f1-score   support

        High       1.00      0.65      0.79        17
         Low       0.92      0.72      0.81        32
      Medium       0.77      0.96      0.85        51

    accuracy                           0.83       100
   macro avg       0.90      0.78      0.81       100
weighted avg       0.85      0.83      0.83       100



In [14]:
# Pull the fitted random forest out of the pipeline and read its importances,
# one value per column produced by the preprocessing step.
clf = model_RF.named_steps.clf
importances = clf.feature_importances_
print("Feature Importances:", importances)

Feature Importances: [0.09126991 0.13603334 0.06538969 0.24151407 0.28389161 0.18190137]


In [15]:
# Rebuild the feature names in the order the preprocessing step emits them.
# The fitted transformers are read from model_RF's own "preprocess" step so the
# names come from the same fitted object that produced the columns the forest
# was trained on, rather than from the module-level `preprocessor` variable.
fitted_preprocessor = model_RF.named_steps["preprocess"]
encoder = fitted_preprocessor.named_transformers_["cat"]
# Reuse the `categorical` list so the names follow the configured columns.
onehot_features = encoder.get_feature_names_out(categorical)
# The transformer outputs are concatenated in declaration order: "cat", then "num".
all_features = list(onehot_features) + numeric

# Fail with a clear message if names and importances are out of alignment.
assert len(all_features) == len(importances), (
    f"{len(all_features)} feature names for {len(importances)} importances"
)

# Put into a dataframe, ranked from most to least important.
feat_imp = pd.DataFrame({
    "Feature": all_features,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feat_imp)

                     Feature  Importance
4          Tasks_Submitted_%    0.283892
3               Attendance_%    0.241514
5             Feedback_Score    0.181901
1     Mentor_Interaction_Low    0.136033
0    Mentor_Interaction_High    0.091270
2  Mentor_Interaction_Medium    0.065390
