In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv("Copy-of-Student-Employability-Datasets.csv")

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
data.head()

Unnamed: 0,Name of Student,GENERAL APPEARANCE,MANNER OF SPEAKING,PHYSICAL CONDITION,MENTAL ALERTNESS,SELF-CONFIDENCE,ABILITY TO PRESENT IDEAS,COMMUNICATION SKILLS,certifications,workshops,Student Performance Rating,CLASS,Suggested Job Role
0,Student 1,4,5,4,5,5,5,5,shell programming,cloud computing,5,Employable,Database Developer
1,Student 2,4,4,4,4,4,4,3,machine learning,database security,5,Employable,Portal Administrator
2,Student 3,4,3,3,3,3,3,2,app development,web technologies,5,LessEmployable,Portal Administrator
3,Student 4,3,3,3,2,3,3,3,python,data science,5,LessEmployable,Systems Security Administrator
4,Student 5,4,4,3,3,4,4,3,app development,cloud computing,5,Employable,Business Systems Analyst


In [5]:
data.tail()

Unnamed: 0,Name of Student,GENERAL APPEARANCE,MANNER OF SPEAKING,PHYSICAL CONDITION,MENTAL ALERTNESS,SELF-CONFIDENCE,ABILITY TO PRESENT IDEAS,COMMUNICATION SKILLS,certifications,workshops,Student Performance Rating,CLASS,Suggested Job Role
2977,Student 2996,4,3,3,3,3,3,2,distro making,system designing,5,Employable,Information Technology Auditor
2978,Student 2997,3,4,4,4,4,4,4,r programming,system designing,5,Employable,Project Manager
2979,Student 2998,4,5,4,5,4,4,4,app development,database security,5,Employable,Technical Support
2980,Student 2999,4,4,4,3,4,4,3,distro making,data science,5,LessEmployable,Software Quality Assurance (QA) / Testing
2981,Student 3000,4,4,4,4,3,4,4,information security,data science,5,Employable,Technical Support


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2982 entries, 0 to 2981
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Name of Student             2982 non-null   object
 1   GENERAL APPEARANCE          2982 non-null   int64 
 2   MANNER OF SPEAKING          2982 non-null   int64 
 3   PHYSICAL CONDITION          2982 non-null   int64 
 4   MENTAL ALERTNESS            2982 non-null   int64 
 5   SELF-CONFIDENCE             2982 non-null   int64 
 6   ABILITY TO PRESENT IDEAS    2982 non-null   int64 
 7   COMMUNICATION SKILLS        2982 non-null   int64 
 8   certifications              2982 non-null   object
 9   workshops                   2982 non-null   object
 10  Student Performance Rating  2982 non-null   int64 
 11  CLASS                       2982 non-null   object
 12  Suggested Job Role          2982 non-null   object
dtypes: int64(8), object(5)
memory usage: 303.0+ KB


In [7]:
data.describe()

Unnamed: 0,GENERAL APPEARANCE,MANNER OF SPEAKING,PHYSICAL CONDITION,MENTAL ALERTNESS,SELF-CONFIDENCE,ABILITY TO PRESENT IDEAS,COMMUNICATION SKILLS,Student Performance Rating
count,2982.0,2982.0,2982.0,2982.0,2982.0,2982.0,2982.0,2982.0
mean,4.246814,3.884641,3.972166,3.962777,3.910798,3.813883,3.525486,4.610664
std,0.678501,0.757013,0.744135,0.781982,0.807602,0.73939,0.743881,0.692845
min,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0
25%,4.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0
50%,4.0,4.0,4.0,4.0,4.0,4.0,3.0,5.0
75%,5.0,4.0,5.0,5.0,5.0,4.0,4.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [8]:
data.shape

(2982, 13)

In [9]:
data.columns

Index(['Name of Student', 'GENERAL APPEARANCE', 'MANNER OF SPEAKING',
       'PHYSICAL CONDITION', 'MENTAL ALERTNESS', 'SELF-CONFIDENCE',
       'ABILITY TO PRESENT IDEAS', 'COMMUNICATION SKILLS', 'certifications',
       'workshops', 'Student Performance Rating', 'CLASS',
       'Suggested Job Role'],
      dtype='object')

In [10]:
print(data.dtypes)


Name of Student               object
GENERAL APPEARANCE             int64
MANNER OF SPEAKING             int64
PHYSICAL CONDITION             int64
MENTAL ALERTNESS               int64
SELF-CONFIDENCE                int64
ABILITY TO PRESENT IDEAS       int64
COMMUNICATION SKILLS           int64
certifications                object
workshops                     object
Student Performance Rating     int64
CLASS                         object
Suggested Job Role            object
dtype: object


In [11]:
data.duplicated().sum()

0

In [12]:
data.isnull().sum()

Name of Student               0
GENERAL APPEARANCE            0
MANNER OF SPEAKING            0
PHYSICAL CONDITION            0
MENTAL ALERTNESS              0
SELF-CONFIDENCE               0
ABILITY TO PRESENT IDEAS      0
COMMUNICATION SKILLS          0
certifications                0
workshops                     0
Student Performance Rating    0
CLASS                         0
Suggested Job Role            0
dtype: int64

In [13]:
data['Name of Student'] = pd.to_numeric(data['Name of Student'], errors='coerce')

In [14]:
label_encoder = LabelEncoder()

In [15]:
for column in data.columns:
    if data[column].dtype == 'O':
        data[column] = label_encoder.fit_transform(data[column])

In [16]:
target_variable = 'CLASS'

In [17]:
numeric_data = data.select_dtypes(include=['number'])

In [18]:
if target_variable not in numeric_data.columns:
    raise ValueError(f"Target variable '{target_variable}' not found in the dataset.")


In [19]:
X = numeric_data.drop([target_variable], axis=1)
y = numeric_data[target_variable]


In [20]:
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
model = RandomForestClassifier()


In [23]:
model.fit(X_train, y_train)

In [24]:
predictions = model.predict(X_test)

In [25]:
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.9011725293132329
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.92       346
           1       0.90      0.86      0.88       251

    accuracy                           0.90       597
   macro avg       0.90      0.90      0.90       597
weighted avg       0.90      0.90      0.90       597



In [26]:
from sklearn.metrics import confusion_matrix

In [27]:
print("Confusion matrix:\n",confusion_matrix(y_test,predictions))

Confusion matrix:
 [[322  24]
 [ 35 216]]


In [31]:
from sklearn.metrics import f1_score

In [32]:
print("F1_score:\n",f1_score(y_test,predictions))

F1_score:
 0.879837067209776


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)


logistic_model = LogisticRegression()


logistic_model.fit(X_train, y_train)


In [34]:
y_pred = model.predict(X_test)

In [35]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9849246231155779
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       332
           1       0.98      0.99      0.98       265

    accuracy                           0.98       597
   macro avg       0.98      0.99      0.98       597
weighted avg       0.98      0.98      0.98       597



In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:

tree_model = DecisionTreeClassifier(random_state=42)


In [38]:

tree_model.fit(X_train, y_train)


In [39]:

y_pred = tree_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report_result)


Accuracy: 0.8793969849246231
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       332
           1       0.85      0.88      0.87       265

    accuracy                           0.88       597
   macro avg       0.88      0.88      0.88       597
weighted avg       0.88      0.88      0.88       597



In [40]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [41]:

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)


In [42]:

svm_model = SVC(kernel='linear', random_state=42)


In [43]:

svm_model.fit(X_train, y_train)


In [44]:

y_pred = svm_model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report_result)


Accuracy: 0.5561139028475712
Classification Report:
               precision    recall  f1-score   support

           0       0.56      1.00      0.71       332
           1       0.00      0.00      0.00       265

    accuracy                           0.56       597
   macro avg       0.28      0.50      0.36       597
weighted avg       0.31      0.56      0.40       597

