In [1]:
import numpy as np
import pandas as pd

**Importing Dataset**

In [2]:
df = pd.read_csv('data.csv')

**DATA Info & Checking null values**

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [4]:
df.isnull().any()

sl_no             False
gender            False
ssc_p             False
ssc_b             False
hsc_p             False
hsc_b             False
hsc_s             False
degree_p          False
degree_t          False
workex            False
etest_p           False
specialisation    False
mba_p             False
status            False
salary             True
dtype: bool

**Fill null values if there is any**

In [5]:
df = df.fillna(method='ffill')

In [6]:
df

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,250000.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


**Preprocessing the Dataset**

In [7]:
df.specialisation.unique()

array(['Mkt&HR', 'Mkt&Fin'], dtype=object)

In [8]:
df['gender'].replace(['M', 'F'], [1, 0], inplace=True)
df['ssc_b'].replace(['Others', 'Central'], [1, 2], inplace=True)
df['hsc_b'].replace(['Others', 'Central'], [1, 2], inplace=True)
df['hsc_s'].replace(['Commerce', 'Science', 'Arts'], [1, 3, 2], inplace=True)
df['degree_t'].replace(['Sci&Tech', 'Comm&Mgmt', 'Others'], [3, 2, 1], inplace=True)
df['workex'].replace(['Yes', 'No'], [1, 0], inplace=True)
df['specialisation'].replace(['Mkt&HR', 'Mkt&Fin'], [2, 1], inplace=True)
df['status'].replace(['Placed', 'Not Placed'], [1, 0], inplace=True)

In [9]:
X = df.iloc[:, range(1, 13)].values

y = df.iloc[:, 13].values

**Splitting Training and Testing Data Entries**

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

**Standardizing Data**

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

**K Nearest Neighbors Classification**

In [12]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 14, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)


KNeighborsClassifier(n_neighbors=14)

In [13]:
y_pred_knn = classifier.predict(X_test)

In [14]:
print("Actual Result: ", y_test)
print("Predicted Result: ", y_pred_knn)

Actual Result:  [0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1]
Predicted Result:  [1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1]


In [15]:
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error, r2_score

In [16]:
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)

[[ 7 10]
 [ 0 37]]


In [17]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       1.00      0.41      0.58        17
           1       0.79      1.00      0.88        37

    accuracy                           0.81        54
   macro avg       0.89      0.71      0.73        54
weighted avg       0.85      0.81      0.79        54



**Evaluation Metrics**

In [18]:
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred_knn))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_knn))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test, y_pred_knn)))
r2 = r2_score(y_test, y_pred_knn)
print("R2 Score: ", r2)
print("Root Mean Squared Log Error: ",np.log(np.sqrt(mean_squared_error(y_test,y_pred_knn))))
n = 54
k = 12
adj_r2_score = 1 - ((1-r2)*(n-1)/(n-k-1))
print("Adjusted R2: ", adj_r2_score)

Mean Squared Error:  0.18518518518518517
Mean Absolute Error:  0.18518518518518517
Root Mean Squared Error:  0.4303314829119352
R2 Score:  0.14149443561208286
Root Mean Squared Log Error:  -0.8431994767851144
Adjusted R2:  -0.10977548567218554


**Naive Bayes Classification**

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [20]:
nbclassifier = GaussianNB()
nbclassifier.fit(X_train, y_train)

GaussianNB()

In [21]:
y_pred_nb = nbclassifier.predict(X_test)

In [22]:
print("Actual Result: ", y_test)
print("Predicted Result: ", y_pred_nb)

Actual Result:  [0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1]
Predicted Result:  [1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1
 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1]


In [23]:
print(confusion_matrix(y_test, y_pred_nb))

[[11  6]
 [ 5 32]]


In [24]:
print(classification_report(y_test,y_pred_nb))

              precision    recall  f1-score   support

           0       0.69      0.65      0.67        17
           1       0.84      0.86      0.85        37

    accuracy                           0.80        54
   macro avg       0.76      0.76      0.76        54
weighted avg       0.79      0.80      0.79        54



**Evaluation Metrics**

In [25]:
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred_nb))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_nb))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test, y_pred_nb)))
r2 = r2_score(y_test, y_pred_nb)
print("R2 Score: ", r2)
print("Root Mean Squared Log Error: ",np.log(np.sqrt(mean_squared_error(y_test,y_pred_nb))))
n = 54
k = 12
adj_r2_score = 1 - ((1-r2)*(n-1)/(n-k-1))
print("Adjusted R2: ", adj_r2_score)

Mean Squared Error:  0.2037037037037037
Mean Absolute Error:  0.2037037037037037
Root Mean Squared Error:  0.45133546692422
R2 Score:  0.055643879173291166
Root Mean Squared Log Error:  -0.7955443868829519
Adjusted R2:  -0.22075303423940418


**Random Forest Classification**

In [26]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [27]:
y_pred_rf = classifier.predict(X_test)

In [28]:
print("Actual Result: ", y_test)
print("Predicted Result: ", y_pred_rf)

Actual Result:  [0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1]
Predicted Result:  [1 1 1 0 1 0 1 1 1 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [29]:
print(confusion_matrix(y_test, y_pred_rf))

[[ 7 10]
 [ 5 32]]


In [30]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.58      0.41      0.48        17
           1       0.76      0.86      0.81        37

    accuracy                           0.72        54
   macro avg       0.67      0.64      0.65        54
weighted avg       0.71      0.72      0.71        54



**Evaluation Metrics**

In [31]:
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred_rf))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_rf))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
r2 = r2_score(y_test, y_pred_rf)
print("R2 Score: ", r2)
print("Root Mean Squared Log Error: ",np.log(np.sqrt(mean_squared_error(y_test,y_pred_rf))))
n = 54
k = 12
adj_r2_score = 1 - ((1-r2)*(n-1)/(n-k-1))
print("Adjusted R2: ", adj_r2_score)

Mean Squared Error:  0.2777777777777778
Mean Absolute Error:  0.2777777777777778
Root Mean Squared Error:  0.5270462766947299
R2 Score:  -0.2877583465818758
Root Mean Squared Log Error:  -0.6404669227310321
Adjusted R2:  -0.6646632285082783


**Logistic Regression**

In [32]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [33]:
y_pred_lr = classifier.predict(X_test)

In [34]:
print("Actual Result: ", y_test)
print("Predicted Result: ", y_pred_lr)

Actual Result:  [0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1]
Predicted Result:  [0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1]


In [35]:
print(confusion_matrix(y_test, y_pred_lr))

[[11  6]
 [ 3 34]]


In [36]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.79      0.65      0.71        17
           1       0.85      0.92      0.88        37

    accuracy                           0.83        54
   macro avg       0.82      0.78      0.80        54
weighted avg       0.83      0.83      0.83        54



**Evaluation Metrics**

In [37]:
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred_lr))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_lr))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
r2 = r2_score(y_test, y_pred_lr)
print("R2 Score: ", r2)
print("Root Mean Squared Log Error: ",np.log(np.sqrt(mean_squared_error(y_test,y_pred_lr))))
n = 54
k = 12
adj_r2_score = 1 - ((1-r2)*(n-1)/(n-k-1))
print("Adjusted R2: ", adj_r2_score)

Mean Squared Error:  0.16666666666666666
Mean Absolute Error:  0.16666666666666666
Root Mean Squared Error:  0.408248290463863
R2 Score:  0.22734499205087455
Root Mean Squared Log Error:  -0.8958797346140275
Adjusted R2:  0.001202062895032996


# **Among these models Logistic Regression is the best model as it has the high Accuracy Metric**