### Step 1: Import the required modules.

In [1]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Step 2: Now we read the dataset that we are going to use for the analysis and then inspect it.

In [2]:
# reading the file
# The path is relative, so the CSV must sit in the notebook's working directory.
dataset = pd.read_csv('Placement_Data_Full_Class.csv')
# A bare name on the last line of a cell renders the frame as the cell output.
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,CGPA,degree_t,workex,etest_p,specialisation,Masters,status
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.00,Mkt&HR,58.80,Placed
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.50,Mkt&Fin,66.28,Placed
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.00,Mkt&Fin,57.80,Placed
3,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.80,Mkt&Fin,55.50,Placed
4,8,M,82.00,Central,64.00,Central,Science,66.00,Sci&Tech,Yes,67.00,Mkt&Fin,62.14,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,199,F,67.00,Central,70.00,Central,Commerce,65.00,Others,No,88.00,Mkt&HR,71.96,Not Placed
211,202,M,54.20,Central,63.00,Others,Science,58.00,Comm&Mgmt,No,79.00,Mkt&HR,58.44,Not Placed
212,207,M,41.00,Central,42.00,Central,Science,60.00,Comm&Mgmt,No,97.00,Mkt&Fin,53.39,Not Placed
213,209,F,43.00,Central,60.00,Others,Science,65.00,Comm&Mgmt,No,92.66,Mkt&HR,62.92,Not Placed


### Step 3: Now we will drop the columns that are not needed.

In [10]:

# Drop the serial-number column (and the salary column, if the CSV has one).
# `sl_no` is only a row identifier; if it stays in the frame it becomes an
# extra feature, and the 12-value sample passed to clf.predict() later no
# longer matches the number of features the model was trained on.
# `salary` is not among the columns shown when the file is loaded above, so
# errors='ignore' lets this cell succeed either way and keeps it safe to re-run.
dataset = dataset.drop(columns=['sl_no', 'salary'], errors='ignore')

In [9]:
# Print the column labels currently held by `dataset`.
print(dataset.columns)

Index(['gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'CGPA',
       'degree_t', 'workex', 'etest_p', 'specialisation', 'Masters', 'status'],
      dtype='object')


### Step 4: Now before moving forward we need to pre-process and transform our data. For that, we will use the astype() method on the text-valued columns and change their datatype to category.

In [11]:
# Cast every text-valued column to the pandas 'category' dtype so that it
# can be label-encoded in the next step.
categorical_cols = [
    "gender",
    "ssc_b",
    "hsc_b",
    "hsc_s",
    "degree_t",
    "workex",
    "specialisation",
    "status",
]
for col in categorical_cols:
    dataset[col] = dataset[col].astype('category')

# show the resulting dtype of each column
dataset.dtypes


gender            category
ssc_p              float64
ssc_b             category
hsc_p              float64
hsc_b             category
hsc_s             category
CGPA               float64
degree_t          category
workex            category
etest_p            float64
specialisation    category
Masters            float64
status            category
dtype: object

### Step 5: Now we will use the `.cat.codes` accessor on these categorical columns to convert their text values to numerical values.


In [12]:
# Replace each categorical column with its integer category codes
# (for example, status: 'Not Placed' -> 0, 'Placed' -> 1).
# NOTE: `.cat` only exists on category-typed columns, so this cell cannot be
# run a second time without first re-running the astype('category') step.
encoded_cols = (
    "gender",
    "ssc_b",
    "hsc_b",
    "hsc_s",
    "degree_t",
    "workex",
    "specialisation",
    "status",
)
for col in encoded_cols:
    dataset[col] = dataset[col].cat.codes

# display dataset
dataset


Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,CGPA,degree_t,workex,etest_p,specialisation,Masters,status
0,1,67.00,1,91.00,1,1,58.00,2,0,55.00,1,58.80,1
1,1,79.33,0,78.33,1,2,77.48,2,1,86.50,0,66.28,1
2,1,65.00,0,68.00,0,0,64.00,0,0,75.00,0,57.80,1
3,1,85.80,0,73.60,0,1,73.30,0,0,96.80,0,55.50,1
4,1,82.00,0,64.00,0,2,66.00,2,1,67.00,0,62.14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0,67.00,0,70.00,0,1,65.00,1,0,88.00,1,71.96,0
211,1,54.20,0,63.00,1,2,58.00,0,0,79.00,1,58.44,0
212,1,41.00,0,42.00,0,2,60.00,0,0,97.00,0,53.39,0
213,0,43.00,0,60.00,1,2,65.00,0,0,92.66,1,62.92,0


### Step 6: Now we split the dataset into features and labels using the `iloc` indexer:

In [13]:
# Split the frame by position: every column except the last is a feature,
# and the last column holds the label.
feature_part = dataset.iloc[:, :-1]
label_part = dataset.iloc[:, -1]
X, Y = feature_part.values, label_part.values

# display dependent variables
Y


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

### Step 7: Now we will split the dataset into train and test data; the test data will be used later to check the accuracy of the model.

In [14]:
# dividing the data into train and test (20% of the rows are held out)
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore every score computed
# from this split, reproducible across kernel restarts. The value 0 matches
# the seed given to the classifier.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# display dataset
dataset.head()


Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,CGPA,degree_t,workex,etest_p,specialisation,Masters,status
0,1,67.0,1,91.0,1,1,58.0,2,0,55.0,1,58.8,1
1,1,79.33,0,78.33,1,2,77.48,2,1,86.5,0,66.28,1
2,1,65.0,0,68.0,0,0,64.0,0,0,75.0,0,57.8,1
3,1,85.8,0,73.6,0,1,73.3,0,0,96.8,0,55.5,1
4,1,82.0,0,64.0,0,2,66.0,2,1,67.0,0,62.14,1


### Step 8: Now we need to train our model. For this we will import the LogisticRegression class from the sklearn module and create a classifier with it. Then we will check the accuracy of the model.



In [15]:
# Build a logistic-regression classifier, then fit it on the training split
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=0)
clf.fit(X_train, Y_train)

# accuracy on the held-out test split
clf.score(X_test, Y_test)


0.8837209302325582

### Step 9: Once we have trained the model, we will check it by giving it some hand-picked sample values:



In [16]:
# Predict the placement status for one hand-written sample.
# The values must follow the order of the feature columns printed in Step 3.
clf.predict([[
    0,   # gender
    87,  # ssc_p
    0,   # ssc_b
    95,  # hsc_p
    0,   # hsc_b
    2,   # hsc_s
    78,  # CGPA
    2,   # degree_t
    0,   # workex
    0,   # etest_p
    1,   # specialisation
    0,   # Masters
]])


array([1], dtype=int8)

### Step 10: To gain a more nuanced understanding of our model’s performance we need to make a confusion matrix. 

### For a binary classifier, a confusion matrix is a table with two rows and two columns that reports the number of false positives, false negatives, true positives, and true negatives.

### The confusion matrix function takes two arguments: the actual labels of the test set (`Y_test`) and the labels predicted by the classifier. We store the predicted labels in `Y_pred` as follows:

In [17]:
# creating a Y_pred for test data
# Predicted class label for each row of X_test.
Y_pred = clf.predict(X_test)

# display predicted values
Y_pred


array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0],
      dtype=int8)

### Step 11: Finally, now that we have `Y_pred`, we can generate the confusion matrix:

In [18]:
# Evaluate the classifier on the held-out test split
from sklearn.metrics import confusion_matrix, accuracy_score

# confusion matrix: rows are the actual classes, columns the predicted classes
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred))

# fraction of test rows whose predicted label matches the actual label
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))


[[ 8  2]
 [ 3 30]]
0.8837209302325582
