In [1]:
#!pip install pandas==1.3.5

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "iframe"

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
pd.__version__

'2.2.3'

In [4]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl.metadata (31 kB)
Collecting tqdm (from kagglehub)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, kagglehub
Successfully installed kagglehub-0.3.10 tqdm-4.67.1


In [10]:
# download the dataset using kagglehub
import kagglehub

# Download latest version
datasetPath = kagglehub.dataset_download("tejashvi14/engineering-placements-prediction")

csvFilePath = f"{datasetPath}/collegePlace.csv"
# print("Path to dataset files:", path)

In [26]:
df = pd.read_csv(csvFilePath)


In [27]:
df.shape

(2966, 8)

In [28]:
df.head()

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,Male,Electronics And Communication,1,8,1,1,1
1,21,Female,Computer Science,0,7,1,1,1
2,22,Female,Information Technology,1,6,0,0,1
3,21,Male,Information Technology,0,8,0,1,1
4,22,Male,Mechanical,0,8,1,0,1


In [29]:
df.tail()

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
2961,23,Male,Information Technology,0,7,0,0,0
2962,23,Male,Mechanical,1,7,1,0,0
2963,22,Male,Information Technology,1,7,0,0,0
2964,22,Male,Computer Science,1,7,0,0,0
2965,23,Male,Civil,0,8,0,0,1


In [30]:
df.sample(4)

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
492,21,Male,Information Technology,1,8,0,0,1
1658,22,Male,Electronics And Communication,1,8,0,0,1
2706,20,Male,Mechanical,0,7,1,1,0
1302,24,Male,Information Technology,1,6,0,1,1


In [31]:
df.dtypes

Age                   int64
Gender               object
Stream               object
Internships           int64
CGPA                  int64
Hostel                int64
HistoryOfBacklogs     int64
PlacedOrNot           int64
dtype: object

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                2966 non-null   int64 
 1   Gender             2966 non-null   object
 2   Stream             2966 non-null   object
 3   Internships        2966 non-null   int64 
 4   CGPA               2966 non-null   int64 
 5   Hostel             2966 non-null   int64 
 6   HistoryOfBacklogs  2966 non-null   int64 
 7   PlacedOrNot        2966 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 185.5+ KB


In [33]:
df.describe()

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
count,2966.0,2966.0,2966.0,2966.0,2966.0,2966.0
mean,21.48584,0.703641,7.073837,0.269049,0.192178,0.552596
std,1.324933,0.740197,0.967748,0.44354,0.394079,0.49731
min,19.0,0.0,5.0,0.0,0.0,0.0
25%,21.0,0.0,6.0,0.0,0.0,0.0
50%,21.0,1.0,7.0,0.0,0.0,1.0
75%,22.0,1.0,8.0,1.0,0.0,1.0
max,30.0,3.0,9.0,1.0,1.0,1.0


In [35]:
# Only works if you use pandas version 1.3.5
#df.corr()['PlacedOrNot']

In [62]:
nsDf = df.drop( ['Stream','Gender'], axis=1)

In [63]:
nsDf

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,1,8,1,1,1
1,21,0,7,1,1,1
2,22,1,6,0,0,1
3,21,0,8,0,1,1
4,22,0,8,1,0,1
...,...,...,...,...,...,...
2961,23,0,7,0,0,0
2962,23,1,7,1,0,0
2963,22,1,7,0,0,0
2964,22,1,7,0,0,0


In [64]:
nsDf.columns

Index(['Age', 'Internships', 'CGPA', 'Hostel', 'HistoryOfBacklogs',
       'PlacedOrNot'],
      dtype='object')

In [65]:
nsDf.dtypes

Age                  int64
Internships          int64
CGPA                 int64
Hostel               int64
HistoryOfBacklogs    int64
PlacedOrNot          int64
dtype: object

In [66]:
nsDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Age                2966 non-null   int64
 1   Internships        2966 non-null   int64
 2   CGPA               2966 non-null   int64
 3   Hostel             2966 non-null   int64
 4   HistoryOfBacklogs  2966 non-null   int64
 5   PlacedOrNot        2966 non-null   int64
dtypes: int64(6)
memory usage: 139.2 KB


In [67]:
nsDf.describe()

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
count,2966.0,2966.0,2966.0,2966.0,2966.0,2966.0
mean,21.48584,0.703641,7.073837,0.269049,0.192178,0.552596
std,1.324933,0.740197,0.967748,0.44354,0.394079,0.49731
min,19.0,0.0,5.0,0.0,0.0,0.0
25%,21.0,0.0,6.0,0.0,0.0,0.0
50%,21.0,1.0,7.0,0.0,0.0,1.0
75%,22.0,1.0,8.0,1.0,0.0,1.0
max,30.0,3.0,9.0,1.0,1.0,1.0


- How is the correlation between features(other Colms) and target col

In [68]:
nsDf.corr()['PlacedOrNot']

Age                  0.046943
Internships          0.179334
CGPA                 0.588648
Hostel              -0.038182
HistoryOfBacklogs   -0.022337
PlacedOrNot          1.000000
Name: PlacedOrNot, dtype: float64


# Preprocessing + EDA + Feature Selection
## Preprocessing
### Basic Preprocessing:
1. Check if the dataframe has any null values, handle if present.
2. Check if there is any duplicate row in the datafame, handle if present.
1. Are there any missing values?

In [69]:
nsDf.isnull().sum()

Age                  0
Internships          0
CGPA                 0
Hostel               0
HistoryOfBacklogs    0
PlacedOrNot          0
dtype: int64

In [71]:
# check for duplicate values
print( nsDf.duplicated().sum())

nsDf.drop_duplicates(inplace=True)

2541


In [72]:
# check if  duplicate values are removed
print( nsDf.duplicated().sum())

0



# EDA
1. Plot the graph to visualize the output wrt 2 major features

In [84]:
fig  = px.scatter(nsDf,x='CGPA', y='Internships', color="PlacedOrNot", hover_data=['CGPA'])

In [85]:
fig.show()

### Plot the count of placed and not placed Histogram

In [79]:
fig = px.histogram(nsDf, x='PlacedOrNot', color='PlacedOrNot', barmode='group')
fig.show()

In [98]:
fig = px.pie(nsDf, values=nsDf['PlacedOrNot'].value_counts().values, names=['Placed','Not placed'],title='Placed Vs Not Placed')

In [99]:
fig.show()

In [95]:
nsDf['PlacedOrNot'].value_counts().index.values

array([1, 0])

In [107]:
print("max age of Placed person: ", nsDf[(nsDf['Age'] == nsDf['Age'].max()) & (nsDf['PlacedOrNot']==1)]['Age'].values[0] )
print("Min Age of Placed Person: ",nsDf[(nsDf['Age'] == nsDf['Age'].min()) & (nsDf['PlacedOrNot']==1)]['Age'].values[0])

max age of Placed person:  30
Min Age of Placed Person:  19


In [110]:
print("Max Internships Done by the Placed Student: ",df[(df['Internships'] == df['Internships'].max()) & (df['PlacedOrNot']==1)]['Internships'].values[0])
print("No of students who did max Internships and are placed: ",df[(df['Internships'] == df['Internships'].max()) & (df['PlacedOrNot']==1)]['Internships'].value_counts().values[0])

print("Min Internships Done by the Placed Person: ",df[(df['Internships'] == df['Internships'].min()) & (df['PlacedOrNot']==1)]['Internships'].values[0])
print("No of students who did min Internships and are placed: ",df[(df['Internships'] == df['Internships'].min()) & (df['PlacedOrNot']==1)]['Internships'].value_counts().values[0])


Max Internships Done by the Placed Student:  3
No of students who did max Internships and are placed:  41
Min Internships Done by the Placed Person:  0
No of students who did min Internships and are placed:  654


In [111]:
print("Max CGPA of Placed Student: ",df[(df['CGPA'] == df['CGPA'].max()) & (df['PlacedOrNot']==1)]['CGPA'].values[0])
print("No of students has max CGPA and are placed: ",df[(df['CGPA'] == df['CGPA'].max()) & (df['PlacedOrNot']==1)]['CGPA'].value_counts().values[0])

print("Min CGPA of Placed Person: ",df[(df['CGPA'] == df['CGPA'].min()) & (df['PlacedOrNot']==1)]['CGPA'].values[0])
print("No of students has min CGPA and are placed: ",df[(df['CGPA'] == df['CGPA'].min()) & (df['PlacedOrNot']==1)]['CGPA'].value_counts().values[0])


Max CGPA of Placed Student:  9
No of students has max CGPA and are placed:  165
Min CGPA of Placed Person:  5
No of students has min CGPA and are placed:  7


In [112]:
fig = px.box(df, y='CGPA')
fig.show()


In [113]:
fig = px.box(df, y='Age')
fig.show()

In [114]:


fig = px.box(df, y=['Internships','CGPA', 'Age'])
fig.show()



In [115]:


# convert Gender column to numeric
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})



In [116]:
df

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,1,Electronics And Communication,1,8,1,1,1
1,21,0,Computer Science,0,7,1,1,1
2,22,0,Information Technology,1,6,0,0,1
3,21,1,Information Technology,0,8,0,1,1
4,22,1,Mechanical,0,8,1,0,1
...,...,...,...,...,...,...,...,...
2961,23,1,Information Technology,0,7,0,0,0
2962,23,1,Mechanical,1,7,1,0,0
2963,22,1,Information Technology,1,7,0,0,0
2964,22,1,Computer Science,1,7,0,0,0


In [117]:
df['Stream'].unique()

array(['Electronics And Communication', 'Computer Science',
       'Information Technology', 'Mechanical', 'Electrical', 'Civil'],
      dtype=object)

In [118]:


# convert Stream column to numeric
df['Stream'] = df['Stream'].map({'Electronics And Communication': 1, 
                                 'Computer Science': 2,
                                'Information Technology': 3,
                                'Mechanical':4,
                                'Electrical':5,
                                'Civil':6})



In [119]:
df.sample(5)

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
2273,22,0,1,1,7,1,0,0
227,21,1,2,2,7,0,0,1
1917,22,1,1,1,7,0,0,0
2174,21,1,2,1,6,0,0,1
1687,22,1,3,0,6,0,0,0


In [121]:


# # you can reduce the features using PCA
# pca = PCA(n_components=2)
# X_pca = pca.fit_transform(X)
# X_pca_transform = pd.DataFrame(data=X_pca)
# #Plot the graph 
# plt.scatter(X_pca_transform[0],X_pca_transform[1],c=y)



In [122]:


X = df.iloc[:,0:7]
y = df.iloc[:,-1]
X



Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs
0,22,1,1,1,8,1,1
1,21,0,2,0,7,1,1
2,22,0,3,1,6,0,0
3,21,1,3,0,8,0,1
4,22,1,4,0,8,1,0
...,...,...,...,...,...,...,...
2961,23,1,3,0,7,0,0
2962,23,1,4,1,7,1,0
2963,22,1,3,1,7,0,0
2964,22,1,2,1,7,0,0


In [123]:
X = df.iloc[:,0:7]
y = df.iloc[:,-1]
X


Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs
0,22,1,1,1,8,1,1
1,21,0,2,0,7,1,1
2,22,0,3,1,6,0,0
3,21,1,3,0,8,0,1
4,22,1,4,0,8,1,0
...,...,...,...,...,...,...,...
2961,23,1,3,0,7,0,0
2962,23,1,4,1,7,1,0
2963,22,1,3,1,7,0,0
2964,22,1,2,1,7,0,0


In [124]:
print(X.shape)
print(y.shape)

(2966, 7)
(2966,)


In [125]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)

In [126]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1987, 7)
(979, 7)
(1987,)
(979,)


In [127]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)


In [128]:
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#using Logistic Regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

# Without Scaling 
clf.fit(X_train,y_train) 
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

#scaling has not much effect

Without Scaling and CV:  0.7548518896833504
Without Scaling and With CV:  0.7720090350743618
With Scaling and Without CV:  0.7548518896833504
With Scaling and With CV:  0.7725140855794123


In [129]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier
#Using SGD Classifier
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(max_iter=1000, tol=1e-3)

# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7262512768130746
Without Scaling and With CV:  0.7076265164204862
With Scaling and Without CV:  0.7711950970377937
With Scaling and With CV:  0.7720242627277804


In [130]:
from sklearn.linear_model import Perceptron
# this is same as SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None)

clf = Perceptron(tol=1e-3, random_state=0)
# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())


Without Scaling and CV:  0.7854954034729316
Without Scaling and With CV:  0.6416476320998934
With Scaling and Without CV:  0.6966292134831461
With Scaling and With CV:  0.6839703568346784


In [131]:


# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
# Using LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0)

# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())



Without Scaling and CV:  0.7548518896833504



lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Without Scaling and With CV:  0.7725140855794123
With Scaling and Without CV:  0.7548518896833504
With Scaling and With CV:  0.7720090350743618


In [132]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=10, random_state=0)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.8764044943820225
Without Scaling and With CV:  0.8746865641338003
With Scaling and Without CV:  0.8774259448416751
With Scaling and With CV:  0.873681539008172


In [133]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
from sklearn.svm import SVC
#clf = SVC(gamma='auto')

svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
clf = GridSearchCV(svc, parameters)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())


Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.8001873346581775
Without Scaling and CV:  0.780388151174668
Without Scaling and With CV:  0.7996726054515
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.8646089389010545
With Scaling and Without CV:  0.8508682328907048
With Scaling and With CV:  0.8626084970306076


In [134]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html#sklearn.svm.NuSVC
from sklearn.svm import NuSVC
clf = NuSVC()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7834525025536262
Without Scaling and With CV:  0.8006750926348916
With Scaling and Without CV:  0.8426966292134831
With Scaling and With CV:  0.8590782193797271


In [135]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0, tol=1e-5)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7528089887640449
Without Scaling and With CV:  0.7679889345718492
With Scaling and Without CV:  0.7548518896833504
With Scaling and With CV:  0.7699964468808691


In [136]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

#y_pred = gnb.fit(X_train, y_train).predict(X_test)
#print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.8008171603677222
Without Scaling and With CV:  0.808740673062281
With Scaling and Without CV:  0.8008171603677222
With Scaling and With CV:  0.808740673062281


In [137]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.6424923391215526
Without Scaling and With CV:  0.6310923303385615


In [138]:


from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())



Without Scaling and CV:  0.5587334014300307
Without Scaling and With CV:  0.5576265164204862
With Scaling and Without CV:  0.8130745658835546
With Scaling and With CV:  0.8107405715445916


In [140]:
from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
#scores = cross_val_score(clf, X_train, y_train, cv=5)
#print("Without Scaling and With CV: ",scores.mean())

IndexError: index 30 is out of bounds for axis 1 with size 30

In [141]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())


# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.8559754851889684
Without Scaling and With CV:  0.8636135221562358
With Scaling and Without CV:  0.8416751787538305
With Scaling and With CV:  0.8540429419826404


In [142]:
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("With CV: ",scores.mean())
print("Precision Score: ", precision_score(y_test, y_pred))
print("Recall Score: ", recall_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))


Without CV:  0.8764044943820225
With CV:  0.8746865641338003
Precision Score:  0.9401197604790419
Recall Score:  0.8380782918149466
F1 Score:  0.8861712135465664


In [143]:


# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
from sklearn.svm import SVC
#clf = SVC(gamma='auto')

svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
clf = GridSearchCV(svc, parameters)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())



Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.8001873346581775
Without Scaling and CV:  0.780388151174668
Without Scaling and With CV:  0.7996726054515
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.8646089389010545
With Scaling and Without CV:  0.8508682328907048
With Scaling and With CV:  0.8626084970306076


In [144]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html#sklearn.svm.NuSVC
from sklearn.svm import NuSVC
clf = NuSVC()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())


Without Scaling and CV:  0.7834525025536262
Without Scaling and With CV:  0.8006750926348916
With Scaling and Without CV:  0.8426966292134831
With Scaling and With CV:  0.8590782193797271


In [145]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0, tol=1e-5)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling 
clf.fit(X_train_scale,y_train) 
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())


Without Scaling and CV:  0.7528089887640449
Without Scaling and With CV:  0.7679889345718492
With Scaling and Without CV:  0.7548518896833504
With Scaling and With CV:  0.7699964468808691
