# Support Vector Regression on Admission Prediction Dataset

Pipeline ML Model

1. Data Ingestion
2. EDA
3. Preprocessing
4. Model Creation - SVM
5. Evaluation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Source CSV for the admission dataset, fetched over HTTPS at run time.
DATA_URL = "https://raw.githubusercontent.com/srinivasav22/Graduate-Admission-Prediction/master/Admission_Predict_Ver1.1.csv"

data = pd.read_csv(DATA_URL)
# Peek at the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [4]:
# Inspect the raw column labels; 'LOR ' and 'Chance of Admit ' carry a trailing space.
data.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [5]:
# Drop leading/trailing whitespace from every column label so later lookups
# such as data['Chance of Admit'] work without the stray trailing space.
data.columns = data.columns.str.strip()

In [6]:
# Dtypes and non-null counts per column (used to check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 3   University Rating  500 non-null    int64  
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB


In [7]:
# Summary statistics, transposed so that each column of the data is shown as a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Serial No.,500.0,250.5,144.481833,1.0,125.75,250.5,375.25,500.0
GRE Score,500.0,316.472,11.295148,290.0,308.0,317.0,325.0,340.0
TOEFL Score,500.0,107.192,6.081868,92.0,103.0,107.0,112.0,120.0
University Rating,500.0,3.114,1.143512,1.0,2.0,3.0,4.0,5.0
SOP,500.0,3.374,0.991004,1.0,2.5,3.5,4.0,5.0
LOR,500.0,3.484,0.92545,1.0,3.0,3.5,4.0,5.0
CGPA,500.0,8.57644,0.604813,6.8,8.1275,8.56,9.04,9.92
Research,500.0,0.56,0.496884,0.0,0.0,1.0,1.0,1.0
Chance of Admit,500.0,0.72174,0.14114,0.34,0.63,0.72,0.82,0.97


### No Null values

In [8]:
# (rows, columns) of the full dataset.
data.shape

(500, 9)

### Independent and Dependent variables

In [9]:
# 'Serial No.' is only a row identifier, so it is excluded from the features;
# 'Chance of Admit' is the regression target.
id_column = 'Serial No.'
target_column = 'Chance of Admit'

y = data[target_column]
X = data.drop(columns=[id_column, target_column])
X.shape, y.shape

((500, 7), (500,))

### train test split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Show the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((335, 7), (165, 7), (335,), (165,))

### standardizing

In [11]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so the test rows never influence it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_tf = scaler.transform(X_train)
X_test_tf = scaler.transform(X_test)

### Model Building

In [12]:
from sklearn.svm import SVR
# Baseline support vector regressor with a linear kernel; C and epsilon are
# fixed here and searched over in the grid-search section further down.
svr_model = SVR(kernel='linear',C=1, epsilon=0.1)

In [13]:
# Train on the standardized training features.
svr_model.fit(X_train_tf,y_train)

SVR(C=1, kernel='linear')

In [14]:
# Fitted coefficients of the linear-kernel model: one value per input feature column.
svr_model.coef_

array([[ 0.01817645,  0.03057983, -0.00956225,  0.00208074,  0.01325688,
         0.07227863,  0.00951307]])

In [15]:
# Model score on the training split, for comparison with the test-split score below.
svr_model.score(X_train_tf, y_train)

0.789512540982556

In [16]:
# Model score on the held-out split; this value equals the r2_score computed later.
svr_model.score(X_test_tf, y_test)

0.801651590918972

In [17]:
# Predictions for the held-out rows, reused by the metric cells below.
y_pred = svr_model.predict(X_test_tf)

In [18]:
# NOTE(review): this displays the entire prediction array; a slice such as
# y_pred[:10] would keep the notebook output short.
y_pred

array([0.89591386, 0.76988399, 0.55754925, 0.70050636, 0.80290605,
       0.83294886, 0.48019234, 0.63106342, 0.80969689, 0.78193434,
       0.70377118, 0.71671732, 0.60896584, 0.89693011, 0.81031648,
       0.49608471, 0.78895505, 0.57598176, 0.52676777, 0.54252879,
       0.65056146, 0.51701619, 0.69432417, 0.76968339, 0.76324498,
       0.5826897 , 0.92601906, 0.81634099, 0.61814826, 0.71396412,
       0.53556364, 0.72215046, 0.53977717, 0.83915731, 0.62422689,
       0.71514522, 0.55107543, 0.93011403, 0.6222392 , 0.70184783,
       0.93880299, 0.55078901, 0.64965707, 0.83218066, 0.90201561,
       0.56877785, 0.93970352, 0.80768806, 0.75211738, 0.89637992,
       0.85362485, 0.5504776 , 0.66974638, 0.51014998, 0.92226993,
       0.56286238, 0.93592456, 0.73096709, 0.65433983, 0.47666195,
       0.60670611, 0.66431841, 0.59063348, 0.55079561, 0.42732362,
       0.58394377, 0.84429809, 0.86151586, 0.63815736, 0.68843686,
       0.59826918, 0.76465462, 0.6723682 , 0.55250915, 0.55277

In [19]:
# Same call as the earlier test-split score cell; repeated here next to the metrics.
svr_model.score(X_test_tf,y_test)

0.801651590918972

### R² score

In [26]:
from sklearn.metrics import r2_score

# Coefficient of determination between the held-out targets and the predictions.
r2_s = r2_score(y_true=y_test, y_pred=y_pred)
r2_s

0.801651590918972

### Adjusted R² score

In [27]:
# Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), where n is the number of
# evaluated rows and p the number of feature columns in the test matrix.
n_rows, n_features = X_test_tf.shape
1 - ((1 - r2_s) * (n_rows - 1)) / (n_rows - n_features - 1)

0.7928080312784166

# Hyperparameter Tuning

In [59]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: 4 kernels x 4 values of C x 2 values of epsilon.
parameter = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [1, 2, 5, 10],
    'epsilon': [0.1, 0.5],
}

svr = SVR()
# cv and scoring are not passed, so the search uses the library defaults.
# NOTE(review): X_train_tf was scaled using the whole training split, so the
# scaler has already seen each fold's validation rows; wrapping the scaler and
# SVR in a Pipeline would keep scaling inside each fold.
svr_reg = GridSearchCV(estimator=svr, param_grid=parameter)
svr_reg.fit(X_train_tf, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [1, 2, 5, 10], 'epsilon': [0.1, 0.5],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [60]:
# Best hyperparameter combination and its mean cross-validated score, one per line.
print(svr_reg.best_params_, svr_reg.best_score_, sep="\n")

{'C': 2, 'epsilon': 0.1, 'kernel': 'linear'}
0.7760949202922166
