In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [2]:
# Source CSV for the graduate-admission dataset (fetched over HTTP).
DATA_URL = "https://raw.githubusercontent.com/srinivasav22/Graduate-Admission-Prediction/master/Admission_Predict_Ver1.1.csv"

df = pd.read_csv(DATA_URL)
# Preview the first five rows.
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(500, 9)

In [4]:
# Column labels; note that 'LOR ' and 'Chance of Admit ' carry a trailing space,
# which must be included whenever those columns are referenced.
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

#### The dataset has 500 rows and 9 columns; 'Chance of Admit ' is the target column

In [5]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 3   University Rating  500 non-null    int64  
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB


#### There are no string columns in the dataset; every column is numeric (int64 or float64)

In [6]:
# Summary statistics, transposed so each row describes one column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Serial No.,500.0,250.5,144.481833,1.0,125.75,250.5,375.25,500.0
GRE Score,500.0,316.472,11.295148,290.0,308.0,317.0,325.0,340.0
TOEFL Score,500.0,107.192,6.081868,92.0,103.0,107.0,112.0,120.0
University Rating,500.0,3.114,1.143512,1.0,2.0,3.0,4.0,5.0
SOP,500.0,3.374,0.991004,1.0,2.5,3.5,4.0,5.0
LOR,500.0,3.484,0.92545,1.0,3.0,3.5,4.0,5.0
CGPA,500.0,8.57644,0.604813,6.8,8.1275,8.56,9.04,9.92
Research,500.0,0.56,0.496884,0.0,0.0,1.0,1.0,1.0
Chance of Admit,500.0,0.72174,0.14114,0.34,0.63,0.72,0.82,0.97


In [7]:
# Count missing values per column.
df.isna().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [8]:
# Count missing values per column. `isnull` is an alias of `isna` in pandas,
# so this reports the same counts as `df.isna().sum()`.
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

#### There are no null/NA values in the dataset

In [None]:
# Pairwise scatter plots of every column against every other column.
sns.pairplot(df)

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
fig, ax = plt.subplots(figsize=(15, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap="CMRmap", annot=True, ax=ax)
plt.show()

#### CGPA and Chance of Admit have a correlation coefficient of about 0.88
#### GRE Score and Chance of Admit have a correlation coefficient of about 0.81


In [None]:
# Mean chance of admission for each distinct CGPA value.
sns.barplot(data=df, x='CGPA', y='Chance of Admit ')

#### A higher CGPA goes with a higher chance of admission

In [None]:
# Mean chance of admission for each distinct GRE score.
sns.barplot(data=df, x='GRE Score', y='Chance of Admit ')

#### A higher GRE Score goes with a higher chance of admission

In [None]:
# One bar plot per feature showing the mean 'Chance of Admit ' at each feature value.
# 'Serial No.' (a 1..500 row identifier) and the target itself are skipped:
# neither is a predictor, and plotting the target against itself carries no information.
feature_cols = [col for col in df.columns if col not in ('Serial No.', 'Chance of Admit ')]

plt.figure(figsize=(15, 15))
plt.suptitle('Univariate Analysis of all Features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)

for i, col in enumerate(feature_cols):
    plt.subplot(5, 3, i+1)
    sns.barplot(x=df[col], y=df['Chance of Admit '])
    plt.xlabel(col)

# Adjust the layout once, after every panel exists, instead of on each iteration.
plt.tight_layout()

### Higher values of 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA' and 'Research' go with a higher chance of admission

In [None]:
# Kernel-density estimate of every column to inspect the shape of its distribution.
plt.figure(figsize=(15, 15))
plt.suptitle('Univariate Analysis of all Features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)

for i, col in enumerate(df.columns):
    plt.subplot(5, 3, i+1)
    # `fill=True` replaces the deprecated `shade=True` keyword of seaborn's kdeplot.
    sns.kdeplot(x=df[col], fill=True, color='b')
    plt.xlabel(col)

# Adjust the layout once, after every panel exists, instead of on each iteration.
plt.tight_layout()

#### None of the features appear heavily skewed

In [None]:
# Side-by-side box plots of every column to look for outliers.
fig, ax = plt.subplots(figsize=(15, 15))
fig.suptitle('Boxplot Analysis of Features', fontsize=20, fontweight='bold', alpha=0.9, y=1.)
sns.boxplot(data=df, ax=ax)

#### No outliers are apparent in the dataset

In [None]:
# Feature matrix and target vector.
# 'Serial No.' is only a row identifier (1..500), so it is dropped together with
# the target; keeping it would feed the model a meaningless ordering signal.
x = df.drop(columns=["Serial No.", "Chance of Admit "])
y = df['Chance of Admit ']

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, so no statistics from the test
# split are used when standardizing.
scaler = StandardScaler()
scaler.fit(x_train)

In [None]:
# Standardize the training features with the scaler fitted above.
x_train_tf=scaler.transform(x_train)

#### The training data has been standardized with the fitted scaler

In [None]:
# Base support-vector regressor with default settings; used as the estimator
# for the hyper-parameter search.
model=svm.SVR()

In [None]:
# Hyper-parameter grid for the SVR search: four kernels, and two candidate
# values for each of the remaining parameters.
parameters = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'C': [1, 12],
    'degree': [1, 10],
    'coef0': [0, 10],
    'epsilon': [0.1, 1.5],
}

In [None]:
# Cross-validated search over every combination in `parameters`,
# fitted on the standardized training data.
clf = GridSearchCV(model, parameters)
clf.fit(x_train_tf,y_train)

In [None]:
# Parameter combination selected by the grid search.
clf.best_params_

In [None]:
# Build the final SVR from the parameters the grid search selected, rather than
# hand-copying them, so this cell cannot drift from `clf.best_params_` when the
# data, the grid, or the split changes on a re-run.
model_svr = svm.SVR(**clf.best_params_)
model_svr.fit(x_train_tf, y_train)

#### Trained the model with the best parameters found: C=12, coef0=0, degree=1, epsilon=0.1, kernel='poly'

In [None]:
# R² (coefficient of determination) of the fitted SVR on the standardized training data.
model_svr.score(x_train_tf,y_train)

### The training R² score is about 0.81

In [None]:
# Standardize the test features with the scaler fitted on the training split
# (transform only, no refit), then predict with the trained SVR.
x_test_tf=scaler.transform(x_test)
y_predict=model_svr.predict(x_test_tf)

In [None]:
# Preview the first few predictions instead of dumping the whole array into the output.
y_predict[:10]

In [None]:
# R² of the predictions against the held-out test targets.
r2_score(y_test,y_predict)

### The test R² score is about 0.80