In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
import pickle

In [3]:
# Loading a dataset file

df = pd.read_csv("Admission_Prediction.csv")
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.00,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.80
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332.0,108.0,5.0,4.5,4.0,9.02,1,0.87
496,497,337.0,117.0,5.0,5.0,5.0,9.87,1,0.96
497,498,330.0,120.0,5.0,4.5,5.0,9.56,1,0.93
498,499,312.0,103.0,4.0,4.0,5.0,8.43,0,0.73


In [4]:
# Generating Profile Report of a dataset

pf = ProfileReport(df)

In [5]:
pf.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [12]:
# Handling missing values
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.00,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.80
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332.0,108.0,5.0,4.5,4.0,9.02,1,0.87
496,497,337.0,117.0,5.0,5.0,5.0,9.87,1,0.96
497,498,330.0,120.0,5.0,4.5,5.0,9.56,1,0.93
498,499,312.0,103.0,4.0,4.0,5.0,8.43,0,0.73


In [15]:
# Replacing missing value of columns with mean of the column values

df['GRE Score'] = df['GRE Score'].fillna(df['GRE Score'].mean())  
df['TOEFL Score'] = df['TOEFL Score'].fillna(df['TOEFL Score'].mean()) 
df['University Rating'] = df['University Rating'].fillna(df['University Rating'].mean()) 

In [16]:
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.558763,107.187755,3.121649,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.103952,6.051338,1.128802,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,309.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,316.558763,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,324.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [17]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [32]:
# Dropping unnecessary colums
df.drop(columns = ['Serial No.'], inplace = True)
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


In [34]:
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


In [38]:
# Declaration of label and features

y = df['Chance of Admit']  # Label
x = df.drop(columns = ['Chance of Admit'])  # Features

In [39]:
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.000000,118.0,4.0,4.5,4.5,9.65,1
1,324.000000,107.0,4.0,4.0,4.5,8.87,1
2,316.558763,104.0,3.0,3.0,3.5,8.00,1
3,322.000000,110.0,3.0,3.5,2.5,8.67,1
4,314.000000,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1
496,337.000000,117.0,5.0,5.0,5.0,9.87,1
497,330.000000,120.0,5.0,4.5,5.0,9.56,1
498,312.000000,103.0,4.0,4.0,5.0,8.43,0


In [40]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

In [42]:
# Normalization or standarization of feature dataset

scaler = StandardScaler()
arr = scaler.fit_transform(x)
arr

array([[ 1.84274116e+00,  1.78854223e+00,  7.78905651e-01, ...,
         1.09894429e+00,  1.77680627e+00,  8.86405260e-01],
       [ 6.70814288e-01, -3.10581135e-02,  7.78905651e-01, ...,
         1.09894429e+00,  4.85859428e-01,  8.86405260e-01],
       [ 5.12433309e-15, -5.27312752e-01, -1.07876604e-01, ...,
         1.73062093e-02, -9.54042814e-01,  8.86405260e-01],
       ...,
       [ 1.21170361e+00,  2.11937866e+00,  1.66568791e+00, ...,
         1.63976333e+00,  1.62785086e+00,  8.86405260e-01],
       [-4.10964364e-01, -6.92730965e-01,  7.78905651e-01, ...,
         1.63976333e+00, -2.42366993e-01, -1.12815215e+00],
       [ 9.41258951e-01,  9.61451165e-01,  7.78905651e-01, ...,
         1.09894429e+00,  7.67219636e-01, -1.12815215e+00]])

In [44]:
df1 = pd.DataFrame(arr)
df1

Unnamed: 0,0,1,2,3,4,5,6
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [45]:
df1.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [46]:
df1

Unnamed: 0,0,1,2,3,4,5,6
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [47]:
df1.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,4.35052e-15,9.419132e-16,5.608847e-16,2.926548e-16,-1.3322680000000001e-17,3.091971e-15,-2.202682e-16
std,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002
min,-2.394225,-2.512331,-1.881441,-2.39795,-2.686789,-2.940115,-1.128152
25%,-0.681409,-0.692731,-0.9946589,-0.8828175,-0.5235128,-0.7430227,-1.128152
50%,5.124333e-15,-0.03105811,-0.1078766,0.1272712,0.01730621,-0.02720919,0.8864053
75%,0.6708143,0.796033,0.7789057,0.6323155,0.5581253,0.7672196,0.8864053
max,2.113186,2.119379,1.665688,1.642404,1.639763,2.223672,0.8864053


In [62]:
# Check the multi-collinearity of the dataset

from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame() # Generating new dataframe

In [63]:
arr.shape[1]

7

In [64]:
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]
vif_df['feature'] = x.columns

In [66]:
vif_df # We can not drop any column as all vif scores are less than 10, so there is no multi-collinearity

Unnamed: 0,vif,feature
0,4.153268,GRE Score
1,3.792866,TOEFL Score
2,2.508768,University Rating
3,2.77575,SOP
4,2.037308,LOR
5,4.65167,CGPA
6,1.459311,Research


In [129]:
# Dividing our data set into tarin and test datasets

x_train, x_test, y_train, y_test = train_test_split(arr, y, test_size = 0.15, random_state = 100)

In [130]:
x_tarin

array([[-4.10964364e-01, -6.92730965e-01,  7.78905651e-01, ...,
         1.63976333e+00, -2.42366993e-01, -1.12815215e+00],
       [-1.67303946e+00, -1.02356739e+00,  7.78905651e-01, ...,
         1.09894429e+00, -1.46711143e+00,  8.86405260e-01],
       [-4.10964364e-01,  1.34360100e-01, -1.07876604e-01, ...,
        -5.23512832e-01, -7.68609886e-02, -1.12815215e+00],
       ...,
       [ 4.90517846e-01,  1.29228759e+00,  1.66568791e+00, ...,
         1.09894429e+00,  1.29683885e+00,  8.86405260e-01],
       [-1.67303946e+00, -1.68524024e+00,  3.93810431e-16, ...,
        -5.23512832e-01, -2.26154025e+00, -1.12815215e+00],
       [ 1.75259294e+00,  1.95396044e+00,  1.66568791e+00, ...,
         1.73062093e-02,  2.02506527e+00,  8.86405260e-01]])

In [131]:
# Calling Linear Regression algorithm to build a model

lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

In [132]:
pickle.dump(lr, open('admission_lr_modle.pickle', 'wb')) #Creating phsical model

In [133]:
# Lets do some prediction

lr.predict([[324.000000,107.0,4.0,4.0,4.5,8.87,1]])

array([9.58224162])

In [134]:
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


In [135]:
test1 = scaler.transform([[324.000000,107.0,4.0,4.0,4.5,8.87,1]])
test1

array([[ 0.67081429, -0.03105811,  0.77890565,  0.63231549,  1.09894429,
         0.48585943,  0.88640526]])

In [136]:
# Lets do some prediction

lr.predict([[0.67081429, -0.03105811,  0.77890565,  0.63231549,  1.09894429,
         0.48585943,  0.88640526]])

array([0.80284701])

In [137]:
model = pickle.load(open('admission_lr_modle.pickle', 'rb'))

In [138]:
model.predict(test1)

array([0.80284701])

In [139]:
# To check the accuracy or R^2 pf the model

lr.score(x_test, y_test)

0.8420039560601401

In [140]:
# Lets create a fuction to calculate the adjusted R^2

def adj_r2(x,y):
    r2 = lr.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [141]:
adj_r2(x_test,y_test)

0.8254969066932891

In [142]:
lr.coef_

array([ 0.01912905,  0.01780082,  0.00550634, -0.00025051,  0.01844312,
        0.07254151,  0.01195331])

In [143]:
lr.intercept_

0.7203289055688045

In [158]:
lassocv = LassoCV(alphas = None, cv = 5, max_iter = 20000000, normalize = True)
lassocv.fit(x_train,y_train)

LassoCV(cv=5, max_iter=20000000, normalize=True)

In [159]:
lassocv.alpha_

4.203962663551952e-05

In [160]:
lasso = Lasso(alpha = lassocv.alpha_)
lasso.fit(x_train,y_train)

Lasso(alpha=4.203962663551952e-05)

In [161]:
lasso.score(x_test,y_test)

0.8421260048013872

In [168]:
ridgecv = RidgeCV(alphas = np.random.uniform(0,10,50), cv = 10, normalize = True)
ridgecv.fit(x_train, y_train)

RidgeCV(alphas=array([6.46464554, 2.5388597 , 4.5334053 , 9.62001007, 8.08204469,
       1.5592767 , 3.0267023 , 1.99459446, 7.56440811, 9.05177626,
       9.34791784, 6.53276939, 9.43421216, 1.16597285, 8.7631279 ,
       0.66647762, 0.65316992, 2.63759263, 1.95757612, 8.0835134 ,
       8.6517979 , 7.47197518, 1.49126849, 4.4219638 , 4.49799109,
       0.34371961, 4.26194961, 6.94309859, 9.03344178, 9.19980651,
       8.10772356, 2.74103742, 8.53844881, 2.29456717, 5.3286391 ,
       5.72677983, 2.79587713, 8.21336663, 9.95723901, 1.78559743,
       6.45307109, 8.17712815, 9.0599209 , 6.12846896, 5.48546144,
       9.02235998, 2.22450713, 2.08821569, 2.43122313, 8.10638338]),
        cv=10, normalize=True)

In [169]:
ridgecv.alpha_

0.34371961169519016

In [167]:
np.random.uniform(0,10,50)

array([3.92184621, 8.6814462 , 8.35332977, 4.31418089, 4.09190863,
       0.68784296, 0.48771704, 4.34427248, 9.40453696, 5.16708164,
       7.12490021, 7.96328293, 9.75322002, 1.35147992, 2.73990073,
       6.31733187, 8.7337867 , 2.87187964, 3.02920113, 2.43143563,
       5.28664758, 4.19710071, 4.08567687, 9.62256968, 3.83431168,
       4.59225988, 8.0929157 , 2.24923292, 6.12275325, 2.12741327,
       2.99761567, 0.93097996, 1.61835238, 4.1828076 , 2.2345452 ,
       9.84440142, 3.05718433, 7.77386157, 3.39487578, 6.37705867,
       3.6880106 , 5.30290921, 0.1966316 , 3.48564427, 8.8115057 ,
       8.11637525, 0.63596567, 3.65099097, 1.64796301, 5.14316528])

In [170]:
ridge_lr = Ridge(ridgecv.alpha_)
ridge_lr.fit(x_train, y_train)

Ridge(alpha=0.34371961169519016)

In [173]:
ridge_lr.score(x_test,y_test)

0.8420092204602514

In [174]:
elastic = ElasticNetCV(alphas = None, cv = 10)
elastic.fit(x_train,y_train)

ElasticNetCV(cv=10)

In [175]:
elastic.alpha_

0.001391101145529104

In [176]:
elastic.l1_ratio_

0.5

In [177]:
elastic_lr = ElasticNet(alpha = elastic.alpha_, l1_ratio=elastic.l1_ratio_)

In [178]:
elastic_lr.fit(x_train, y_train)

ElasticNet(alpha=0.001391101145529104)

In [179]:
elastic_lr.score(x_test, y_test)

0.8419586493164081