# Create linear regression model

## Import Modules

In [39]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Initialize and load dataset

In [2]:
# Read the admissions CSV; the path is relative to the notebook's working directory.
dataset_path = "../dataset/admission_dataset.csv"
admission_df = pd.read_csv(dataset_path)

## Show dataset in table

In [3]:
# Remove serial_no, which is not used as a feature anywhere below.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps the
# cell idempotent: running it a second time no longer raises KeyError once the
# column is already gone.
admission_df = admission_df.drop(columns=["serial_no"], errors="ignore")

In [4]:
# Show the frame after serial_no has been dropped (long frames are rendered as head/tail rows).
admission_df

Unnamed: 0,gre_score,toefl_score,university_rating,sop,lor,cgpa,research,chance_of_admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1,0.87
496,337,117,5,5.0,5.0,9.87,1,0.96
497,330,120,5,4.5,5.0,9.56,1,0.93
498,312,103,4,4.0,5.0,8.43,0,0.73


## Select independent and dependent features

In [5]:
# Predictor columns and the target column of the admissions data.
predictor_names = [
    "gre_score",
    "toefl_score",
    "university_rating",
    "sop",
    "lor",
    "cgpa",
    "research",
]
target_name = "chance_of_admit"

X = admission_df[predictor_names]  # predictors as a DataFrame
y = admission_df[target_name]      # target as a Series

## Split dataframe into train, test, and validation datasets

### Train, test, and validation

In [6]:
# Display the frame once more right before it is split (same content as the earlier display).
admission_df

Unnamed: 0,gre_score,toefl_score,university_rating,sop,lor,cgpa,research,chance_of_admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1,0.87
496,337,117,5,5.0,5.0,9.87,1,0.96
497,330,120,5,4.5,5.0,9.56,1,0.93
498,312,103,4,4.0,5.0,8.43,0,0.73


In [7]:
# Reproducible split: 300 training rows, 100 test rows, and the remaining rows
# as the validation set.
#
# Rows are removed by index label instead of masking with
# DataFrame.isin(...).dropna(): the mask fills excluded rows with NaN, which
# upcasts the integer columns to float and would also silently discard any row
# that contains a genuine missing value.
assert admission_df.index.is_unique, "label-based split requires a unique index"

train_df      = admission_df.sample(n=300, replace=False, random_state=43)
test_temp_df  = admission_df.drop(index=train_df.index)   # every row not used for training
test_df       = test_temp_df.sample(n=100, replace=False, random_state=43)
validation_df = test_temp_df.drop(index=test_df.index)    # what is left once the test rows are taken

In [8]:
feature_columns = ["gre_score", "toefl_score", "university_rating", "sop", "lor", "cgpa", "research"]
target_columns  = ["chance_of_admit"]

# Features: drop the target column. Target: drop the feature columns, which
# leaves each y_* as a DataFrame rather than a Series.
X_train, y_train = train_df.drop(columns=target_columns), train_df.drop(columns=feature_columns)

X_test, y_test = test_df.drop(columns=target_columns), test_df.drop(columns=feature_columns)

X_validation, y_validation = (
    validation_df.drop(columns=target_columns),
    validation_df.drop(columns=feature_columns),
)

In [9]:
# One entry per split: heading, then (label, frame) pairs for features and target.
split_report = [
    ("Data train:",     ("X_train:", X_train),           ("y_train:", y_train)),
    ("Data testing",    ("X_test:", X_test),             ("y_test:", y_test)),
    ("Data validation", ("X_validation:", X_validation), ("y_validation:", y_validation)),
]
last_position = len(split_report) - 1

for position, (heading, (x_label, x_frame), (y_label, y_frame)) in enumerate(split_report):
    print(heading)
    print(x_label, x_frame.shape)
    if position == last_position:
        print(y_label, y_frame.shape)
    else:
        # The extra "\n" argument leaves a blank line between sections.
        print(y_label, y_frame.shape, "\n")

Data train:
X_train: (300, 7)
y_train: (300, 1) 

Data testing
X_test: (100, 7)
y_test: (100, 1) 

Data validation
X_validation: (100, 7)
y_validation: (100, 1)


## Create model

In [35]:
# fit() returns the estimator itself, so construction and training can be chained.
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell's output.
linear_regression_model

LinearRegression()

## Intercept

In [36]:
# intercept_ is array-like here, so take its first element to get the scalar.
intercept = linear_regression_model.intercept_[0]
print("Intercept:", intercept)

Intercept: -1.1343945601412755


## Coefficient

In [37]:
# coef_[0] holds one weight per feature, in the column order the model was
# fitted with. Unpacking all seven at once raises immediately if the number of
# features ever stops matching the names below.
(
    gre_score_coef,
    toefl_score_coef,
    university_rating_coef,
    sop_coef,
    lor_coef,
    cgpa_coef,
    research_coef,
) = linear_regression_model.coef_[0]

print("GRE Score Coefficient        :", gre_score_coef)
print("Toefl Score Coefficient      :", toefl_score_coef)
print("University Rating Coefficient:", university_rating_coef)
print("SOP Coefficient              :", sop_coef)
print("LOR Coefficient              :", lor_coef)
print("CGPA Coefficient             :", cgpa_coef)
print("Research Coefficient         :", research_coef)

GRE Score Coefficient        : 0.0013909806428800532
Toefl Score Coefficient      : 0.003327970658963705
University Rating Coefficient: 0.009852225902771718
SOP Coefficeint              : 0.0067705830841448145
LOR Coefficient              : 0.012791993291725867
CGPA Coefficient             : 0.10976720843125265
Research Coefficient         : 0.02986877707023629


## Check accuracy with R-Squared

In [38]:
# R-squared of the fitted model on the held-out test split.
test_r_squared = linear_regression_model.score(X_test, y_test)
test_r_squared

0.8373311696294835

## Predict with model

In [32]:
# Rejoin validation features and target on their shared index for side-by-side viewing.
X_validation.merge(y_validation, left_index=True, right_index=True)

Unnamed: 0,gre_score,toefl_score,university_rating,sop,lor,cgpa,research,chance_of_admit
14,311.0,104.0,3.0,3.5,2.0,8.20,1.0,0.61
16,317.0,107.0,3.0,4.0,3.0,8.70,0.0,0.66
21,325.0,114.0,4.0,3.0,2.0,8.40,0.0,0.70
23,334.0,119.0,5.0,5.0,4.5,9.70,1.0,0.95
28,295.0,93.0,1.0,2.0,2.0,7.20,0.0,0.46
...,...,...,...,...,...,...,...,...
481,323.0,107.0,4.0,3.0,2.5,8.48,1.0,0.78
485,311.0,101.0,2.0,2.5,3.5,8.34,1.0,0.70
492,298.0,101.0,4.0,2.5,4.5,7.69,1.0,0.53
493,300.0,95.0,2.0,3.0,1.5,8.22,1.0,0.62


In [34]:
# Applicant profile to score. These values match validation row 493 in the
# table above, whose observed chance_of_admit is 0.62.
gre_score         = 300
toefl_score       = 95
university_rating = 2
sop               = 3
lor               = 1.5
cgpa              = 8.22
research          = 1

# Pass a one-row DataFrame whose columns carry the same names, in the same
# order, as the training features. A bare nested list drops the feature names
# the model was fitted with (recent scikit-learn versions warn about that) and
# relies purely on positional order.
applicant = pd.DataFrame([{
    "gre_score": gre_score,
    "toefl_score": toefl_score,
    "university_rating": university_rating,
    "sop": sop,
    "lor": lor,
    "cgpa": cgpa,
    "research": research,
}])

prediction = linear_regression_model.predict(applicant)

# The result is indexed as [row][target]; there is one row and one target here.
print("Chance of admit prediction:", round(prediction[0][0], 2))

Chance of admit prediction: 0.59


## Accuracy and training sample size

In [18]:
trains_size = []   # training share of the full dataset, in percent
scores      = []   # R-squared on the complementary rows, in percent
size        = 0.3  # smallest training fraction tried

# Sweep the training fraction from 30.0% to 99.7% in 0.1% steps. For the 500
# rows used here, the stop value leaves at least two rows for evaluation, which
# R-squared needs to be well-defined.
for i in np.arange(0, 0.698, 0.001):
    # A fresh split per fraction is what makes the score depend on the training
    # size; fitting on a fixed training frame would give the same score every time.
    X_sweep_train, X_sweep_test, y_sweep_train, y_sweep_test = train_test_split(
        X, y, train_size=size + i, random_state=43
    )

    # Loop-local names leave the X_train/X_test splits and the fitted
    # linear_regression_model from the earlier cells untouched.
    sweep_model = LinearRegression()
    sweep_model.fit(X_sweep_train, y_sweep_train)

    trains_size.append(round(((size + i) * 100), 2))
    scores.append(round(sweep_model.score(X_sweep_test, y_sweep_test) * 100, 2))

## Visualize R-Squared score against training data size

In [19]:
# Trace of R-squared (y) against training size (x), both expressed in percent.
accuracy_trace = go.Scatter(x=trains_size, y=scores)

fig = go.Figure()
fig.add_trace(accuracy_trace)

layout_options = {
    "title": "Data Train and Accuracy",
    "title_x": 0.5,  # centre the title horizontally
    "xaxis_title": "Size data train in Percent",
    "yaxis_title": "R-Squared score in Percent",
    "height": 600,
}
fig.update_layout(**layout_options)

fig.show()