# **Linear Regression**

In [2]:
# Importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [4]:
# Read the dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('regression_data.csv')

# Preview the first five rows to sanity-check the columns and value ranges
df.head(n=5)

Unnamed: 0,gre_score,toefl_score,univ_ranking,motiv_letter_strength,recommendation_strength,gpa,research_exp,admit_prob
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [5]:
# Print the column names, non-null counts, dtypes and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   gre_score                500 non-null    int64  
 1   toefl_score              500 non-null    int64  
 2   univ_ranking             500 non-null    int64  
 3   motiv_letter_strength    500 non-null    float64
 4   recommendation_strength  500 non-null    float64
 5   gpa                      500 non-null    float64
 6   research_exp             500 non-null    int64  
 7   admit_prob               500 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 31.4 KB


In [6]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
missing_per_column

gre_score                  0
toefl_score                0
univ_ranking               0
motiv_letter_strength      0
recommendation_strength    0
gpa                        0
research_exp               0
admit_prob                 0
dtype: int64

In [9]:
# Count the rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [10]:
# Separate the response column from the predictors
y = df['admit_prob']
X = df.drop(columns=['admit_prob'])

In [12]:
# Data splitting: hold out 20% of the rows for evaluation.
# The fixed random_state makes the split reproducible across runs.
# (train_test_split is imported in the import cell at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# Define the model: ordinary least squares with scikit-learn's default settings.
# (LinearRegression is imported in the import cell at the top of the notebook.)
linreg = LinearRegression()


In [15]:
# Fit the regression on the training portion only
linreg.fit(X=X_train, y=y_train)

In [16]:
# Tabulate the fitted parameters: the intercept first, then one coefficient per feature

data = X_train
model = linreg

# Row labels and values are built in the same order so they line up in the table
feature_names = ['intercept', *data.columns]
fitted_values = [model.intercept_, *model.coef_]

coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': fitted_values})

coef_df

Unnamed: 0,feature,coefficient
0,intercept,-1.421447
1,gre_score,0.002434
2,toefl_score,0.002996
3,univ_ranking,0.002569
4,motiv_letter_strength,0.001814
5,recommendation_strength,0.017238
6,gpa,0.112527
7,research_exp,0.024027


In [17]:
# Model evaluation on training data
# (the metric functions are imported in the import cell at the top of the notebook)

# In-sample predictions: the model is scored on the same rows it was fitted on
y_pred_train = linreg.predict(X_train)

# Note: scikit-learn reports MAPE as a fraction, not a percentage (0.068 is about 6.8%)
print('MAE for Training Data', mean_absolute_error(y_train, y_pred_train))
print('MAPE for Training Data',mean_absolute_percentage_error(y_train, y_pred_train))

MAE for Training Data 0.04253334061164315
MAPE for Training Data 0.06848166838244782


In [18]:
# Model evaluation on the held-out test data
y_pred_test = linreg.predict(X_test)

# These metrics are computed on the test split, so they are labelled as such
# (the labels previously said "Training Data", a copy-paste slip).
print('MAE for Test Data', mean_absolute_error(y_test, y_pred_test))
print('MAPE for Test Data', mean_absolute_percentage_error(y_test, y_pred_test))

MAE for Training Data 0.04272265427705369
MAPE for Training Data 0.06857756648317821
