In [134]:
import pandas as pd
import math as math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go

import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


## Methodology

We're going to have the response variable as the "Target" column and have Graduate/Enrolled as a success or 1 and have Dropout as a fail or 0.
For our model, we are going to focus on 5 features for our predictors. These features are ['Marital Status', 'isDisplaced', 'Previous qualification', 'isDebtor', 'hasScholarship'].

### Logistic Regression for Dropout/Graduate Students

In [135]:
#df = pd.read_csv("C:/Users/samue_gelsjfg/Downloads/Leena_df1Clean.csv")
df = pd.read_csv("/Users/samuelmai/Downloads/Leena_df1Clean.csv")

In [136]:
df.head()

Unnamed: 0,Marital Status,Course,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Gender,...,Inflation rate,GDP,Target,isDisplaced,hasSpecialNeeds,tuitionToDate,hasScholarship,isDebtor,AttendanceTime,Nationality
0,Single,Animation and Multimedia Design,Secondary,122.0,Basic,Basic,5,9,127.3,Male,...,1.4,1.74,Dropout,True,False,True,False,False,Day,Portuguese
1,Single,Tourism,Secondary,160.0,Secondary,Higher,3,3,142.5,Male,...,-0.3,0.79,Graduate/Enrolled,True,False,False,False,False,Day,Portuguese
2,Single,Communication Design,Secondary,122.0,Basic,Basic,9,9,124.8,Male,...,1.4,1.74,Dropout,True,False,False,False,False,Day,Portuguese
3,Single,Journalism and Communication,Secondary,122.0,Basic,Basic,5,3,119.6,Female,...,-0.8,-3.12,Graduate/Enrolled,True,False,True,False,False,Day,Portuguese
4,Married,Social Service (evening attendance),Secondary,100.0,Basic,Basic,9,9,141.5,Female,...,-0.3,0.79,Graduate/Enrolled,False,False,True,False,False,Evening,Portuguese


In [137]:
df.dtypes

Marital Status                                     object
Course                                             object
Previous qualification                             object
Previous qualification (grade)                    float64
Mother's qualification                             object
Father's qualification                             object
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Gender                                             object
Age at enrollment                                   int64
International                                       int64
Curricular units 1st sem (credited)                 int64
Curricular units 1st sem (enrolled)                 int64
Curricular units 1st sem (evaluations)              int64
Curricular units 1st sem (approved)                 int64
Curricular units 1st sem (grade)                  float64
Curricular uni

In [138]:
print(df['Target'].unique())

# Change 'Graduate/Enrolled' = 1 and 'Dropout' = 0

df['Target'] = df['Target'].replace(['Dropout','Graduate/Enrolled'], ['0','1'])
print(df['Target'].unique())


['Dropout' 'Graduate/Enrolled']
['0' '1']


In [139]:
df['isDisplaced'] = df['isDisplaced'].astype(int)
df['hasSpecialNeeds'] = df['hasSpecialNeeds'].astype(int)
df['tuitionToDate'] = df['tuitionToDate'].astype(int)
df['hasScholarship'] = df['hasScholarship'].astype(int)
df['isDebtor'] = df['isDebtor'].astype(int)

In [140]:
print(df['Previous qualification'].unique())
# This is an ordinal variable, so we can use ordinal encoding and have "Below Secondary = 0, Secondary = 1, and Higher = 2"

encoder = OrdinalEncoder(categories=[["Below secondary", "Secondary", "Higher"]])

# Fit and transform the data
df['Previous qualification'] = encoder.fit_transform(df[['Previous qualification']])
print(df['Previous qualification'].unique())

['Secondary' 'Below secondary' 'Higher']
[1. 0. 2.]


In [141]:
print(df['Marital Status'].unique())

# We will use One Hot Encoding here to turn these values into numerical values

one_hot_encoded = pd.get_dummies(df['Marital Status'])
df = pd.concat([df, one_hot_encoded], axis=1)

# encoder = OneHotEncoder()

# # Fit and transform the data
# encoded_data = encoder.fit_transform(df[['Marital Status']])

# # Convert the encoded data to a DataFrame
# encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(['Marital Status']))

# # Concatenate the encoded DataFrame with the original DataFrame
# df = pd.concat([df, encoded_df], axis=1)

['Single' 'Married' 'Divorced' 'Widower' 'Facto Union' 'Legally Separated']


In [142]:
print(df["Mother's qualification"].unique())

# Since we don't know for some values, we are going to drop the column as a feature.

['Basic' 'Secondary' 'Higher' 'Unknown']


In [143]:
print(df["Father's qualification"].unique())

# Since we don't know for some values, we are going to drop the column as a feature.

['Basic' 'Higher' 'Secondary' 'Unknown']


In [144]:
print(df['Gender'].unique())

# We will be using binary encoding to have male = 1 and female = 0

df['Gender'] = df['Gender'].replace(['Female','Male'], [0,1])
print(df['Gender'].unique())

['Male' 'Female']
[1 0]


In [145]:
print(df['AttendanceTime'].unique())

df['AttendanceTime'] = df['AttendanceTime'].replace(['Day','Evening'], [0,1])

print(df['AttendanceTime'].unique())

['Day' 'Evening']
[0 1]


In [146]:
print(df['Nationality'].unique())

# One hot encode this

df_one_hot = pd.get_dummies(df['Nationality'])
df = pd.concat([df, df_one_hot], axis=1)


['Portuguese' 'Romanian' 'Spanish' 'Brazilian' 'Santomean' 'Ukrainian'
 'Dutch' 'Mozambican' 'Angolan' 'Mexican' 'Italian' 'Cape Verdean'
 'Turkish' 'Moldova (Republic of)' 'Guinean' 'Colombian' 'German' 'Cuban'
 'Russian' 'English' 'Lithuanian']


In [147]:
df.columns

Index(['Marital Status', 'Course', 'Previous qualification',
       'Previous qualification (grade)', 'Mother's qualification',
       'Father's qualification', 'Mother's occupation', 'Father's occupation',
       'Admission grade', 'Gender', 'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)', 'Unemployment rate',
       'Inflation rate', 'GDP', 'Target', 'isDisplaced', 'hasSpecialNeeds',
       'tuitionToDate', 'hasScholarship', 'isDeb

Now we can run our logistic regression after converting all our categorical variables into numerical values.

In [None]:
X = df.drop(['Target', 'Course', 'Nationality', 'Marital Status', "Mother's qualification", "Father's qualification"], axis=1)
y = df['Target']

In [153]:
X.dtypes

Previous qualification                            float64
Previous qualification (grade)                    float64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Gender                                              int64
Age at enrollment                                   int64
International                                       int64
Curricular units 1st sem (credited)                 int64
Curricular units 1st sem (enrolled)                 int64
Curricular units 1st sem (evaluations)              int64
Curricular units 1st sem (approved)                 int64
Curricular units 1st sem (grade)                  float64
Curricular units 1st sem (without evaluations)      int64
Curricular units 2nd sem (credited)                 int64
Curricular units 2nd sem (enrolled)                 int64
Curricular units 2nd sem (evaluations)              int64
Curricular uni

In [159]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression(max_iter = 10000)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
# precision = metrics.precision_score(y_test, y_pred, pos_label='1')
# recall = metrics.recall_score(y_test, y_pred)
# confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
# print(f'Confusion Matrix:\n{confusion_matrix}')

Accuracy: 0.8655367231638418


### Results

From our accuracy score, it seems that the features used are good at predicting whether someone will be Enrolled or Graduated. However, I fear that this is a case of overfitting so we might have to run it again with fewer features. The one hot encoded variables of Marital Status and Nationality may have been a reason for this with the large number of categorical values, we created very sparce matrices which isn't ideal for logistic regression.