In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

#Create SQLAlchemy engine to connect to SQLite database
engine = create_engine("sqlite:///open_university_small.sqlite")

In [2]:
#Query to select data from 'studentInfo' table
query = "SELECT * FROM studentInfo"

#Read data from 'studentInfo' table in dataframe
df = pd.read_sql_query(query, engine)

#Close SQLAlchemy engine

In [3]:
#Display dataframe
df.head()

Unnamed: 0,index,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [4]:
#New dataframe with only relevant columns
new_df = df[['region', 'highest_education', 'imd_band', 'final_result']]
new_df.head()

Unnamed: 0,region,highest_education,imd_band,final_result
0,East Anglian Region,HE Qualification,90-100%,Pass
1,Scotland,HE Qualification,20-30%,Pass
2,North Western Region,A Level or Equivalent,30-40%,Withdrawn
3,South East Region,A Level or Equivalent,50-60%,Pass
4,West Midlands Region,Lower Than A Level,50-60%,Pass


In [5]:
#unique values in each column
df.nunique()

index                   32593
code_module                 7
code_presentation           4
id_student              28785
gender                      2
region                     13
highest_education           5
imd_band                   10
age_band                    3
num_of_prev_attempts        7
studied_credits            61
disability                  2
final_result                4
dtype: int64

In [6]:
#region value counts for binning
region_counts = new_df['region'].value_counts()
region_counts

Scotland                3446
East Anglian Region     3340
London Region           3216
South Region            3092
North Western Region    2906
West Midlands Region    2582
South West Region       2436
East Midlands Region    2365
South East Region       2111
Wales                   2086
Yorkshire Region        2006
North Region            1823
Ireland                 1184
Name: region, dtype: int64

In [7]:
#highest_education value counts for binning
highest_education_counts = new_df['highest_education'].value_counts()
highest_education_counts

A Level or Equivalent          14045
Lower Than A Level             13158
HE Qualification                4730
No Formal quals                  347
Post Graduate Qualification      313
Name: highest_education, dtype: int64

In [8]:
#imd_band value counts for binning
imd_band_counts = new_df['imd_band'].value_counts()
imd_band_counts

20-30%     3654
30-40%     3539
10-20      3516
0-10%      3311
40-50%     3256
50-60%     3124
60-70%     2905
70-80%     2879
80-90%     2762
90-100%    2536
Name: imd_band, dtype: int64

In [9]:
#final_result value counts for binning
final_result_counts = new_df['final_result'].value_counts()
final_result_counts

Pass           12361
Withdrawn      10156
Fail            7052
Distinction     3024
Name: final_result, dtype: int64

LOGISTIC REGRESSION MODEL IMD_BAND AND FINAL_RESULT

In [10]:
#Import modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import tree

In [11]:
#Dataframe for logistic regression model 
log_reg_df = df[['imd_band', 'final_result']]
log_reg_df.head()

Unnamed: 0,imd_band,final_result
0,90-100%,Pass
1,20-30%,Pass
2,30-40%,Withdrawn
3,50-60%,Pass
4,50-60%,Pass


In [12]:
#convert categorical data to numerical values; final_result
log_reg_df=log_reg_df.replace(['Pass', 'Withdrawn', 'Fail', 'Distinction'], [1, 0, 0, 1])
log_reg_df.head()

Unnamed: 0,imd_band,final_result
0,90-100%,1
1,20-30%,1
2,30-40%,0
3,50-60%,1
4,50-60%,1


In [13]:
# Split the data into X (feature) and y (target)
y = log_reg_df['final_result']
X = log_reg_df.drop(columns=['final_result'])


In [14]:
#Encode the categorical data 
X = pd.get_dummies(X)

In [15]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [16]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [17]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [18]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[8756 4095]
 [6551 5042]]


In [19]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[2932 1425]
 [2161 1631]]


In [20]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.57      0.68      0.62     12851
           1       0.55      0.43      0.49     11593

    accuracy                           0.56     24444
   macro avg       0.56      0.56      0.55     24444
weighted avg       0.56      0.56      0.56     24444



In [21]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.58      0.67      0.62      4357
           1       0.53      0.43      0.48      3792

    accuracy                           0.56      8149
   macro avg       0.55      0.55      0.55      8149
weighted avg       0.56      0.56      0.55      8149



PREDICTING FINAL_RESULT BASED ON STUDENT CHARACTERISTICS

Preprocessing Data

In [22]:
#create new dataframe
stu_char_df = df[['region', 'highest_education', 'imd_band', 'gender', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result']]
stu_char_df.head()

Unnamed: 0,region,highest_education,imd_band,gender,num_of_prev_attempts,studied_credits,disability,final_result
0,East Anglian Region,HE Qualification,90-100%,M,0,240,N,Pass
1,Scotland,HE Qualification,20-30%,F,0,60,N,Pass
2,North Western Region,A Level or Equivalent,30-40%,F,0,60,Y,Withdrawn
3,South East Region,A Level or Equivalent,50-60%,F,0,60,N,Pass
4,West Midlands Region,Lower Than A Level,50-60%,F,0,60,N,Pass


In [23]:
#convert categorical data to numerical values; final_result
stu_char_df=stu_char_df.replace(['Pass', 'Withdrawn', 'Fail', 'Distinction'], [1, 0, 0, 1])
stu_char_df.head()

Unnamed: 0,region,highest_education,imd_band,gender,num_of_prev_attempts,studied_credits,disability,final_result
0,East Anglian Region,HE Qualification,90-100%,M,0,240,N,1
1,Scotland,HE Qualification,20-30%,F,0,60,N,1
2,North Western Region,A Level or Equivalent,30-40%,F,0,60,Y,0
3,South East Region,A Level or Equivalent,50-60%,F,0,60,N,1
4,West Midlands Region,Lower Than A Level,50-60%,F,0,60,N,1


In [24]:
#unique values in each column
stu_char_df.nunique()

region                  13
highest_education        5
imd_band                10
gender                   2
num_of_prev_attempts     7
studied_credits         61
disability               2
final_result             2
dtype: int64

In [25]:
#number of rows in each column
stu_char_df.count()

region                  32593
highest_education       32593
imd_band                31482
gender                  32593
num_of_prev_attempts    32593
studied_credits         32593
disability              32593
final_result            32593
dtype: int64

In [26]:
#number of null values in each column
stu_char_df.isnull().sum()

region                     0
highest_education          0
imd_band                1111
gender                     0
num_of_prev_attempts       0
studied_credits            0
disability                 0
final_result               0
dtype: int64

In [27]:
#values in each column
list = ['region', 'highest_education', 'imd_band', 'gender', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result']
for i in list:
    print(stu_char_df[i].unique())

['East Anglian Region' 'Scotland' 'North Western Region'
 'South East Region' 'West Midlands Region' 'Wales' 'North Region'
 'South Region' 'Ireland' 'South West Region' 'East Midlands Region'
 'Yorkshire Region' 'London Region']
['HE Qualification' 'A Level or Equivalent' 'Lower Than A Level'
 'Post Graduate Qualification' 'No Formal quals']
['90-100%' '20-30%' '30-40%' '50-60%' '80-90%' '70-80%' None '60-70%'
 '40-50%' '10-20' '0-10%']
['M' 'F']
[0 1 2 4 3 5 6]
[240  60 120  90 150 180 345 420 170  80  75 300 330 210 270 360 135  70
 225 585 325 130 195 105 655 165 100 390 220 160 250  30  40  45 400 235
 145 630 355  50 110 115  55  85 480 280 175  95 155 190 315 200 140 540
 310 370 205 215 255  65 430]
['N' 'Y']
[1 0]


In [28]:
#remove rows with null values
stu_char_df_n = stu_char_df.dropna()
stu_char_df_n.isnull().sum()

region                  0
highest_education       0
imd_band                0
gender                  0
num_of_prev_attempts    0
studied_credits         0
disability              0
final_result            0
dtype: int64

In [29]:
#remove rows with 'None' in 'imd_band'
stu_char_df_n = stu_char_df_n[stu_char_df_n['imd_band'] != 'None']
for i in list:
    print(stu_char_df_n[i].unique())

['East Anglian Region' 'Scotland' 'North Western Region'
 'South East Region' 'West Midlands Region' 'Wales' 'South Region'
 'South West Region' 'East Midlands Region' 'Yorkshire Region'
 'London Region' 'North Region' 'Ireland']
['HE Qualification' 'A Level or Equivalent' 'Lower Than A Level'
 'Post Graduate Qualification' 'No Formal quals']
['90-100%' '20-30%' '30-40%' '50-60%' '80-90%' '70-80%' '60-70%' '40-50%'
 '10-20' '0-10%']
['M' 'F']
[0 1 2 4 3 5 6]
[240  60 120  90 150 180 345 420 170  80  75 300 330 210 270 360 135  70
 225 585 325 130 195 105 655 165 100 390 220 160 250  30  40  45 400 235
 145 630 355  50 110 115  55 280 175  95 155 190 200 140 540 310  85 370
 205 215 255  65 430]
['N' 'Y']
[1 0]


In [30]:
#count of unique values in 'studied_credits'
stu_char_df_n['studied_credits'].value_counts()


60     16157
120     6143
30      3644
90      3003
180      810
150      739
240      219
210      164
75       108
70        88
270       41
45        36
135       35
130       35
300       27
100       27
80        25
40        21
105       13
160       13
110       11
195       10
330       10
165        9
220        9
50         7
145        7
360        6
170        6
190        6
140        5
115        5
95         3
280        3
55         3
85         3
200        3
250        3
225        3
400        2
205        2
370        1
310        1
540        1
215        1
255        1
65         1
325        1
155        1
175        1
585        1
355        1
630        1
235        1
345        1
420        1
390        1
655        1
430        1
Name: studied_credits, dtype: int64

In [31]:
#binning 'studied_credits'
bins = [0, 60, 120, 180, 240, 300]
labels = ['0-60', '61-120', '121-180', '181-240', '241-300']
stu_char_df_n['studied_credits'] = pd.cut(stu_char_df_n['studied_credits'], bins=bins, labels=labels)


Fitting the Log Reg Model

In [32]:
# Split the data into X (feature) and y (target)
y = stu_char_df_n['final_result']
X = stu_char_df_n.drop(columns=['final_result'])

#Encode the categorical data 
X = pd.get_dummies(X)

# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [33]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [34]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9, max_iter=1000, solver='newton-cg')

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [35]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [36]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[8583 4036]
 [5181 5811]]


In [37]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[2822 1386]
 [1741 1922]]


In [38]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.62      0.68      0.65     12619
           1       0.59      0.53      0.56     10992

    accuracy                           0.61     23611
   macro avg       0.61      0.60      0.60     23611
weighted avg       0.61      0.61      0.61     23611



In [39]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.62      0.67      0.64      4208
           1       0.58      0.52      0.55      3663

    accuracy                           0.60      7871
   macro avg       0.60      0.60      0.60      7871
weighted avg       0.60      0.60      0.60      7871



DECISION TREE

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load your data
y = stu_char_df_n['final_result']
X = stu_char_df_n.drop(['final_result'], axis=1)

#Encode the categorical data 
X = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Decision Tree model
tree_model = DecisionTreeClassifier()

# Train the model
tree_model.fit(X_train, y_train)

# Predict on the test set
y_pred = tree_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Decision Tree model: {accuracy:.2f}')


Accuracy of the Decision Tree model: 0.58


In [41]:
#Visualize the tree
from sklearn import tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
tree.plot_tree(tree_model, filled=True, feature_names=X.columns, class_names=str(tree_model.classes_))
plt.show()


InvalidParameterError: The 'feature_names' parameter of plot_tree must be an instance of 'list' or None. Got Index(['num_of_prev_attempts', 'region_East Anglian Region',
       'region_East Midlands Region', 'region_Ireland', 'region_London Region',
       'region_North Region', 'region_North Western Region', 'region_Scotland',
       'region_South East Region', 'region_South Region',
       'region_South West Region', 'region_Wales',
       'region_West Midlands Region', 'region_Yorkshire Region',
       'highest_education_A Level or Equivalent',
       'highest_education_HE Qualification',
       'highest_education_Lower Than A Level',
       'highest_education_No Formal quals',
       'highest_education_Post Graduate Qualification', 'imd_band_0-10%',
       'imd_band_10-20', 'imd_band_20-30%', 'imd_band_30-40%',
       'imd_band_40-50%', 'imd_band_50-60%', 'imd_band_60-70%',
       'imd_band_70-80%', 'imd_band_80-90%', 'imd_band_90-100%', 'gender_F',
       'gender_M', 'studied_credits_0-60', 'studied_credits_61-120',
       'studied_credits_121-180', 'studied_credits_181-240',
       'studied_credits_241-300', 'disability_N', 'disability_Y'],
      dtype='object') instead.

<Figure size 2000x1000 with 0 Axes>