In [3]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

#Create SQLAlchemy engine to connect to SQLite database
engine = create_engine("sqlite:///open_university_small.sqlite")

In [4]:
#Query to select data from 'studentInfo' table
query = "SELECT * FROM studentInfo"

#Read data from 'studentInfo' table in dataframe
df = pd.read_sql_query(query, engine)

#Close SQLAlchemy engine

In [5]:
#Display dataframe
df.head()

Unnamed: 0,index,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [6]:
#New dataframe with only relevant columns
new_df = df[['region', 'highest_education', 'imd_band', 'final_result']]
new_df.head()

Unnamed: 0,region,highest_education,imd_band,final_result
0,East Anglian Region,HE Qualification,90-100%,Pass
1,Scotland,HE Qualification,20-30%,Pass
2,North Western Region,A Level or Equivalent,30-40%,Withdrawn
3,South East Region,A Level or Equivalent,50-60%,Pass
4,West Midlands Region,Lower Than A Level,50-60%,Pass


In [7]:
#unique values in each column
df.nunique()

index                   32593
code_module                 7
code_presentation           4
id_student              28785
gender                      2
region                     13
highest_education           5
imd_band                   10
age_band                    3
num_of_prev_attempts        7
studied_credits            61
disability                  2
final_result                4
dtype: int64

In [8]:
#region value counts for binning
region_counts = new_df['region'].value_counts()
region_counts

Scotland                3446
East Anglian Region     3340
London Region           3216
South Region            3092
North Western Region    2906
West Midlands Region    2582
South West Region       2436
East Midlands Region    2365
South East Region       2111
Wales                   2086
Yorkshire Region        2006
North Region            1823
Ireland                 1184
Name: region, dtype: int64

In [9]:
#highest_education value counts for binning
highest_education_counts = new_df['highest_education'].value_counts()
highest_education_counts

A Level or Equivalent          14045
Lower Than A Level             13158
HE Qualification                4730
No Formal quals                  347
Post Graduate Qualification      313
Name: highest_education, dtype: int64

In [10]:
#imd_band value counts for binning
imd_band_counts = new_df['imd_band'].value_counts()
imd_band_counts

20-30%     3654
30-40%     3539
10-20      3516
0-10%      3311
40-50%     3256
50-60%     3124
60-70%     2905
70-80%     2879
80-90%     2762
90-100%    2536
Name: imd_band, dtype: int64

In [11]:
#final_result value counts for binning
final_result_counts = new_df['final_result'].value_counts()
final_result_counts

Pass           12361
Withdrawn      10156
Fail            7052
Distinction     3024
Name: final_result, dtype: int64

LOGISTIC REGRESSION MODEL IMD_BAND AND FINAL_RESULT

In [12]:
#Import modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import tree

In [13]:
#Dataframe for logistic regression model 
log_reg_df = df[['imd_band', 'final_result']]
log_reg_df.head()

Unnamed: 0,imd_band,final_result
0,90-100%,Pass
1,20-30%,Pass
2,30-40%,Withdrawn
3,50-60%,Pass
4,50-60%,Pass


In [14]:
#convert categorical data to numerical values; final_result
log_reg_df=log_reg_df.replace(['Pass', 'Withdrawn', 'Fail', 'Distinction'], [1, 0, 0, 1])
log_reg_df.head()

Unnamed: 0,imd_band,final_result
0,90-100%,1
1,20-30%,1
2,30-40%,0
3,50-60%,1
4,50-60%,1


In [15]:
# Split the data into X (feature) and y (target)
y = log_reg_df['final_result']
X = log_reg_df.drop(columns=['final_result'])


In [16]:
#Encode the categorical data 
X = pd.get_dummies(X)

In [17]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [18]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [19]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [20]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[8779 4177]
 [6477 5011]]


In [21]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[2909 1343]
 [2235 1662]]


In [22]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.58      0.68      0.62     12956
           1       0.55      0.44      0.48     11488

    accuracy                           0.56     24444
   macro avg       0.56      0.56      0.55     24444
weighted avg       0.56      0.56      0.56     24444



In [23]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.57      0.68      0.62      4252
           1       0.55      0.43      0.48      3897

    accuracy                           0.56      8149
   macro avg       0.56      0.56      0.55      8149
weighted avg       0.56      0.56      0.55      8149



PREDICTING FINAL_RESULT BASED ON STUDENT CHARACTERISTICS

In [24]:
#create new dataframe
stu_char_df = df[['region', 'highest_education', 'imd_band', 'gender', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result']]
stu_char_df.head()

Unnamed: 0,region,highest_education,imd_band,gender,num_of_prev_attempts,studied_credits,disability,final_result
0,East Anglian Region,HE Qualification,90-100%,M,0,240,N,Pass
1,Scotland,HE Qualification,20-30%,F,0,60,N,Pass
2,North Western Region,A Level or Equivalent,30-40%,F,0,60,Y,Withdrawn
3,South East Region,A Level or Equivalent,50-60%,F,0,60,N,Pass
4,West Midlands Region,Lower Than A Level,50-60%,F,0,60,N,Pass


In [25]:
#convert categorical data to numerical values; final_result
stu_char_df=stu_char_df.replace(['Pass', 'Withdrawn', 'Fail', 'Distinction'], [1, 0, 0, 1])
stu_char_df.head()

Unnamed: 0,region,highest_education,imd_band,gender,num_of_prev_attempts,studied_credits,disability,final_result
0,East Anglian Region,HE Qualification,90-100%,M,0,240,N,1
1,Scotland,HE Qualification,20-30%,F,0,60,N,1
2,North Western Region,A Level or Equivalent,30-40%,F,0,60,Y,0
3,South East Region,A Level or Equivalent,50-60%,F,0,60,N,1
4,West Midlands Region,Lower Than A Level,50-60%,F,0,60,N,1


In [26]:
# Split the data into X (feature) and y (target)
y = stu_char_df['final_result']
X = stu_char_df.drop(columns=['final_result'])

#Encode the categorical data 
X = pd.get_dummies(X)

# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [27]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [30]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[8444 4440]
 [5126 6434]]


In [31]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[2829 1495]
 [1682 2143]]


In [32]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.62      0.66      0.64     12884
           1       0.59      0.56      0.57     11560

    accuracy                           0.61     24444
   macro avg       0.61      0.61      0.61     24444
weighted avg       0.61      0.61      0.61     24444



In [33]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.63      0.65      0.64      4324
           1       0.59      0.56      0.57      3825

    accuracy                           0.61      8149
   macro avg       0.61      0.61      0.61      8149
weighted avg       0.61      0.61      0.61      8149

