In [50]:
import pandas as pd 

import sqlalchemy
from sqlalchemy import create_engine
engine = create_engine("sqlite:///open_university.sqlite")
conn = engine.connect()

import warnings
warnings.filterwarnings('always') 
warnings.filterwarnings('ignore') 

from sklearn.linear_model import LogisticRegression
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

# Performing ETL, converting int into str, merging similar kind of datasets

In [51]:
courses = pd.read_sql("SELECT * FROM courses", conn)
courses = courses.drop('index', axis=1)
courses.head()

Unnamed: 0,code_module,code_presentation,module_presentation_length
0,AAA,2013J,268
1,AAA,2014J,269
2,BBB,2013J,268
3,BBB,2014J,262
4,BBB,2013B,240


In [52]:
assessments = pd.read_sql("SELECT * FROM assessments", conn)
assessments = assessments.drop('index', axis=1)
assessments.head()

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0


In [53]:
assessments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code_module        206 non-null    object 
 1   code_presentation  206 non-null    object 
 2   id_assessment      206 non-null    int64  
 3   assessment_type    206 non-null    object 
 4   date               195 non-null    float64
 5   weight             206 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 9.8+ KB


In [54]:
assessments['id_assessment'] = assessments['id_assessment'].apply(lambda x: str(int(x)))

In [55]:
studentAssessment = pd.read_sql("SELECT * FROM studentAssessment", conn)
studentAssessment = studentAssessment.drop('index', axis=1)
studentAssessment.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,1752,11391,18,0,78.0
1,1752,28400,22,0,70.0
2,1752,31604,17,0,72.0
3,1752,32885,26,0,69.0
4,1752,38053,19,0,79.0


In [56]:
studentAssessment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id_assessment   173912 non-null  int64  
 1   id_student      173912 non-null  int64  
 2   date_submitted  173912 non-null  int64  
 3   is_banked       173912 non-null  int64  
 4   score           173739 non-null  float64
dtypes: float64(1), int64(4)
memory usage: 6.6 MB


In [57]:
studentAssessment['id_assessment'] = studentAssessment['id_assessment'].apply(lambda x: str(int(x)))
studentAssessment['id_student'] = studentAssessment['id_student'].apply(lambda x: str(int(x)))

In [58]:
# merging the two dataframes
df_assessments_merged = assessments.merge(studentAssessment, how = 'right')
df_assessments_merged.head()

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight,id_student,date_submitted,is_banked,score
0,AAA,2013J,1752,TMA,19.0,10.0,11391,18,0,78.0
1,AAA,2013J,1752,TMA,19.0,10.0,28400,22,0,70.0
2,AAA,2013J,1752,TMA,19.0,10.0,31604,17,0,72.0
3,AAA,2013J,1752,TMA,19.0,10.0,32885,26,0,69.0
4,AAA,2013J,1752,TMA,19.0,10.0,38053,19,0,79.0


In [59]:
df_assessments_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   code_module        173912 non-null  object 
 1   code_presentation  173912 non-null  object 
 2   id_assessment      173912 non-null  object 
 3   assessment_type    173912 non-null  object 
 4   date               171047 non-null  float64
 5   weight             173912 non-null  float64
 6   id_student         173912 non-null  object 
 7   date_submitted     173912 non-null  int64  
 8   is_banked          173912 non-null  int64  
 9   score              173739 non-null  float64
dtypes: float64(3), int64(2), object(5)
memory usage: 13.3+ MB


In [60]:
# create column with weighted score
df_assessments_merged['weighted_score'] = (df_assessments_merged['weight']*df_assessments_merged['score'])/100

In [61]:
df_assessments_merged.head()

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight,id_student,date_submitted,is_banked,score,weighted_score
0,AAA,2013J,1752,TMA,19.0,10.0,11391,18,0,78.0,7.8
1,AAA,2013J,1752,TMA,19.0,10.0,28400,22,0,70.0,7.0
2,AAA,2013J,1752,TMA,19.0,10.0,31604,17,0,72.0,7.2
3,AAA,2013J,1752,TMA,19.0,10.0,32885,26,0,69.0,6.9
4,AAA,2013J,1752,TMA,19.0,10.0,38053,19,0,79.0,7.9


In [62]:
assessments_final = sqldf("""SELECT code_module, code_presentation, id_student, SUM(weighted_score) as assessment_score FROM df_assessments_merged
            GROUP BY code_module, code_presentation, id_student""")

In [63]:
# Merge student_info with student_registration
studentInfo = pd.read_sql("SELECT * FROM studentInfo", conn)
studentInfo = studentInfo.drop('index', axis=1)
studentInfo.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [64]:
studentInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   code_module           32593 non-null  object
 1   code_presentation     32593 non-null  object
 2   id_student            32593 non-null  int64 
 3   gender                32593 non-null  object
 4   region                32593 non-null  object
 5   highest_education     32593 non-null  object
 6   imd_band              31482 non-null  object
 7   age_band              32593 non-null  object
 8   num_of_prev_attempts  32593 non-null  int64 
 9   studied_credits       32593 non-null  int64 
 10  disability            32593 non-null  object
 11  final_result          32593 non-null  object
dtypes: int64(3), object(9)
memory usage: 3.0+ MB


In [65]:
studentInfo['id_student'] = studentInfo['id_student'].apply(lambda x: str(int(x)))

In [66]:
studentRegistration = pd.read_sql("SELECT * FROM studentRegistration", conn)
studentRegistration = studentRegistration.drop('index', axis=1)
studentRegistration.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159.0,
1,AAA,2013J,28400,-53.0,
2,AAA,2013J,30268,-92.0,12.0
3,AAA,2013J,31604,-52.0,
4,AAA,2013J,32885,-176.0,


In [67]:
studentRegistration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code_module          32593 non-null  object 
 1   code_presentation    32593 non-null  object 
 2   id_student           32593 non-null  int64  
 3   date_registration    32548 non-null  float64
 4   date_unregistration  10072 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [68]:
studentRegistration['id_student'] = studentRegistration['id_student'].apply(lambda x: str(int(x)));

In [69]:
# merging the two dataframes
student_final = studentInfo.merge(studentRegistration, how = 'left')
student_final.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,-92.0,12.0
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,-52.0,
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,


In [70]:
student_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   code_module           32593 non-null  object 
 1   code_presentation     32593 non-null  object 
 2   id_student            32593 non-null  object 
 3   gender                32593 non-null  object 
 4   region                32593 non-null  object 
 5   highest_education     32593 non-null  object 
 6   imd_band              31482 non-null  object 
 7   age_band              32593 non-null  object 
 8   num_of_prev_attempts  32593 non-null  int64  
 9   studied_credits       32593 non-null  int64  
 10  disability            32593 non-null  object 
 11  final_result          32593 non-null  object 
 12  date_registration     32548 non-null  float64
 13  date_unregistration   10072 non-null  float64
dtypes: float64(2), int64(2), object(10)
memory usage: 3.5+ MB


In [71]:
# delete uneccessary information
del student_final['date_unregistration']

In [72]:
vle = pd.read_sql("SELECT * FROM vle", conn)
vle = vle.drop('index', axis=1)
vle.head()

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
0,546943,AAA,2013J,resource,,
1,546712,AAA,2013J,oucontent,,
2,546998,AAA,2013J,resource,,
3,546888,AAA,2013J,url,,
4,547035,AAA,2013J,resource,,


In [73]:
vle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6364 entries, 0 to 6363
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id_site            6364 non-null   int64  
 1   code_module        6364 non-null   object 
 2   code_presentation  6364 non-null   object 
 3   activity_type      6364 non-null   object 
 4   week_from          1121 non-null   float64
 5   week_to            1121 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 298.4+ KB


In [74]:
vle['id_site'] = vle['id_site'].apply(lambda x: str(int(x)))

In [75]:
studentVle = pd.read_sql("SELECT * FROM studentVle", conn)
studentVle = studentVle.drop('index', axis=1)
studentVle.head()

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1


In [76]:
studentVle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10655280 entries, 0 to 10655279
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code_module        object
 1   code_presentation  object
 2   id_student         int64 
 3   id_site            int64 
 4   date               int64 
 5   sum_click          int64 
dtypes: int64(4), object(2)
memory usage: 487.8+ MB


In [77]:
studentVle['id_site'] = studentVle['id_site'].apply(lambda x: str(int(x)))
studentVle['id_student'] = studentVle['id_student'].apply(lambda x: str(int(x)))

In [78]:
# merging the two dataframes
vle_merged = vle.merge(studentVle, how = 'right')
vle_merged.head()

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to,id_student,date,sum_click
0,546652,AAA,2013J,forumng,,,28400,-10,4
1,546652,AAA,2013J,forumng,,,28400,-10,1
2,546652,AAA,2013J,forumng,,,28400,-10,1
3,546614,AAA,2013J,homepage,,,28400,-10,11
4,546714,AAA,2013J,oucontent,,,28400,-10,1


In [79]:
vle_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10655280 entries, 0 to 10655279
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   id_site            object 
 1   code_module        object 
 2   code_presentation  object 
 3   activity_type      object 
 4   week_from          float64
 5   week_to            float64
 6   id_student         object 
 7   date               int64  
 8   sum_click          int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 731.6+ MB


We want total clicks of every student grouped by each module and year. Therefore we are going to create a dataframe without activity_type column. 

In [80]:
vle_clicks = sqldf("""SELECT code_module, code_presentation, id_student, sum(sum_click) as total_clicks FROM vle_merged
            GROUP BY code_module, code_presentation, id_student""")

In [81]:
vle_clicks.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29228 entries, 0 to 29227
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   code_module        29228 non-null  object
 1   code_presentation  29228 non-null  object
 2   id_student         29228 non-null  object
 3   total_clicks       29228 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 913.5+ KB


## Final Dataframe

Now, it is easy to merge all of them in one dataframe.

In [82]:
final_df = pd.merge(pd.merge(student_final,assessments_final,how='left'),vle_clicks,how='left')
final_df.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,assessment_score,total_clicks
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,82.4,934.0
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,65.4,1435.0
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,-92.0,,281.0
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,-52.0,76.3,2158.0
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,55.0,1034.0


In [83]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   code_module           32593 non-null  object 
 1   code_presentation     32593 non-null  object 
 2   id_student            32593 non-null  object 
 3   gender                32593 non-null  object 
 4   region                32593 non-null  object 
 5   highest_education     32593 non-null  object 
 6   imd_band              31482 non-null  object 
 7   age_band              32593 non-null  object 
 8   num_of_prev_attempts  32593 non-null  int64  
 9   studied_credits       32593 non-null  int64  
 10  disability            32593 non-null  object 
 11  final_result          32593 non-null  object 
 12  date_registration     32548 non-null  float64
 13  assessment_score      25820 non-null  float64
 14  total_clicks          29228 non-null  float64
dtypes: float64(3), int6

## Creating bins

We are going to categorize certain numerical columns in order to have a more easy visual manipulation of our dataset. Moreover, it would be easier to convert them in ordinal later for our model. 

### registration_date column
The days students have registered before each module varies from -322 to 110. We will categorize them as follow

registration_date   | Number of days
--- | ---
Very early | -322 - -160
Early | -161 - 0
Late | 1 - 160


In [84]:
## Defining the categories:
categories = pd.Series(['Very Early', 'Early', 'Late'])

## Applying these categories to the final dataframe:
final_df['registration'] = pd.cut(final_df.date_registration, bins = [-322,-160,0,160], labels=categories)

### studied_credits column

In [85]:
# find the min and max of the column
print('minimum:', final_df['studied_credits'].min(), end='\n')
print('maximum:', final_df['studied_credits'].max())

minimum: 30
maximum: 655


The credits of each student vary from 30 to 655. We will categorize them as follow

credits   | category
--- | ---
30 to 186 | 30-186
187 to 343 | 187-343
344 to 500 | 344-500
500 to 656 | 500-656

In [86]:
## Defining the categories label:
categories2 = pd.Series(['30-186', '187-343', '344-500', '500-656'])

## Applying these categories both to the auxiliary and to the working datasets:
final_df['studied_credits'] = pd.cut(final_df.studied_credits, bins = [30,186,343,500,656], labels=categories2)

### final_result column

We are going to categorise the final result into two categories. Those who withdrawn from the course with 1 and those who didn't with 0.

In [87]:
final_df = final_df.replace(['Pass', 'Withdrawn', 'Distinction', 'Fail'], [0, 1, 0, 0])    

### total_clicks column

In [88]:
# find the min and max of the column
print('minimum:', final_df['total_clicks'].min(), end='\n')
print('maximum:', final_df['total_clicks'].max())

minimum: 1.0
maximum: 24139.0


Total clicks of each student vary from 1 to 24139. We will categorize them as follow

clicks   | category
--- | ---
0 to 4023 | 0-4023
4024 to 8047 | 4024-8047
8048 to 12071 | 8048-12071
12072 to 16094 | 12072-16094
16095 to 20117 | 16095-20117
20118 to 24140 | 20118-24140

In [89]:
## Defining the categories label:
categories3 = pd.Series(['0-4.023', '4.024-8.047', '8.048-12.071', '12.072-16.094', '16.095-20.117', '20.118-24.140'])

## Applying these categories both to the auxiliary and to the working datasets:
final_df['total_clicks'] = pd.cut(final_df.total_clicks, bins = [0,4023,8047,12071,16094,20117,24140], labels=categories3)

### assessment_score column

In [90]:
# find the min and max of the column
print('minimum:', final_df['assessment_score'].min(), end='\n')
print('maximum:', final_df['assessment_score'].max())

minimum: 0.0
maximum: 200.0


Assessment score of each student vary from 0 to 200. We will categorize them as follow

score  | category
--- | ---
0 to 50 | 0-50
51 to 100 | 51-100
101 to 150 | 101-150
151 to 200 | 151-200

In [91]:
## Defining the categories label:
categories4 = pd.Series(['0-50', '51-100', '101-150', '151-200'])

## Applying these categories both to the auxiliary and to the working datasets:
final_df['assessment_score'] = pd.cut(final_df.assessment_score, bins = [0,50,100,150,200], labels=categories4)

In [92]:
# delete uneccessary information
del final_df['date_registration']
del final_df['id_student']
del final_df['code_presentation']

In [93]:
# convert all columns to object types
final_df['num_of_prev_attempts'] = final_df['num_of_prev_attempts'].apply(str)
final_df['final_result'] = final_df['final_result'].apply(str)
final_df["studied_credits"] = final_df["studied_credits"].astype("str")
final_df["assessment_score"] = final_df["assessment_score"].astype("str")
final_df["total_clicks"] = final_df["total_clicks"].astype("str")
final_df["registration"] = final_df["registration"].astype("str")
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   code_module           32593 non-null  object
 1   gender                32593 non-null  object
 2   region                32593 non-null  object
 3   highest_education     32593 non-null  object
 4   imd_band              31482 non-null  object
 5   age_band              32593 non-null  object
 6   num_of_prev_attempts  32593 non-null  object
 7   studied_credits       32593 non-null  object
 8   disability            32593 non-null  object
 9   final_result          32593 non-null  object
 10  assessment_score      32593 non-null  object
 11  total_clicks          32593 non-null  object
 12  registration          32593 non-null  object
dtypes: object(13)
memory usage: 3.2+ MB


In [94]:
final_df.head()

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
0,AAA,M,East Anglian Region,HE Qualification,90-100%,55<=,0,187-343,N,0,51-100,0-4.023,Early
1,AAA,F,Scotland,HE Qualification,20-30%,35-55,0,30-186,N,0,51-100,0-4.023,Early
2,AAA,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,30-186,Y,1,,0-4.023,Early
3,AAA,F,South East Region,A Level or Equivalent,50-60%,35-55,0,30-186,N,0,51-100,0-4.023,Early
4,AAA,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,30-186,N,0,51-100,0-4.023,Very Early


## Training And Testing Dataframes

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
# get the locations
X = final_df.iloc[:, :]
y = final_df.iloc[:, :]
 
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0)

In [97]:
X_train.describe(include=['O'])

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
count,22815,22815,22815,22815,22043,22815,22815,22815,22815,22815,22815,22815,22815
unique,7,2,13,5,10,3,7,5,2,2,5,7,4
top,BBB,M,Scotland,A Level or Equivalent,20-30%,0-35,0,30-186,N,0,51-100,0-4.023,Early
freq,5550,12454,2381,9859,2573,16083,19859,19842,20622,15696,7844,18989,21614


## Replacing all values with ordinal values.

In [98]:
combine = [X_train, X_test]

In [99]:
module_converting = {'AAA':1, 'BBB':2, 'CCC':3, 'DDD':4, 'EEE':5, 'FFF':6, 'GGG':7}
for dataset in combine:
    dataset['code_module'] = dataset['code_module'].map(module_converting)
    dataset['code_module'] = dataset['code_module'].fillna(2)

X_train.head()

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,F,North Region,A Level or Equivalent,70-80%,0-35,0,30-186,N,0,0-50,0-4.023,Early
26235,6,M,London Region,A Level or Equivalent,40-50%,35-55,0,30-186,N,1,,0-4.023,Early
12392,3,M,North Region,A Level or Equivalent,0-10%,0-35,0,30-186,N,1,,,Early
20245,5,F,South Region,A Level or Equivalent,70-80%,0-35,0,30-186,N,0,51-100,0-4.023,Early
20005,5,M,North Region,A Level or Equivalent,30-40%,0-35,0,30-186,N,0,0-50,0-4.023,Early


In [100]:
module_converting = {'F':0, 'M':1}
for dataset in combine:
    dataset['gender'] = dataset['gender'].map(module_converting)
    dataset['gender'] = dataset['gender'].fillna(1)

X_train.head()

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,North Region,A Level or Equivalent,70-80%,0-35,0,30-186,N,0,0-50,0-4.023,Early
26235,6,1,London Region,A Level or Equivalent,40-50%,35-55,0,30-186,N,1,,0-4.023,Early
12392,3,1,North Region,A Level or Equivalent,0-10%,0-35,0,30-186,N,1,,,Early
20245,5,0,South Region,A Level or Equivalent,70-80%,0-35,0,30-186,N,0,51-100,0-4.023,Early
20005,5,1,North Region,A Level or Equivalent,30-40%,0-35,0,30-186,N,0,0-50,0-4.023,Early


In [101]:
module_converting = {'HE Qualification':1, 'A Level or Equivalent':2, 'Lower Than A Level':3, 'Post Graduate Qualification':4, 'No Formal quals':5}
for dataset in combine:
    dataset['highest_education'] = dataset['highest_education'].map(module_converting)
    dataset['highest_education'] = dataset['highest_education'].fillna(2)

X_train.head()  
 

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,North Region,2,70-80%,0-35,0,30-186,N,0,0-50,0-4.023,Early
26235,6,1,London Region,2,40-50%,35-55,0,30-186,N,1,,0-4.023,Early
12392,3,1,North Region,2,0-10%,0-35,0,30-186,N,1,,,Early
20245,5,0,South Region,2,70-80%,0-35,0,30-186,N,0,51-100,0-4.023,Early
20005,5,1,North Region,2,30-40%,0-35,0,30-186,N,0,0-50,0-4.023,Early


In [102]:
module_converting = {'0-10%':0, '10-20%':1, '20-30%':2, '30-40%':3, '40-50%':4, '50-60%':5, '60-70%':6, '70-80%':7, '80-90%':8, '90-100%':9}
for dataset in combine:
    dataset['imd_band'] = dataset['imd_band'].map(module_converting)
    dataset['imd_band'] = dataset['imd_band'].fillna(2)

X_train.head()      
  

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,North Region,2,7.0,0-35,0,30-186,N,0,0-50,0-4.023,Early
26235,6,1,London Region,2,4.0,35-55,0,30-186,N,1,,0-4.023,Early
12392,3,1,North Region,2,0.0,0-35,0,30-186,N,1,,,Early
20245,5,0,South Region,2,7.0,0-35,0,30-186,N,0,51-100,0-4.023,Early
20005,5,1,North Region,2,3.0,0-35,0,30-186,N,0,0-50,0-4.023,Early


In [103]:
module_converting = {'0-35':1, '35-55':2, '55<=':3}
for dataset in combine:
    dataset['age_band'] = dataset['age_band'].map(module_converting)
    dataset['age_band'] = dataset['age_band'].fillna(1)

X_train.head()    

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,North Region,2,7.0,1,0,30-186,N,0,0-50,0-4.023,Early
26235,6,1,London Region,2,4.0,2,0,30-186,N,1,,0-4.023,Early
12392,3,1,North Region,2,0.0,1,0,30-186,N,1,,,Early
20245,5,0,South Region,2,7.0,1,0,30-186,N,0,51-100,0-4.023,Early
20005,5,1,North Region,2,3.0,1,0,30-186,N,0,0-50,0-4.023,Early


In [104]:
module_converting = {'Very Early':1, 'Early':2, 'Late':3}
for dataset in combine:
    dataset['registration'] = dataset['registration'].map(module_converting)
    dataset['registration'] = dataset['registration'].fillna(2)

X_train.head()    

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,North Region,2,7.0,1,0,30-186,N,0,0-50,0-4.023,2.0
26235,6,1,London Region,2,4.0,2,0,30-186,N,1,,0-4.023,2.0
12392,3,1,North Region,2,0.0,1,0,30-186,N,1,,,2.0
20245,5,0,South Region,2,7.0,1,0,30-186,N,0,51-100,0-4.023,2.0
20005,5,1,North Region,2,3.0,1,0,30-186,N,0,0-50,0-4.023,2.0


In [105]:
module_converting = {'N':0, 'Y':1}
for dataset in combine:
    dataset['disability'] = dataset['disability'].map(module_converting)
    dataset['disability'] = dataset['disability'].fillna(0)

X_train.head()

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,North Region,2,7.0,1,0,30-186,0,0,0-50,0-4.023,2.0
26235,6,1,London Region,2,4.0,2,0,30-186,0,1,,0-4.023,2.0
12392,3,1,North Region,2,0.0,1,0,30-186,0,1,,,2.0
20245,5,0,South Region,2,7.0,1,0,30-186,0,0,51-100,0-4.023,2.0
20005,5,1,North Region,2,3.0,1,0,30-186,0,0,0-50,0-4.023,2.0


We have many regions, so we are going to group them.. Regions that are close to each other are categorised together.

In [106]:
module_converting = {'Yorkshire Region':1, 'Scotland':1, 'North Western Region':1, 'North Region':1, 'Ireland':2, 'South Region':3,  'South West Region':3, 'South East Region':3, 'West Midlands Region':3, 'London Region':3, 'East Anglian Region':4, 'East Midlands Region':4, 'Wales':4}
for dataset in combine:
    dataset['region'] = dataset['region'].map(module_converting)
    dataset['region'] = dataset['region'].fillna(1)

X_train.head()     

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,1,2,7.0,1,0,30-186,0,0,0-50,0-4.023,2.0
26235,6,1,3,2,4.0,2,0,30-186,0,1,,0-4.023,2.0
12392,3,1,1,2,0.0,1,0,30-186,0,1,,,2.0
20245,5,0,3,2,7.0,1,0,30-186,0,0,51-100,0-4.023,2.0
20005,5,1,1,2,3.0,1,0,30-186,0,0,0-50,0-4.023,2.0


In [107]:
module_converting = {'0-4.023':1, '4.024-8.047':2, '8.048-12.071':3, '12.072-16.094':4, '16.095-20.117':5, '20.118-24.140':6}
for dataset in combine:
    dataset['total_clicks'] = dataset['total_clicks'].map(module_converting)
    dataset['total_clicks'] = dataset['total_clicks'].fillna(1)

X_train.head()   

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,1,2,7.0,1,0,30-186,0,0,0-50,1.0,2.0
26235,6,1,3,2,4.0,2,0,30-186,0,1,,1.0,2.0
12392,3,1,1,2,0.0,1,0,30-186,0,1,,1.0,2.0
20245,5,0,3,2,7.0,1,0,30-186,0,0,51-100,1.0,2.0
20005,5,1,1,2,3.0,1,0,30-186,0,0,0-50,1.0,2.0


In [108]:
module_converting = {'0-50':1, '51-100':2, '101-150':3, '151-200':4}
for dataset in combine:
    dataset['assessment_score'] = dataset['assessment_score'].map(module_converting)
    dataset['assessment_score'] = dataset['assessment_score'].fillna(1)

X_train.head() 

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,1,2,7.0,1,0,30-186,0,0,1.0,1.0,2.0
26235,6,1,3,2,4.0,2,0,30-186,0,1,1.0,1.0,2.0
12392,3,1,1,2,0.0,1,0,30-186,0,1,1.0,1.0,2.0
20245,5,0,3,2,7.0,1,0,30-186,0,0,2.0,1.0,2.0
20005,5,1,1,2,3.0,1,0,30-186,0,0,1.0,1.0,2.0


In [109]:
module_converting = {'30-186':1, '187-343':2, '344-500':3, '500-656':4}
for dataset in combine:
    dataset['studied_credits'] = dataset['studied_credits'].map(module_converting)
    dataset['studied_credits'] = dataset['studied_credits'].fillna(1)
X_train.head()   

Unnamed: 0,code_module,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,assessment_score,total_clicks,registration
4166,2,0,1,2,7.0,1,0,1.0,0,0,1.0,1.0,2.0
26235,6,1,3,2,4.0,2,0,1.0,0,1,1.0,1.0,2.0
12392,3,1,1,2,0.0,1,0,1.0,0,1,1.0,1.0,2.0
20245,5,0,3,2,7.0,1,0,1.0,0,0,2.0,1.0,2.0
20005,5,1,1,2,3.0,1,0,1.0,0,0,1.0,1.0,2.0


# Using Logistic regression model for prediction

In [110]:
X_train_r = X_train.drop("final_result", axis=1)
Y_train_r = X_train["final_result"]
X_test_r = X_test.drop("final_result", axis=1).copy()
X_train_r.shape, Y_train_r.shape, X_test_r.shape

((22815, 12), (22815,), (9778, 12))

Accuracy is 81.15

In [111]:
logreg = LogisticRegression()
logreg.fit(X_train_r, Y_train_r)
Y_pred = logreg.predict(X_test_r)
acc_log = round(logreg.score(X_train_r, Y_train_r) * 100, 2)
acc_log

81.15

**COEFFICIENT**
We can observe from the coefficients below that:
Disability has the highest positive correlation, that means that when the student has a disability,  increases the probability to drop out from the course. 
Assessment score has the highest negative correlation, which means that when Assessment score increases then probability to drop out of the course decreases.

In [112]:
coeff_df = pd.DataFrame(X_train.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
7,disability,1.069622
1,region,0.554217
8,final_result,0.220115
4,age_band,0.011615
2,highest_education,0.006146
5,num_of_prev_attempts,-0.055242
3,imd_band,-0.058045
6,studied_credits,-0.20822
0,gender,-0.283133
10,total_clicks,-0.733081
