In [2]:
# Import dependencies.
import matplotlib
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import pandas as pd

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy import create_engine, text

In [3]:
engine = create_engine("sqlite:///open_university.sqlite", echo=False)

## Load the data

The plan is to just look at the interaction with the course material to see if the student will be able to predict if the student will pass or fail.

Not including information about the background of the student (demographics) or the scores of the student assessments but include if they submit the assessments or not.

We are going to filter out students that withdraw from the course before the start of the course.

In [4]:
student_ds = engine.execute(text("""
SELECT sI.code_module, sI.code_presentation, sI.id_student, sI.final_result, sR.date_registration, sR.date_unregistration
FROM studentInfo as sI
LEFT JOIN studentRegistration as sR ON sI.id_student = sR.id_student
WHERE NOT (sR.date_unregistration <= -11 AND sI.final_result = 'Withdrawn')
""")).fetchall()

# Remove all students that withdraw before 11 days before the course
# these students would not have interacted with the course material

student_df = pd.DataFrame(student_ds, columns=['code_module', 'code_presentation', 'id_student', 'final_result', 'date_registration', 'date_unregistration'])
student_df = student_df.astype({'code_module':'string', 'code_presentation':'string', 'id_student':'string', 'final_result':'string'})

In [5]:
display(student_df)
display(student_df.describe())
display(student_df.nunique())
display(student_df['final_result'].value_counts())
student_df.info()

Unnamed: 0,code_module,code_presentation,id_student,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,Pass,-159.0,
1,AAA,2013J,28400,Pass,-53.0,
2,AAA,2013J,30268,Withdrawn,-92.0,12.0
3,AAA,2013J,31604,Pass,-52.0,
4,AAA,2013J,32885,Pass,-176.0,
...,...,...,...,...,...,...
36683,GGG,2014J,2640965,Fail,-4.0,
36684,GGG,2014J,2645731,Distinction,-23.0,
36685,GGG,2014J,2648187,Pass,-129.0,
36686,GGG,2014J,2679821,Withdrawn,-49.0,101.0


Unnamed: 0,date_registration,date_unregistration
count,36666.0,11592.0
mean,-68.496073,73.590666
std,49.179962,73.058458
min,-320.0,-365.0
25%,-99.0,12.0
50%,-54.0,55.0
75%,-29.0,131.0
max,167.0,444.0


code_module                7
code_presentation          4
id_student             27295
final_result               4
date_registration        320
date_unregistration      355
dtype: int64

final_result
Pass           14682
Withdrawn       9904
Fail            8478
Distinction     3624
Name: count, dtype: Int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36688 entries, 0 to 36687
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code_module          36688 non-null  string 
 1   code_presentation    36688 non-null  string 
 2   id_student           36688 non-null  string 
 3   final_result         36688 non-null  string 
 4   date_registration    36666 non-null  float64
 5   date_unregistration  11592 non-null  float64
dtypes: float64(2), string(4)
memory usage: 1.7 MB


* I removed students that Withdrew before 11 days before the presentation starts because they whould not have participated in the course.
* It looks like out of the 27,295 students there were 36,388 registrations which means some students where registered more than once. Of all the registrations there where 11,592 unregistrations.
* 


In [6]:
bf_course_df = student_df.loc[student_df['date_unregistration'] <= -11].copy()
display(bf_course_df)
display(bf_course_df.describe())
#display(student_df.loc[student_df['date_registration'].isnull()])
bf_course_df.info()

Unnamed: 0,code_module,code_presentation,id_student,final_result,date_registration,date_unregistration
716,AAA,2014J,2318055,Pass,-56.0,-19.0
987,BBB,2013B,335910,Fail,-184.0,-43.0
1816,BBB,2013B,543356,Pass,-52.0,-23.0
2812,BBB,2013J,393327,Pass,-128.0,-109.0
3271,BBB,2013J,546941,Distinction,-53.0,-30.0
...,...,...,...,...,...,...
35251,GGG,2014B,542562,Pass,-225.0,-212.0
35253,GGG,2014B,542562,Pass,-85.0,-73.0
35258,GGG,2014B,548578,Fail,-89.0,-87.0
35639,GGG,2014B,625903,Pass,-78.0,-18.0


Unnamed: 0,date_registration,date_unregistration
count,246.0,252.0
mean,-117.556911,-66.781746
std,59.090201,50.987812
min,-320.0,-365.0
25%,-148.0,-87.5
50%,-115.0,-53.0
75%,-78.0,-26.0
max,-17.0,-11.0


<class 'pandas.core.frame.DataFrame'>
Index: 252 entries, 716 to 35732
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code_module          252 non-null    string 
 1   code_presentation    252 non-null    string 
 2   id_student           252 non-null    string 
 3   final_result         252 non-null    string 
 4   date_registration    246 non-null    float64
 5   date_unregistration  252 non-null    float64
dtypes: float64(2), string(4)
memory usage: 13.8 KB


## Student Interaction with course material

In [7]:
## need to add the Vle connected to Student Vle
vle_ds = engine.execute(text("""
SELECT cast(vle.id_site as text), vle.code_module, vle.code_presentation, vle.activity_type, vle.week_from, vle.week_to, cast(sVle.id_student as text), sVle.date, SUM(sVLe.sum_click) as sum_click
FROM vle
LEFT JOIN studentVle AS sVle ON vle.id_site = sVle.id_site AND vle.code_presentation = sVle.code_presentation AND vle.code_module = sVle.code_module
GROUP BY vle.code_module, vle.code_presentation, sVle.id_student
""")).fetchall()
vle_df = pd.DataFrame(vle_ds, columns=['id_site', 'code_module', 'code_presentation', 'activity_type', 'week_from', 'week_to', 'id_student', 'date', 'sum_click'])
vle_df = vle_df.astype({'id_site':'string', 'code_module':'string', 'code_presentation':'string', 'activity_type':'string', 'id_student':'string'})

In [8]:
display(vle_df)
display(vle_df.nunique())
vle_df.info()

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to,id_student,date,sum_click
0,546897,AAA,2013J,url,,,,,
1,546614,AAA,2013J,homepage,,,11391,-5.0,934.0
2,546614,AAA,2013J,homepage,,,28400,-10.0,1435.0
3,546614,AAA,2013J,homepage,,,30268,-10.0,281.0
4,546614,AAA,2013J,homepage,,,31604,-10.0,2158.0
...,...,...,...,...,...,...,...,...,...
29240,897051,GGG,2014J,resource,2.0,2.0,2640965,17.0,41.0
29241,896962,GGG,2014J,oucontent,,,2645731,110.0,893.0
29242,896956,GGG,2014J,quiz,,,2648187,149.0,312.0
29243,897051,GGG,2014J,resource,2.0,2.0,2679821,-6.0,275.0


id_site                623
code_module              7
code_presentation        4
activity_type           15
week_from               24
week_to                 24
id_student           26074
date                   281
sum_click             5380
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29245 entries, 0 to 29244
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id_site            29245 non-null  string 
 1   code_module        29245 non-null  string 
 2   code_presentation  29245 non-null  string 
 3   activity_type      29245 non-null  string 
 4   week_from          4705 non-null   float64
 5   week_to            4705 non-null   float64
 6   id_student         29228 non-null  string 
 7   date               29228 non-null  float64
 8   sum_click          29228 non-null  float64
dtypes: float64(4), string(5)
memory usage: 2.0 MB


## Student Assessment

In [9]:
ass_ds = engine.execute(text("""
SELECT cast(sAss.id_student as text), ass.code_module, ass.code_presentation, cast(sAss.id_assessment as text), sAss.date_submitted, ass.date
FROM studentAssessment as sAss
LEFT JOIN assessments as ass ON sAss.id_assessment = ass.id_assessment
""")).fetchall()

assessment_df = pd.DataFrame(ass_ds, columns=['id_student', 'code_module', 'code_presentation', 'id_assessent', 'date_submitted', 'date'])
assessment_df = assessment_df.astype({'id_student':'string', 'code_module':'string', 'code_presentation':'string', 'id_assessent':'string'})
display(assessment_df.describe())
assessment_df.info()

Unnamed: 0,date_submitted,date
count,173912.0,171047.0
mean,116.032942,130.605623
std,71.484148,78.025175
min,-11.0,12.0
25%,51.0,54.0
50%,116.0,129.0
75%,173.0,214.0
max,608.0,261.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id_student         173912 non-null  string 
 1   code_module        173912 non-null  string 
 2   code_presentation  173912 non-null  string 
 3   id_assessent       173912 non-null  string 
 4   date_submitted     173912 non-null  int64  
 5   date               171047 non-null  float64
dtypes: float64(1), int64(1), string(4)
memory usage: 8.0 MB


We are looking at the students that haven't haded in any assessments.

* We don't want them to skew our data as they never participated.
* There could be reasons but for the momst part the students either withdrew or failed.
* over 75% had unregistered by the 27th day of the presetation.
* It could only be assumed that the 2 that passed had prior learning or some other reason

In [10]:
students_assessed = list(assessment_df['id_student'].unique())
all_students = list(student_df['id_student'].unique())
display(f"Number of students assessed/total students: {len(list(students_assessed))}/{len(list(all_students))}")

student_not_assessed_df = student_df[~student_df['id_student'].isin(students_assessed)].copy()
display(student_not_assessed_df.nunique())
display(student_not_assessed_df['final_result'].value_counts())
display(student_not_assessed_df[['date_registration', 'date_unregistration']].describe())


'Number of students assessed/total students: 23369/27295'

code_module               7
code_presentation         4
id_student             3948
final_result              3
date_registration       227
date_unregistration     189
dtype: int64

final_result
Withdrawn    3480
Fail         1334
Pass            2
Name: count, dtype: Int64

Unnamed: 0,date_registration,date_unregistration
count,4801.0,3535.0
mean,-70.581546,22.876945
std,48.268486,42.463059
min,-309.0,-232.0
25%,-104.0,0.0
50%,-58.0,12.0
75%,-30.0,27.0
max,37.0,240.0


## The Students that have been assessed.

In [11]:
students_assessed_df = student_df[student_df['id_student'].isin(students_assessed)]
display(students_assessed_df.nunique())
display(students_assessed_df['final_result'].value_counts())
display(students_assessed_df[['date_registration', 'date_unregistration']].describe())

code_module                7
code_presentation          4
id_student             23347
final_result               4
date_registration        317
date_unregistration      354
dtype: int64

final_result
Pass           14680
Fail            7144
Withdrawn       6424
Distinction     3624
Name: count, dtype: Int64

Unnamed: 0,date_registration,date_unregistration
count,31865.0,8057.0
mean,-68.181861,95.841256
std,49.308932,72.560183
min,-320.0,-365.0
25%,-99.0,38.0
50%,-53.0,95.0
75%,-29.0,154.0
max,167.0,444.0


## The Students that have not been assessed but interacted.

In [12]:

student_not_assessed_interacted_df = vle_df[~vle_df['id_student'].isin(students_assessed)].copy()
display(student_df[student_df['id_student'].isin(list(student_not_assessed_interacted_df['id_student']))].nunique())
display(student_df[student_df['id_student'].isin(list(student_not_assessed_interacted_df['id_student']))]['final_result'].value_counts())

code_module               7
code_presentation         4
id_student             2634
final_result              3
date_registration       205
date_unregistration     183
dtype: int64

final_result
Withdrawn    2337
Fail          998
Pass            1
Name: count, dtype: Int64

## The Students that have been assessed but not interacted.

In [13]:
student_assessed_not_interacted_df = student_df[(student_df['id_student'].isin(students_assessed) & ~student_df['id_student'].isin(list(vle_df['id_student'].unique())))].copy()

display(student_assessed_not_interacted_df)
display(student_assessed_not_interacted_df.nunique())
display(student_df[student_df['id_student'].isin(list(student_assessed_not_interacted_df['id_student']))]['final_result'].value_counts())

Unnamed: 0,code_module,code_presentation,id_student,final_result,date_registration,date_unregistration
1493,BBB,2013B,517853,Fail,-50.0,
2752,BBB,2013J,355591,Withdrawn,-176.0,111.0
2850,BBB,2013J,415698,Withdrawn,-87.0,76.0
2892,BBB,2013J,440751,Withdrawn,-127.0,-8.0
2948,BBB,2013J,481448,Fail,-30.0,
3268,BBB,2013J,546195,Fail,-10.0,
3540,BBB,2013J,574810,Withdrawn,-29.0,48.0
4562,BBB,2013J,2346025,Fail,-71.0,
4640,BBB,2013J,2650236,Fail,-80.0,
4654,BBB,2014B,38941,Fail,-24.0,


code_module             4
code_presentation       4
id_student             23
final_result            3
date_registration      22
date_unregistration     6
dtype: int64

final_result
Fail         15
Withdrawn     6
Pass          2
Name: count, dtype: Int64

In [14]:
ou_df = pd.merge(pd.merge(student_df,assessment_df,how='left'), vle_df, how='left')
display(ou_df)
ou_df.info()

Unnamed: 0,code_module,code_presentation,id_student,final_result,date_registration,date_unregistration,id_assessent,date_submitted,date,id_site,activity_type,week_from,week_to,sum_click
0,AAA,2013J,11391,Pass,-159.0,,1752,18.0,19.0,,,,,
1,AAA,2013J,11391,Pass,-159.0,,1753,53.0,54.0,,,,,
2,AAA,2013J,11391,Pass,-159.0,,1754,115.0,117.0,,,,,
3,AAA,2013J,11391,Pass,-159.0,,1755,164.0,166.0,,,,,
4,AAA,2013J,11391,Pass,-159.0,,1756,212.0,215.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209570,GGG,2014J,2684003,Distinction,-28.0,,37437,169.0,173.0,,,,,
209571,GGG,2014J,2684003,Distinction,-28.0,,37438,73.0,229.0,,,,,
209572,GGG,2014J,2684003,Distinction,-28.0,,37439,150.0,229.0,,,,,
209573,GGG,2014J,2684003,Distinction,-28.0,,37440,172.0,229.0,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209575 entries, 0 to 209574
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   code_module          209575 non-null  string 
 1   code_presentation    209575 non-null  string 
 2   id_student           209575 non-null  string 
 3   final_result         209575 non-null  string 
 4   date_registration    209524 non-null  float64
 5   date_unregistration  29902 non-null   float64
 6   id_assessent         203702 non-null  string 
 7   date_submitted       203702 non-null  float64
 8   date                 199684 non-null  float64
 9   id_site              1923 non-null    string 
 10  activity_type        1923 non-null    string 
 11  week_from            400 non-null     float64
 12  week_to              400 non-null     float64
 13  sum_click            1923 non-null    float64
dtypes: float64(7), string(7)
memory usage: 22.4 MB


In [15]:
## Defining the categories:
categories = pd.Series(['Very Early', 'Early', 'Late'])

## Applying these categories to the final dataframe:
ou_df['registration'] = pd.cut(ou_df.date_registration, bins = [-322,-160,0,160], labels=categories)
ou_df.head()

Unnamed: 0,code_module,code_presentation,id_student,final_result,date_registration,date_unregistration,id_assessent,date_submitted,date,id_site,activity_type,week_from,week_to,sum_click,registration
0,AAA,2013J,11391,Pass,-159.0,,1752,18.0,19.0,,,,,,Early
1,AAA,2013J,11391,Pass,-159.0,,1753,53.0,54.0,,,,,,Early
2,AAA,2013J,11391,Pass,-159.0,,1754,115.0,117.0,,,,,,Early
3,AAA,2013J,11391,Pass,-159.0,,1755,164.0,166.0,,,,,,Early
4,AAA,2013J,11391,Pass,-159.0,,1756,212.0,215.0,,,,,,Early


In [16]:
ou_df = ou_df.replace(['Pass', 'Withdrawn', 'Distinction', 'Fail'], ['1', '0', '1', '0'])
ou_df.head()

Unnamed: 0,code_module,code_presentation,id_student,final_result,date_registration,date_unregistration,id_assessent,date_submitted,date,id_site,activity_type,week_from,week_to,sum_click,registration
0,AAA,2013J,11391,1,-159.0,,1752,18.0,19.0,,,,,,Early
1,AAA,2013J,11391,1,-159.0,,1753,53.0,54.0,,,,,,Early
2,AAA,2013J,11391,1,-159.0,,1754,115.0,117.0,,,,,,Early
3,AAA,2013J,11391,1,-159.0,,1755,164.0,166.0,,,,,,Early
4,AAA,2013J,11391,1,-159.0,,1756,212.0,215.0,,,,,,Early


In [17]:
# find the min and max of the column
print('minimum:', ou_df['sum_click'].min(), end='\n')
print('maximum:', ou_df['sum_click'].max())

## Defining the categories label:
categories3 = pd.Series(['0-2295', '2295-4590', '4590-6885', '6885-19179', '9179-11474', '11474-13769'])

## Applying these categories both to the auxiliary and to the working datasets:
ou_df['total_clicks'] = pd.cut(ou_df.sum_click, bins = [0,2295,4590,6885,9179,11474,13769], labels=categories3)
ou_df['total_clicks'].nunique()

minimum: 3.0
maximum: 13769.0


6

In [18]:

del ou_df['date_registration']
del ou_df['id_student']
del ou_df['code_presentation']
del ou_df['sum_click']
del ou_df['week_to']
del ou_df['date']
del ou_df['date_submitted']
del ou_df['id_assessent']
del ou_df['date_unregistration']


In [19]:
ou_df["code_module"] = ou_df["code_module"].astype("str")
ou_df["final_result"] = ou_df["final_result"].astype("str")
#ou_df["date_unregistration"] = ou_df["date_unregistration"].astype("str")
#ou_df["date"] = ou_df["date"].astype("str")
ou_df["id_site"] = ou_df["id_site"].astype("str")
ou_df["activity_type"] = ou_df["activity_type"].astype("str")
ou_df["registration"] = ou_df["registration"].astype("str")
ou_df["total_clicks"] = ou_df["total_clicks"].astype("str")
display(ou_df.nunique())
display(ou_df['registration'].unique())
ou_df.head()

code_module        7
final_result       2
id_site          165
activity_type     11
week_from         16
registration       4
total_clicks       7
dtype: int64

array(['Early', 'Very Early', 'Late', 'nan'], dtype=object)

Unnamed: 0,code_module,final_result,id_site,activity_type,week_from,registration,total_clicks
0,AAA,1,,,,Early,
1,AAA,1,,,,Early,
2,AAA,1,,,,Early,
3,AAA,1,,,,Early,
4,AAA,1,,,,Early,


In [49]:

ou_df['id_site'] = ou_df['id_site'].replace({'<NA>': '546712'})
ou_df['activity_type'] = ou_df['activity_type'].replace({'<NA>': 'url'})
ou_df['week_from'] = ou_df['week_from'].fillna(1)
ou_df['activity_type'].unique()

array(['url', 'oucontent', 'resource', 'forumng', 'quiz', 'subpage',
       'homepage', 'oucollaborate', 'externalquiz', 'ouelluminate'],
      dtype=object)

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

In [40]:
# get the locations
X = ou_df.iloc[:, :]
y = ou_df.iloc[:, :]
 
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0)

In [41]:
X_train.describe(include=['O'])

Unnamed: 0,code_module,final_result,id_site,activity_type,registration,total_clicks
count,146702,146702,146702,146702,146702,146702.0
unique,7,2,145,10,4,7.0
top,FFF,1,546712,url,Early,
freq,44586,108038,145364,145510,139614,145363.0


In [42]:
combine = [X_train, X_test]

In [43]:
module_converting = {'AAA':1, 'BBB':2, 'CCC':3, 'DDD':4, 'EEE':5, 'FFF':6, 'GGG':7}
for dataset in combine:
    dataset['code_module'] = dataset['code_module'].map(module_converting)
    dataset['code_module'] = dataset['code_module'].fillna(2)

X_train.head()

Unnamed: 0,code_module,final_result,id_site,activity_type,week_from,registration,total_clicks
76088,3,1,546712,url,1.0,Early,
183804,6,0,546712,url,1.0,Early,
160319,6,1,546712,url,1.0,Early,
176109,6,0,546712,url,1.0,Early,
30016,2,1,546712,url,1.0,Early,


In [44]:
module_converting = {'0-2295':1, '2295-4590':2, '4590-6885':3, '6885-19179':4, '9179-11474':5, '11474-13769':6}
for dataset in combine:
    dataset['total_clicks'] = dataset['total_clicks'].map(module_converting)
    dataset['total_clicks'] = dataset['total_clicks'].fillna(1)

X_train.head()

Unnamed: 0,code_module,final_result,id_site,activity_type,week_from,registration,total_clicks
76088,3,1,546712,url,1.0,Early,1.0
183804,6,0,546712,url,1.0,Early,1.0
160319,6,1,546712,url,1.0,Early,1.0
176109,6,0,546712,url,1.0,Early,1.0
30016,2,1,546712,url,1.0,Early,1.0


In [50]:

module_converting = {'url':1, 'oucontent':2, 'resource':3, 'forumng':4, 'quiz':5, 'subpage':6,
       'homepage':7, 'oucollaborate':8, 'externalquiz':9, 'ouelluminate':10}
for dataset in combine:
    dataset['activity_type'] = dataset['activity_type'].map(module_converting)
    dataset['activity_type'] = dataset['activity_type'].fillna(1)

X_train.head()

Unnamed: 0,code_module,final_result,id_site,activity_type,week_from,registration,total_clicks
76088,3,1,546712,1,1.0,2.0,1.0
183804,6,0,546712,1,1.0,2.0,1.0
160319,6,1,546712,1,1.0,2.0,1.0
176109,6,0,546712,1,1.0,2.0,1.0
30016,2,1,546712,1,1.0,2.0,1.0


In [45]:
module_converting = {'Very Early':1, 'Early':2, 'Late':3}
for dataset in combine:
    dataset['registration'] = dataset['registration'].map(module_converting)
    dataset['registration'] = dataset['registration'].fillna(2)

X_train.head()

Unnamed: 0,code_module,final_result,id_site,activity_type,week_from,registration,total_clicks
76088,3,1,546712,url,1.0,2.0,1.0
183804,6,0,546712,url,1.0,2.0,1.0
160319,6,1,546712,url,1.0,2.0,1.0
176109,6,0,546712,url,1.0,2.0,1.0
30016,2,1,546712,url,1.0,2.0,1.0


In [52]:
X_train_r = X_train.drop("final_result", axis=1)
Y_train_r = X_train["final_result"]
X_test_r = X_test.drop("final_result", axis=1).copy()
X_train_r.shape, Y_train_r.shape, X_test_r.shape

((146702, 6), (146702,), (62873, 6))

In [63]:
coeff_df = pd.DataFrame(X_train.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,id_site,1.865198e-06
0,final_result,1.782264e-11
4,registration,6.756796e-12
3,week_from,3.779408e-12
5,total_clicks,3.478537e-12
2,activity_type,3.113488e-12


In [64]:
logreg = LogisticRegression()
logreg.fit(X_train_r, Y_train_r)
Y_pred = logreg.predict(X_test_r)
acc_log = round(logreg.score(X_train_r, Y_train_r) * 100, 2)
acc_log

73.64

In [65]:
svc = SVC()
svc.fit(X_train_r, Y_train_r)
Y_pred = svc.predict(X_test_r)
acc_svc = round(svc.score(X_train_r, Y_train_r) * 100, 2)
acc_svc

73.64

In [54]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_r, Y_train_r)
Y_pred = decision_tree.predict(X_test_r)
acc_decision_tree = round(decision_tree.score(X_train_r, Y_train_r) * 100, 2)
acc_decision_tree

73.86

In [55]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(146702, 7)

In [56]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_r, Y_train_r)
Y_pred = knn.predict(X_test_r)
acc_knn = round(knn.score(X_train_r, Y_train_r) * 100, 2)
acc_knn

56.71

In [57]:
gaussian = GaussianNB()
gaussian.fit(X_train_r, Y_train_r)
Y_pred = gaussian.predict(X_test_r)
acc_gaussian = round(gaussian.score(X_train_r, Y_train_r) * 100, 2)
acc_gaussian

73.65

In [58]:
linear_svc = LinearSVC()
linear_svc.fit(X_train_r, Y_train_r)
Y_pred = linear_svc.predict(X_test_r)
acc_linear_svc = round(linear_svc.score(X_train_r, Y_train_r) * 100, 2)
acc_linear_svc



73.64

In [59]:
sgd = SGDClassifier()
sgd.fit(X_train_r, Y_train_r)
Y_pred = sgd.predict(X_test_r)
acc_sgd = round(sgd.score(X_train_r, Y_train_r) * 100, 2)
acc_sgd

26.36

In [60]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_r, Y_train_r)
Y_pred = decision_tree.predict(X_test_r)
acc_decision_tree = round(decision_tree.score(X_train_r, Y_train_r) * 100, 2)
acc_decision_tree

73.86

In [61]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train_r, Y_train_r)
Y_pred = random_forest.predict(X_test_r)
random_forest.score(X_train_r, Y_train_r)
acc_random_forest = round(random_forest.score(X_train_r, Y_train_r) * 100, 2)
acc_random_forest

73.86

In [66]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,73.86
7,Decision Tree,73.86
4,Naive Bayes,73.65
0,Support Vector Machines,73.64
2,Logistic Regression,73.64
6,Linear SVC,73.64
1,KNN,56.71
5,Stochastic Gradient Decent,26.36
