In [190]:
# Libraries

# EDA 
import numpy as np
import  pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# -Pre-Processing 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# -Model Selection
from sklearn.model_selection import train_test_split


# -Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Model validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve 

#Hyper-parameter Hyper-parameter
from sklearn.model_selection import GridSearchCV

# -Metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

%matplotlib inline

In [191]:

#Import of very relevant data 

course_ratings = pd.read_csv('365_database/365_course_ratings.csv')

student_info = pd.read_csv('365_database/365_student_info.csv')

student_learning = pd.read_csv('365_database/365_student_learning.csv')

student_purchases = pd.read_csv('365_database/365_student_purchases.csv')

# Less relevant data

#course_info = pd.read_csv('365_database/365_course_info.csv')

#exam_info = pd.read_csv('365_database/365_exam_info.csv')

#quiz_info = pd.read_csv('365_database/365_quiz_info.csv')

#student_engagement = pd.read_csv('365_database/365_student_engagement.csv')

#student_exams = pd.read_csv('365_database/365_student_exams.csv')

#student_hub_questions = pd.read_csv('365_database/365_student_hub_questions.csv')

#student_quizzes =pd.read_csv('365_database/365_student_quizzes.csv')


# Work process :

<ol>
    <li>
        Dataset preparation and preprocessing
        <ul>
            <li>Data collection</li>
            <li>Data visualization</li>
            <li>Data selection</li>
            <li>Dataset splitting</li>
            <li>Data Cleaning</li>
            <li>Data formatting</li>
            <li>Data Scaling</li>
        </ul>
    </li>
    <li>Modeling and Validation</li>
    <li>Evaluating the model using  test set</li>
</ol>

# Dataset preparation and preprocessing

## Data Collection

In [192]:
# A glance at the various data

In [193]:
course_ratings.head()

Unnamed: 0,course_id,student_id,course_rating,date_rated
0,14,258956,5,2022-07-06
1,7,259019,5,2022-08-30
2,23,259019,4,2022-08-30
3,14,259283,5,2022-07-08
4,30,259283,5,2022-07-02


In [194]:
student_info.head()

Unnamed: 0,student_id,student_country,date_registered
0,258798,IN,2022-01-01
1,258799,CO,2022-01-01
2,258800,CA,2022-01-01
3,258801,IN,2022-01-01
4,258802,US,2022-01-01


In [195]:
student_learning.head()

Unnamed: 0,student_id,course_id,minutes_watched,date_watched
0,258798,23,0.3,2022-01-01
1,258800,2,12.9,2022-04-01
2,258800,7,46.8,2022-01-03
3,258800,7,4.3,2022-01-04
4,258800,7,31.7,2022-01-07


In [196]:
student_purchases.head()

Unnamed: 0,purchase_id,student_id,purchase_type,date_purchased
0,15781,258800,Annual,2022-01-01
1,15786,258803,Annual,2022-01-01
2,15808,258862,Annual,2022-01-01
3,15809,258865,Annual,2022-01-01
4,15811,258878,Annual,2022-01-01


In [197]:
# Merge differents data to build a data set 

In [279]:
df = pd.merge(student_learning, student_purchases, how='left', on='student_id').merge(student_info, how='left', on='student_id').merge(course_ratings, how='left', on=['student_id', 'course_id'])

In [199]:
df.head()

Unnamed: 0,student_id,course_id,minutes_watched,date_watched,purchase_id,purchase_type,date_purchased,student_country,date_registered,course_rating,date_rated
0,258798,23,0.3,2022-01-01,,,,IN,2022-01-01,,
1,258800,2,12.9,2022-04-01,15781.0,Annual,2022-01-01,CA,2022-01-01,,
2,258800,7,46.8,2022-01-03,15781.0,Annual,2022-01-01,CA,2022-01-01,,
3,258800,7,4.3,2022-01-04,15781.0,Annual,2022-01-01,CA,2022-01-01,,
4,258800,7,31.7,2022-01-07,15781.0,Annual,2022-01-01,CA,2022-01-01,,


In [280]:
#Converte date_registered, date_watched,date_purchased to date time
cols = ['date_registered', 'date_watched', 'date_purchased']
for i in cols:
    df[i] = pd.to_datetime(df[i])

In [281]:
df['purchase'] = df['purchase_type']

df.drop('purchase_type', axis=1, inplace=True)

dic = {'Annual':1, 'Monthly':1, 'Quarterly': 1, np.nan: 0}

df['purchase'] = df['purchase'].map(dic)

df['purchase'] = df['purchase'].astype('int')

In [282]:
df.head(10)

Unnamed: 0,student_id,course_id,minutes_watched,date_watched,purchase_id,date_purchased,student_country,date_registered,course_rating,date_rated,purchase
0,258798,23,0.3,2022-01-01,,NaT,IN,2022-01-01,,,0
1,258800,2,12.9,2022-04-01,15781.0,2022-01-01,CA,2022-01-01,,,1
2,258800,7,46.8,2022-01-03,15781.0,2022-01-01,CA,2022-01-01,,,1
3,258800,7,4.3,2022-01-04,15781.0,2022-01-01,CA,2022-01-01,,,1
4,258800,7,31.7,2022-01-07,15781.0,2022-01-01,CA,2022-01-01,,,1
5,258800,7,61.6,2022-01-05,15781.0,2022-01-01,CA,2022-01-01,,,1
6,258800,7,45.1,2022-01-06,15781.0,2022-01-01,CA,2022-01-01,,,1
7,258800,11,0.5,2022-02-21,15781.0,2022-01-01,CA,2022-01-01,,,1
8,258800,11,1.1,2022-02-22,15781.0,2022-01-01,CA,2022-01-01,,,1
9,258800,14,4.0,2022-01-28,15781.0,2022-01-01,CA,2022-01-01,,,1


In [203]:
df.shape

(87633, 11)

In [204]:
# the dataset has 87633 sample, not bad 😋 and 12 columns

In [205]:
df.isnull().sum()

student_id             0
course_id              0
minutes_watched        0
date_watched           0
purchase_id        27204
date_purchased     27204
student_country       44
date_registered        0
course_rating      71054
date_rated         71054
purchase               0
dtype: int64

In [206]:
# A lot of missing values, this is normal due to the merge 😐

In [207]:
df.dtypes

student_id                  int64
course_id                   int64
minutes_watched           float64
date_watched       datetime64[ns]
purchase_id               float64
date_purchased     datetime64[ns]
student_country            object
date_registered    datetime64[ns]
course_rating             float64
date_rated                 object
purchase                    int32
dtype: object

##### Remarks

In [208]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
student_id,87633.0,272453.644597,9934.82175,258798.0,263690.0,270486.0,280019.0,295513.0
course_id,87633.0,19.587941,12.274264,2.0,7.0,16.0,24.0,67.0
minutes_watched,87633.0,31.387779,41.174484,0.0,5.4,18.9,42.2,1710.6
purchase_id,60429.0,19469.821112,2054.504091,15781.0,17677.0,19504.0,21243.0,23152.0
course_rating,16579.0,4.795766,0.520219,1.0,5.0,5.0,5.0,5.0
purchase,87633.0,0.689569,0.462673,0.0,0.0,1.0,1.0,1.0


##### Remarks

- Minutes watched 
    *50% of students watched videos of more than 10 min
- Rating Course
    *Most of the students give, This is normal, since the quality of teaching is excellent

## Data Visualisation

In [276]:
df['student_country'].value_counts()

pandas.core.series.Series

## Data selection

In [286]:
data = df.copy()
data = df.drop(['purchase_id', 'student_id', 'date_purchased'], axis=1)

In [287]:
data.head()

Unnamed: 0,course_id,minutes_watched,date_watched,student_country,date_registered,course_rating,date_rated,purchase
0,23,0.3,2022-01-01,IN,2022-01-01,,,0
1,2,12.9,2022-04-01,CA,2022-01-01,,,1
2,7,46.8,2022-01-03,CA,2022-01-01,,,1
3,7,4.3,2022-01-04,CA,2022-01-01,,,1
4,7,31.7,2022-01-07,CA,2022-01-01,,,1


In [288]:
data.isna().sum()

course_id              0
minutes_watched        0
date_watched           0
student_country       44
date_registered        0
course_rating      71054
date_rated         71054
purchase               0
dtype: int64

## Data split

In [291]:
X = data.drop('purchase', axis=1)
y = data[['purchase']]

In [293]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [294]:
#check distributions

print(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True))

purchase
1           0.68957
0           0.31043
dtype: float64 purchase
1           0.689565
0           0.310435
dtype: float64


## Cleaning 

1 . Inpute Missing Values
- Inpute student_country by most frequent
- Inputer date_rated by most il same in date_registered
- Inpute course_rating by most freqent 

In [290]:
#

inputer_i = SimpleImputer(strategy='most_frequent')
cols_freq = ['course_rating', 'student_country']

#

data['course_rating'].fillna(data['date_registered'])

In [289]:
for col in

0        2022-01-01 00:00:00
1        2022-01-01 00:00:00
2        2022-01-01 00:00:00
3        2022-01-01 00:00:00
4        2022-01-01 00:00:00
                ...         
87628    2022-10-20 00:00:00
87629    2022-10-20 00:00:00
87630    2022-10-20 00:00:00
87631    2022-10-20 00:00:00
87632    2022-10-20 00:00:00
Name: course_rating, Length: 87633, dtype: object

In [None]:
inputer.fit_transform(data[])