In [1]:
import pandas as pd
import math
import logging
import numpy as np

In [2]:
# Relative path of the folder the raw input CSV files are read from.
FOLDER_DATA_RAW = "../data_raw/"
# Relative path of the folder processed artefacts are read from and written to.
FOLDER_DATA_PROCESSED = "../data_processed/"

## Data Processing Functions

This section contains utility functions for processing and cleaning user interaction data, including timestamp conversion, session duration calculation, outlier filtering, and feature engineering for course recommendation tasks.

#### Timestamp Processing

Convert the `timestamp` column to datetime, print the rows whose timestamp is missing or cannot be parsed, and drop those rows.

In [3]:
def timestamp_tranform(df):
    """Parse the ``timestamp`` column and drop the rows that have no valid value.

    Values that cannot be parsed are coerced to ``NaT``. The affected rows are
    printed for inspection and then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a datetime ``timestamp`` column and
        without the rows whose timestamp was missing or invalid.
    """
    # errors='coerce' turns unparseable values into NaT instead of raising.
    parsed_timestamps = pd.to_datetime(df['timestamp'], errors='coerce')
    df['timestamp'] = parsed_timestamps

    # Show which rows are about to be discarded.
    rows_without_timestamp = df[df['timestamp'].isnull()]
    print("========= Null timestamp ==============")
    print(rows_without_timestamp)
    print("=======================================")

    df.dropna(subset=['timestamp'], inplace=True)
    return df

#### Calculate Session Duration

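Functions to compute, for each log entry, the time in minutes elapsed since the previous log entry of the same session. When no duration can be derived (for example, the first entry of a session), the value is marked with the sentinel `-5` and later replaced by the lesson's average duration.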

In [4]:
def get_duration(x, user_data):
    """Return the minutes elapsed between a log row and the previous row.

    The previous row is looked up by position (``x.name - 1``), so
    ``user_data`` must be sorted as intended and have a default 0..n-1 index.

    Parameters
    ----------
    x : pandas.Series
        One row of ``user_data``; reads ``id_session``, ``timestamp`` and
        (for logging only) ``lesson_name``.
    user_data : pandas.DataFrame
        The full frame the row belongs to.

    Returns
    -------
    int
        Whole minutes (truncated) since the previous row when both rows belong
        to the same session. ``-5`` is returned as the "duration unknown"
        sentinel for the first row, for a session change, or when the value
        cannot be computed (e.g. a missing timestamp).
    """
    try:
        position = x.name
        # The first row has no predecessor; out-of-range positions are rejected too.
        if position == 0 or position >= user_data.shape[0]:
            return -5

        previous_log = user_data.iloc[position - 1]
        # A gap that spans two sessions is not time spent on a lesson.
        if previous_log["id_session"] != x["id_session"]:
            return -5

        elapsed = pd.Timedelta(x["timestamp"] - previous_log["timestamp"])
        return int(elapsed.total_seconds() / 60)
    except Exception:
        # Best effort: log and return the sentinel so that the caller can fall
        # back to the average duration. Previously a bare ``except`` swallowed
        # every exception and implicitly returned None.
        lesson = x.get("lesson_name") if hasattr(x, "get") else None
        logging.debug('error get duration lesson: {}'.format(lesson), exc_info=True)
        return -5




In [5]:
def replace_negative_duration(x, dataframe_lessons_duration):
    """Replace the ``-5`` sentinel duration with the lesson's reference duration.

    Parameters
    ----------
    x : pandas.Series
        One log row; reads ``duration`` and ``lesson_name``.
    dataframe_lessons_duration : pandas.DataFrame
        Lookup table with ``lesson_name`` and ``duration`` columns.

    Returns
    -------
    number or None
        ``x.duration`` unchanged when it is not ``-5``. Otherwise the
        ``duration`` of the first lookup row with the same ``lesson_name``.
        ``None`` is returned (and a debug record is logged) when the lesson is
        not in the lookup table or the row cannot be processed.
    """
    try:
        if x.duration != -5:
            return x.duration

        same_lesson = dataframe_lessons_duration["lesson_name"] == x["lesson_name"]
        candidates = dataframe_lessons_duration.loc[same_lesson, "duration"]
        if candidates.empty:
            # Lesson unknown to the lookup table: handled explicitly instead
            # of through an IndexError caught by a bare ``except``.
            logging.debug('error replace negative duration: {}'.format(x["lesson_name"]))
            return None
        return candidates.iloc[0]
    except Exception:
        # Best effort is kept, but KeyboardInterrupt/SystemExit are no longer
        # swallowed, and the handler itself cannot raise.
        lesson = x.get("lesson_name") if hasattr(x, "get") else None
        logging.debug('error replace negative duration: {}'.format(lesson), exc_info=True)
        return None

In [6]:
def process_duration(user_data):
    """Add per-row and aggregated duration columns to the user log.

    Steps:
      1. Load ``courses_avg_duration.csv`` from ``FOLDER_DATA_RAW`` and reduce
         it to one mean ``duration`` per ``lesson_name``, divided by 60
         (presumably seconds -> minutes; confirm against the data source).
      2. Sort the log by ``id_session`` and ``timestamp`` and compute each
         row's ``duration`` with ``get_duration`` (``-5`` when unknown).
      3. Replace the ``-5`` sentinel with the per-lesson mean duration using
         ``replace_negative_duration``.
      4. Add ``duration_intra`` (sum of ``duration`` per session and lesson)
         and ``duration_inter`` (sum of ``duration`` per user and lesson).

    Summary statistics are printed after each step.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Log with ``user_id``, ``id_session``, ``lesson_name`` and
        ``timestamp`` columns.

    Returns
    -------
    pandas.DataFrame
        A sorted, re-indexed copy of the log with the ``duration``,
        ``duration_intra`` and ``duration_inter`` columns added.
    """
    ## Load data set of lesson duration
    courses_avg_duration_df = pd.read_csv(f"{FOLDER_DATA_RAW}courses_avg_duration.csv", encoding_errors='ignore')
    courses_avg_duration_df["duration"] = courses_avg_duration_df.groupby("lesson_name")["duration"].transform('mean')
    courses_avg_duration_df["duration"] = courses_avg_duration_df["duration"] / 60
    # After the transform every row of a lesson carries the same mean, so one row per lesson is enough.
    courses_avg_duration_df = courses_avg_duration_df.drop_duplicates(subset=["lesson_name"])
    print("=== describe mean general lesson duration =====")
    print(courses_avg_duration_df["duration"].describe())
    print("=======================================")

    ## Apply functions to get duration (return -5 if can't get duration)
    # get_duration looks up the previous row by position, hence the sort and the index reset.
    user_data = user_data.sort_values(by=["id_session", "timestamp"]).reset_index(drop=True)
    user_data["duration"] = user_data.apply(lambda x: get_duration(x, user_data), axis=1)

    print("== describe mean lesson duration ==")
    print(user_data[user_data["duration"] >= 0]["duration"].describe())
    print("===========================================")

    user_data["duration"] = user_data.apply(lambda x: replace_negative_duration(x, courses_avg_duration_df), axis=1)

    print("== describe mean lesson duration (without -5) ==")
    print(user_data["duration"].describe())
    print("=======================================")

    user_data["duration_intra"] = user_data.groupby(["id_session", "lesson_name"])["duration"].transform('sum')

    print("== describe intra-lesson duration ==")
    # Fixed: this section used to describe "duration" again instead of the new column.
    print(user_data["duration_intra"].describe())
    print("=======================================")

    user_data["duration_inter"] = user_data.groupby(["user_id", "lesson_name"])["duration"].transform('sum')

    print("== describe inter-lesson duration ==")
    print(user_data["duration_inter"].describe())
    print("=======================================")

    return user_data


#### Filter Data (Remove Outliers)

Remove users who viewed too few or too many courses to reduce noise and outliers in the dataset: only users with more than 2 and fewer than 50 distinct courses are kept.

In [7]:
def filter_users(df, min_courses=2, max_courses=50):
    """Keep only the rows of users with a reasonable number of distinct courses.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``user_id`` and ``course_name`` columns. A
        ``courses_viewed`` column (distinct courses per user) is added to it
        in place.
    min_courses : int, default 2
        Exclusive lower bound: users must have viewed more than this many
        distinct courses.
    max_courses : int, default 50
        Exclusive upper bound: users must have viewed fewer than this many
        distinct courses.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose user satisfies
        ``min_courses < courses_viewed < max_courses``.
    """
    df["courses_viewed"] = df.groupby("user_id")["course_name"].transform('nunique')
    courses_viewed = df["courses_viewed"]
    within_bounds = (courses_viewed > min_courses) & (courses_viewed < max_courses)
    return df[within_bounds]

#### Add Total Lessons per Course

Merge the dataset with course information to include the total number of lessons for each course.

In [8]:
def add_lessons_by_course(df):
    """Attach the total number of lessons of each course to the log rows.

    Reads ``lessons_by_course.csv`` from ``FOLDER_DATA_RAW`` (columns
    ``path_course`` and ``lessons_number`` are used), prints summary statistics
    of ``lessons_number`` and left-joins it onto ``df`` via
    ``course_name == path_course``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``course_name`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a ``lessons_number`` column added. Rows whose course has no
        entry in the CSV (``lessons_number`` missing) are dropped.
    """
    courses_info = pd.read_csv(f"{FOLDER_DATA_RAW}lessons_by_course.csv", sep=",")
    # Fixed label: the statistics below describe lessons per course, not a duration.
    print("==== Lessons number by course ==========")
    print(courses_info["lessons_number"].describe())
    print("=======================================")
    merged = pd.merge(df, courses_info, how='left', left_on='course_name', right_on='path_course')
    # path_course only duplicates course_name after the join.
    final_data_set = merged.drop(columns=['path_course'])
    final_data_set = final_data_set.dropna(subset=['lessons_number'])
    return final_data_set

#### Calculate Course Data per User

Compute features such as the number of lessons viewed and course completion percentage for each user.

In [9]:
def get_data_courses_by_user(df):
    """Add course-progress features for every (user, course) pair.

    Columns added to ``df`` in place:

    * ``lessons_viewed`` - distinct lessons the user viewed in the course.
    * ``course_porcentage`` - ``lessons_viewed / lessons_number``.
    * ``course_porcentage_mean`` - the user's mean ``course_porcentage`` over
      their courses (each course counted once).
    * ``course_porcentage_mean_global`` - the course's mean
      ``course_porcentage`` over its users (each user counted once).

    The two mean columns are computed on the frame de-duplicated by
    (``user_id``, ``course_name``) and assigned back by index, so only the
    first row of each pair receives a value; the remaining rows are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``user_id``, ``course_name``, ``lesson_name`` and
        ``lessons_number`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above added.
    """
    user_course = ["user_id", "course_name"]

    # Fixed: the distinct values counted must be lessons. Counting course_name
    # inside a group keyed by course_name always returned 1.
    df["lessons_viewed"] = df.groupby(user_course)["lesson_name"].transform('nunique')

    # NOTE(review): a course with lessons_number == 0 yields inf here -
    # confirm such courses are excluded upstream.
    df["course_porcentage"] = df["lessons_viewed"] / df["lessons_number"]

    # One row per (user, course) so that each course/user weighs once in the means.
    one_row_per_pair = df.drop_duplicates(subset=user_course)
    df["course_porcentage_mean"] = one_row_per_pair.groupby("user_id")["course_porcentage"].transform('mean')
    df["course_porcentage_mean_global"] = one_row_per_pair.groupby("course_name")["course_porcentage"].transform('mean')
    return df

#### Split Data into Train and Test Sets

Functions to split the processed data into training and testing sets for model evaluation.

In [10]:
#Number the courses for a specific user
def create_index(df, user_id):
    """Return the rows of one user, numbered 1..n in their current order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user_id`` column.
    user_id :
        Value of ``user_id`` to select.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only that user's rows, with a fresh 0..n-1 index
        and a trailing ``position`` column counting from 1.
    """
    # drop=True avoids materialising helper columns. The former double
    # reset_index() failed when the frame already had an "index"/"level_0"
    # column or a named index.
    df_courses_index = df.loc[df['user_id'] == user_id].reset_index(drop=True)
    df_courses_index["position"] = range(1, len(df_courses_index) + 1)

    return df_courses_index

In [11]:
#This function classifies between train and test
def clasificator(index , courses_viewed):
    """Label a 1-based course position as ``"train"`` or ``"test"``.

    The first ``floor(courses_viewed * 0.7)`` positions are training data;
    every later position is test data.
    """
    last_train_position = math.floor(courses_viewed * 0.7)
    if index > last_train_position:
        return "test"
    return "train"

In [12]:
def split_data_sets(df):
    """Split every user's rows into a training part and a test part.

    For each user, rows are numbered in their current order with
    ``create_index`` and labelled with ``clasificator`` from their
    ``position`` and ``courses_viewed`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``user_id`` and ``courses_viewed`` columns (one row per
        user/course pair is expected by the caller).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``; both carry the extra ``position`` and
        ``clasification`` columns. Each user's rows keep their per-user
        0..n-1 index. For an empty ``df`` two empty frames with those columns
        are returned.
    """
    # pd.concat raises on an empty list, so handle the no-rows case up front.
    if df.empty:
        empty = df.iloc[0:0].copy()
        empty["position"] = pd.Series(dtype="int64")
        empty["clasification"] = pd.Series(dtype="object")
        return empty, empty.copy()

    frames_train = []
    frames_test = []

    # Iterate through each user, classifying rows as train or test
    for user_id in df["user_id"].unique():
        data_user = create_index(df, user_id)
        # Plain iteration over the two needed columns instead of a row-wise apply.
        data_user["clasification"] = [
            clasificator(position, courses_viewed)
            for position, courses_viewed in zip(data_user["position"], data_user["courses_viewed"])
        ]

        frames_train.append(data_user[data_user["clasification"] == "train"])
        frames_test.append(data_user[data_user["clasification"] == "test"])

    df_train = pd.concat(frames_train)
    df_test = pd.concat(frames_test)

    return df_train, df_test


## Data Processing Pipeline

This section executes the full data processing pipeline, including loading, cleaning, feature engineering, and splitting the data for further analysis or modeling.

In [13]:
# Write log records of level DEBUG and above to data_processing.log. The
# duration helpers report per-row failures through logging.debug, so this
# file is where those failures end up.
logging.basicConfig(filename="data_processing.log",
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)

In [14]:
# Load only the columns the pipeline needs from the raw user log.
user_data = pd.read_csv(f"{FOLDER_DATA_RAW}raw_information_user.csv", sep=",", usecols=["user_id", "course_name","id_session" ,"lesson_name","timestamp"],low_memory=False)
# Parse timestamps and drop the rows that have no valid one.
user_data = timestamp_tranform(user_data)
# Add duration, duration_intra and duration_inter.
user_data = process_duration(user_data)
# Rows without a lesson name are removed only after the durations were computed.
user_data.dropna(subset=['lesson_name'],inplace=True)

                    user_id  id_session course_name lesson_name timestamp
544911  10174768.1627523178  1627523178          nv          tc       NaT
544912  10174768.1627523178  1627523178          nv          tc       NaT
=== describe mean general lesson duration =====
count    2527.000000
mean        4.266185
std         6.356358
min         0.004167
25%         1.078199
50%         2.513435
75%         5.495256
max       133.765785
Name: duration, dtype: float64
== describe mean lesson duration ==
count    397003.000000
mean          2.781679
std           5.408902
min           0.000000
25%           0.000000
50%           0.000000
75%           3.000000
max          97.000000
Name: duration, dtype: float64
== describe mean lesson duration (without -5) ==
count    608271.000000
mean          4.389959
std           6.358481
min           0.000000
25%           0.000000
50%           1.837006
75%           6.892560
max         133.765785
Name: duration, dtype: float64
== describe intr

In [15]:
# Extra step: keep only the courses listed in listcourses.npy, i.e. those
# included in the CB system (presumably the content-based recommender - confirm).
# This makes the dataset smaller than the original size reported in the paper.
course_list = np.load(f"{FOLDER_DATA_PROCESSED}listcourses.npy")
user_data = user_data[user_data["course_name"].isin(course_list)]

In [16]:
# Attach lessons_number to every row (rows of courses without it are dropped).
user_data = add_lessons_by_course(user_data)
# Needs lessons_number from the previous step to compute course_porcentage.
user_data = get_data_courses_by_user(user_data)
# Adds courses_viewed and removes users outside the accepted range.
user_data = filter_users(user_data)

count    291.000000
mean      11.563574
std        9.782739
min        0.000000
25%        4.000000
50%        8.000000
75%       18.000000
max       39.000000
Name: lessons_number, dtype: float64


In [17]:
# Report how many distinct users remain after filtering.
print("========= Users number ===========")
print(user_data["user_id"].nunique())
print("=================================")
# Persist the full processed log (the DataFrame index is written as the first column).
user_data.to_csv(f'{FOLDER_DATA_PROCESSED}final_data_set.csv')

# Keep the first row of every (user, course) pair: the split works on courses, not on lessons.
user_data_reduce = user_data.drop_duplicates(subset=["user_id", "course_name"])

# Per-user train/test split, then persist both parts.
df_train, df_test = split_data_sets(user_data_reduce)
df_test.to_csv(f'{FOLDER_DATA_PROCESSED}dataTest.csv')
df_train.to_csv(f'{FOLDER_DATA_PROCESSED}dataTrain.csv')

6863


In [18]:
# Sanity check: reload the course list and the data set that was just written to disk.
course_list = np.load(f"{FOLDER_DATA_PROCESSED}listcourses.npy")
data = pd.read_csv(f"{FOLDER_DATA_PROCESSED}final_data_set.csv", sep=",", low_memory=False)

In [19]:
# Number of entries in the course list loaded from listcourses.npy.
len(course_list)

214

In [20]:
# Number of distinct courses present in the saved final data set.
data["course_name"].nunique()

195

In [21]:
# Number of distinct sessions present in the saved final data set.
data["id_session"].nunique()

55453