## Preprocessing

In [None]:
import OpenLA as la
import sys
import os
import re
import numpy as np
import datetime as dt
import pandas as pd

In [None]:
# Directory that receives the generated action-sentence text files.
actions_txt_path = r".\data\actions_txt"
# Directory passed to OpenLA as ``files_dir`` when loading a course.
Edudata = r'.\data\EduData_20221028'
# Course ids to process: courses A and D for each year 2020-2022, in year order.
courses = [
    "{}-{}".format(letter, year)
    for year in (2020, 2021, 2022)
    for letter in ("A", "D")
]

In [None]:
# Maps each raw operation name of the event log to a one-letter token.
# Seven operations get their own letter; every other listed operation is
# folded into the shared token 'E'.
Operation_dict = {
    'NEXT': 'N',
    'PREV': 'P',
    'ADD MARKER': 'A',
    'OPEN': 'O',
    'CLOSE': 'C',
    'PAGE_JUMP': 'J',
    'GETIT': 'G',
    **dict.fromkeys(
        (
            'DELETE MARKER',
            'BOOKMARK_JUMP',
            'ADD BOOKMARK',
            'NOTGETIT',
            'ADD MEMO',
            'MEMO_TEXT_CHANGE_HISTORY',
            'DELETE BOOKMARK',
            'CHANGE MEMO',
            'SEARCH_JUMP',
            'REGIST CONTENTS',
            'DELETE_MEMO',
            'SEARCH',
            'OPEN_RECOMMENDATION',
            'CLICK_RECOMMENDATION',
            'TIMER_PAUSE',
            'TIMER_STOP',
            'ADD_HW_MEMO',
            'CLOSE_RECOMMENDATION',
            'CLEAR_HW_MEMO',
            'LINK_CLICK',
            'UNDO_HW_MEMO',
            'ADD_RECOMMENDATION',
            'REDO_HW_MEMO',
            'DELETE_RECOMMENDATION',
            'MEMO_JUMP',
        ),
        'E',
    ),
}


def get_learninglog_sentences(eventstream, userid):
    """Return the sentence text built from one user's events.

    The user's rows are extracted from *eventstream* with
    ``get_user_eventstream`` and converted with ``get_oneuser_sentences``
    using its default maximum word length.
    """
    return get_oneuser_sentences(get_user_eventstream(eventstream, userid))

def get_user_eventstream(eventstream, userid):
    """Return one user's events as a tokenised, 0-indexed DataFrame.

    The user's events are selected with ``la.select_user``, sorted by
    ``contentsid`` and then ``eventtime``, and reduced to the columns
    ``contentsid``, ``operationname`` and ``eventtime``. Values found in
    ``Operation_dict`` are then replaced by their one-letter token (the
    replacement is applied to the whole frame).
    """
    kept_columns = ["contentsid", "operationname", "eventtime"]
    selected = la.select_user(eventstream, userid)
    ordered = selected.df.sort_values(["contentsid", "eventtime"])[kept_columns]
    tokenised = ordered.replace(Operation_dict)
    # Renumber the rows 0..n-1 so callers can detect the first row by index.
    tokenised.index = np.arange(0, len(tokenised))
    return tokenised

def get_oneuser_sentences(user_eventstream_df, word_max_len=15) -> str:
    """Convert one user's event stream into sentence text.

    Rows are processed in order. Each event appends its ``operationname``
    token to the current word, preceded by the token that
    ``interval_check`` returns for the gap since the previous event.

    A new line (sentence) is started when:
      1. ``contentsid`` differs from the previous row, or
      2. the interval token is ``'l'`` (gap longer than 5 minutes).

    The current word is closed when:
      1. it has reached ``word_max_len - 1`` characters (closed with
         ``"_ "``), or
      2. the event is more than 1 minute after the first event of the
         word (closed with ``" "``).

    Args:
        user_eventstream_df: events of a single user with the columns
            ``contentsid``, ``operationname`` and ``eventtime``
            (``eventtime`` formatted as ``'%Y-%m-%d %H:%M:%S'``).
        word_max_len: maximum word length; longer words are split.

    Returns:
        The sentence text, terminated by a newline, or ``""`` when the
        frame has no rows.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    word_window = dt.timedelta(minutes=1)

    pieces = []  # finished fragments of the result, joined at the end
    word = ""    # word currently being assembled
    previous = None
    previous_contents_id = None
    end = None   # time after which the current word must be closed

    # Renumber rows from 0 so the first row can be recognised by its index.
    events = user_eventstream_df.reset_index(drop=True)
    for index, data in events.iterrows():
        current = dt.datetime.strptime(data["eventtime"], time_format)
        current_contents_id = data["contentsid"]

        if index == 0:
            # First row: there is no predecessor, so compare it with itself.
            previous = current
            previous_contents_id = current_contents_id
            end = current + word_window

        if previous_contents_id != current_contents_id:
            # Content changed: close the sentence and start a new word.
            pieces.append(word + "\n")
            word = ""
            end = current + word_window
        else:
            # total_seconds() keeps whole days; timedelta.seconds would
            # drop them and report a multi-day gap as a short one.
            interval_sec = int((current - previous).total_seconds())
            interval_word = interval_check(interval_sec)
            word += interval_word

            if interval_word == "l":
                # Long pause: the pause token ends the sentence.
                pieces.append(word + "\n")
                word = ""
                end = current + word_window

            if len(word) >= word_max_len - 1:
                # Forced split of an over-long word.
                pieces.append(word + "_ ")
                word = ""
                end = current + word_window

            if current > end:
                # More than a minute since the word started: begin a new one.
                pieces.append(word + " ")
                word = ""
                end = current + word_window

        word += data["operationname"]
        previous = current
        previous_contents_id = current_contents_id

    # Flush the word that is still open after the last row.
    pieces.append(word)
    sentences = "".join(pieces)
    if sentences != "":
        sentences += "\n"
    return sentences


def interval_check(interval_sec: int):
    """Return the token that encodes a gap of *interval_sec* seconds.

    ``''`` for gaps up to 1 s, ``'s'`` up to 10 s, ``'m'`` up to 300 s and
    ``'l'`` for anything longer.
    """
    if interval_sec <= 1:
        return ''
    if interval_sec <= 10:
        return 's'
    if interval_sec <= 300:
        return 'm'
    return 'l'

# Write the generated sentences to a file
def write_sentences(file_path, actions, usersid, train_flg=0):
    """Write the per-user sentence texts to *file_path*.

    Args:
        file_path: path of the text file to create or overwrite.
        actions: sentence texts, one entry per user; ``None`` entries are
            skipped.
        usersid: user ids, index-aligned with *actions*.
        train_flg: when ``0`` (default) each written text is followed by a
            separator line ``****<userid>****``; any other value writes
            the texts only.
    """
    # The context manager closes the file even if a write raises.
    with open(file_path, 'w') as f:
        for i, action in enumerate(actions):
            if action is None:
                continue
            f.write(action)
            if train_flg == 0:
                f.write('****{}****\n'.format(usersid[i]))

### EventStream to Sentences

In [None]:
def get_Actions_train(course_id):
    """Build and save the training text file for one course.

    Prints *course_id*, loads the course's event stream through OpenLA,
    converts every user returned by ``course_info.user_id()`` into sentence
    text and writes the texts, without user-id separator lines
    (``train_flg=1``), to ``actions_<course_id>.txt`` under
    ``actions_txt_path``.
    """
    print(course_id)

    # Load the event stream of the requested course.
    course_info = la.CourseInformation(files_dir=Edudata, course_id=course_id)
    eventstream = course_info.load_eventstream()

    # One sentence text per user id of the course.
    usersid = course_info.user_id()
    actions = []
    for userid in usersid:
        actions.append(get_learninglog_sentences(eventstream, userid))

    # Save the result.
    actions_file = actions_txt_path + r"\actions_{}.txt".format(course_id)
    write_sentences(actions_file, actions, usersid, train_flg=1)

In [None]:
# Generate the training text file (written without user-id separator lines)
# for every configured course.
for course in courses:
    get_Actions_train(course)

Make the ALL-2020 actions text file

In [None]:
def find_files(dir, pattern):
    """Return the paths of all files under *dir* whose name matches *pattern*.

    The directory tree is walked recursively with ``os.walk``. *pattern* is
    a regular expression applied to the bare file name with ``re.match``,
    so it is anchored at the start of the name only.
    """
    name_matches = re.compile(pattern).match
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(dir)
        for filename in filenames
        if name_matches(filename)
    ]

def concat_files(files, year):
    """Concatenate *files* into ``actions_ALL-<year>.txt``.

    The output file is created (or overwritten) under ``actions_txt_path``.
    The content of every input file is copied line by line, and one extra
    newline is written after each file.
    """
    merged_path = actions_txt_path + r"\actions_ALL-{}.txt".format(year)
    with open(merged_path, "w") as merged:
        for source_path in files:
            with open(source_path) as source:
                merged.writelines(source)
            merged.write("\n")

In [None]:
# Merge every single-course file of the given year (actions_<letter>-<year>.txt)
# into one ALL-<year> file.
year = "2020"
# The dot is escaped and the end is anchored so only the exact file names
# match; find_files uses re.match, which anchors only the start of the name.
pattern = r'actions_[A-Z]-{}\.txt$'.format(re.escape(year))
matched_fiels = find_files(actions_txt_path, pattern)
concat_files(matched_fiels, year)

### Actions for making vectors (split by each user)

In [None]:
def get_Actions_Students(course_id):
    """Build and save the per-student text file for one course.

    Loads the course's event stream through OpenLA, converts every user
    returned by ``eventstream.user_id()`` into sentence text and writes the
    texts to ``actions_<course_id>_perStudents.txt`` under
    ``actions_txt_path``. ``write_sentences`` is called with its default
    ``train_flg``, so each text is followed by a ``****<userid>****`` line.
    """
    # Load the event stream of the requested course.
    course_info = la.CourseInformation(files_dir=Edudata, course_id=course_id)
    eventstream = course_info.load_eventstream()

    # One sentence text per user id that appears in the event stream.
    usersid = eventstream.user_id()
    actions = []
    for userid in usersid:
        actions.append(get_learninglog_sentences(eventstream, userid))

    # Save the result.
    actions_file = actions_txt_path + r"\actions_{}_perStudents.txt".format(course_id)
    write_sentences(actions_file, actions, usersid)

In [None]:
# Generate the per-student text file (each user's text followed by a
# ****userid**** separator line) for every configured course.
for course in courses:
    get_Actions_Students(course)

### Histogram of the number of actions per student in A-2022 and D-2022

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Column names of the per-user count table. Each row appended by the counting
# loop is [userid, num_word, num_sentences], so "num_units" receives the word
# count and "num_actions" receives the line (sentence) count.
columns = ["userid", "num_units", "num_actions"]

In [None]:
# For each 2022 course, count per user the words and lines of the per-student
# text file and save the counts as CSV.
for course_id in ["A-2022", "D-2022"]:
    data_list = []
    actions_file = actions_txt_path + r"\actions_{}_perStudents.txt".format(course_id)
    with open(actions_file, "r") as f:
        num_word = 0
        num_sentences = 0
        for line in f:
            if line.startswith("****"):
                # Separator line "****<userid>****": it closes the block of
                # the user whose lines were just counted.
                userid = line.strip().strip("*")
                data_list.append([userid, num_word, num_sentences])
                num_word = 0
                num_sentences = 0
            else:
                words = line.split()
                if not words:
                    # A blank line has no words; ''.split(' ') would have
                    # counted it as one word and one sentence.
                    continue
                num_word += len(words)
                num_sentences += 1
    df = pd.DataFrame(data_list, columns=columns)
    df = df.set_index("userid")
    df.to_csv(actions_txt_path + r'\{}_words_sentences_count.csv'.format(course_id))

In [None]:
# Reload the per-user count tables that were saved as CSV for the two 2022 courses.
df_a = pd.read_csv(actions_txt_path + r'\{}_words_sentences_count.csv'.format("A-2022"))
df_d = pd.read_csv(actions_txt_path + r'\{}_words_sentences_count.csv'.format("D-2022"))

In [None]:
# Select the "num_actions" column of each course as the histogram input.
# NOTE(review): the counting cell fills this column with the per-user line
# (sentence) count — confirm that this is the quantity meant to be plotted.
data1 = df_a["num_actions"]
data2 = df_d["num_actions"]

In [None]:
# Compute 30 bin edges over the combined data so both courses share the same bins.
bins = np.histogram_bin_edges(np.concatenate((data1, data2)), bins=30)

# Per-course counts on the shared bins.
n1, _ = np.histogram(data1, bins=bins)
n2, _ = np.histogram(data2, bins=bins)

# Draw the pre-computed counts as two overlaid, semi-transparent histograms:
# each bin's left edge is used as a sample and weighted by the bin's count.
plt.hist(bins[:-1], bins=bins, weights=n1, alpha=0.5, label="A-2022")
plt.hist(bins[:-1], bins=bins, weights=n2, alpha=0.5, label="D-2022")
plt.legend()

# Limit the y-axis to the tallest bar of either course.
max_height = max(n1.max(), n2.max())
plt.ylim(0, max_height)
plt.xlabel('The number of actions')
plt.ylabel('The number of users')
plt.show()