## Generating Student Vectors (Embedding + Aggregation)

This notebook generates the student vectors for the selected "course" using a CodeBook with k centroids (k is the number of centroids; the default is 100). The result is written to:
`.\data2\vectors\norm_Student_Vctors_course{course}_{k}dim.csv`

In [None]:
import pandas as pd
import numpy as np
import fasttext as ft
import re
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

Select the course and k (the number of centroids). k corresponds to the number of dimensions of the student vectors.

In [None]:
# Available courses: A-2021, A-2022, D-2021, D-2022
course = "A-2022"
# Number of centroids in the CodeBook (= number of dimensions of each student vector)
k = 100

In [None]:
# Folder containing the CodeBook CSV files
Code_Book_dir = r".\data\code_book"
# CodeBook matching the selected number of centroids k
CodeBook_file = r"{}\CodeBook_k{}.csv".format(Code_Book_dir, k)

### Preparing to generate the student vectors for "course"

In [None]:
# actions : path of the text file holding all actions of one course
# user_df : one row per student (index = userid, column 0 = that student's actions)

actions = r".\data2\actions_txt\actions_{}_perStudents.txt".format(course)
users_actions = []
user = []
user_action = ""
with open(actions, "r") as f:
    actions_list = f.readlines()

for action in actions_list:
    if not action.startswith("****"):
        # ordinary action line: its trailing newline is kept and is what the
        # aggregation step later splits on
        user_action = user_action + action + " "
        continue
    # a "****<userid>" line closes the text accumulated so far and assigns it to that userid
    user.append(action.rstrip("\n").replace("*", ""))
    users_actions.append(user_action)
    user_action = ""

user_df = pd.DataFrame(users_actions)
user_df["userid"] = user
user_df = user_df.set_index("userid")

In [None]:
# Display the parsed table: index = userid, column 0 = the student's concatenated actions
user_df

### Aggregation

In [None]:
## read CodeBook
# Each row of the CodeBook is used as one centroid; the row position is the
# dimension of the student vector that counts the actions assigned to it.
action_centroids = pd.read_csv(CodeBook_file,index_col=0)
## load fastText model (we must use the same model to Making CodeBook)
model = ft.load_model(r".\model\fastText_trainALL-2020_100dim_30epoch.bin")

### Aggregation step
# Loop-invariant values, read once instead of once per action.
centroid_matrix = action_centroids.values
n_centroids = len(action_centroids)

# users_vecs: output students vector (one row of centroid counts per student)
users_vecs = []
# users actions
users_data = user_df.values
# aggregation step for one student
for actions_one_user in users_data:
    # split each actions; empty and single-space entries are skipped
    actions_oneuser = [a for a in actions_one_user[0].split("\n") if a != "" and a != " "]
    if not actions_oneuser:
        # no usable action: keep an all-zero row so rows stay aligned with user_df.index
        users_vecs.append(np.zeros(n_centroids, dtype=np.intp))
        continue
    # embedding: one fastText sentence vector per action, stacked as (n_actions, dim)
    action_vecs = np.vstack([model.get_sentence_vector(a) for a in actions_oneuser])
    # cosine similarity between every action vec and every centroid, in one call
    sim_mat = cosine_similarity(action_vecs, centroid_matrix)
    # get the most similar centroids ID for each action
    max_index = sim_mat.argmax(axis=1)
    # count the most similar centroids
    user_vec = np.bincount(max_index, minlength=n_centroids)
    users_vecs.append(user_vec)
boa_df = pd.DataFrame(users_vecs,index=user_df.index)

# normalization: scale every student vector (row) to unit L2 norm
norm_boa_df = pd.DataFrame(normalize(boa_df,norm="l2",axis=1),index=boa_df.index)
norm_boa_df.to_csv(r".\data2\vectors\norm_Student_Vctors_course{}_{}dim.csv".format(course,k))

In [None]:
# Display the L2-normalised student vectors produced by the aggregation step
norm_boa_df

## For comparison: fastText models trained on A-2020 and on D-2020 (k = 100)

### A-2020

In [None]:
## read CodeBook
if k == 100:
    # NOTE: this overwrites the notebook-level CodeBook_file, action_centroids and model.
    CodeBook_file = Code_Book_dir + r"\CodeBook_k{}_A20.csv".format(k)
    action_centroids = pd.read_csv(CodeBook_file,index_col=0)
    ## load fastText model (we must use the same model to Making CodeBook)
    model = ft.load_model(r".\model\fastText_trainA-2020_100dim_30epoch.bin")

    ### Aggregation step
    # Loop-invariant values, read once instead of once per action.
    centroid_matrix = action_centroids.values
    n_centroids = len(action_centroids)

    # users_vecs: output students vector (one row of centroid counts per student)
    users_vecs = []
    # users actions
    users_data = user_df.values
    # aggregation step for one student
    for actions_one_user in users_data:
        # split each actions; empty and single-space entries are skipped
        actions_oneuser = [a for a in actions_one_user[0].split("\n") if a != "" and a != " "]
        if not actions_oneuser:
            # no usable action: keep an all-zero row so rows stay aligned with user_df.index
            users_vecs.append(np.zeros(n_centroids, dtype=np.intp))
            continue
        # embedding: one fastText sentence vector per action, stacked as (n_actions, dim)
        action_vecs = np.vstack([model.get_sentence_vector(a) for a in actions_oneuser])
        # cosine similarity between every action vec and every centroid, in one call
        sim_mat = cosine_similarity(action_vecs, centroid_matrix)
        # get the most similar centroids ID for each action
        max_index = sim_mat.argmax(axis=1)
        # count the most similar centroids
        user_vec = np.bincount(max_index, minlength=n_centroids)
        users_vecs.append(user_vec)
    boa_df = pd.DataFrame(users_vecs,index=user_df.index)

    # normalization: scale every student vector (row) to unit L2 norm
    norm_boa_df = pd.DataFrame(normalize(boa_df,norm="l2",axis=1),index=boa_df.index)
    norm_boa_df.to_csv(r".\data2\vectors\norm_Student_Vctors_course{}_{}dim_A20.csv".format(course,k))
else:
    # Make the skip visible: otherwise norm_boa_df silently keeps its previous value.
    print("Skipped A-2020 comparison: it only runs when k == 100 (current k = {})".format(k))

In [None]:
# Display the student vectors. NOTE: the A-2020 cell only reassigns norm_boa_df
# when k == 100; for any other k this still shows the previously computed frame.
norm_boa_df

### D-2020

In [None]:
## read CodeBook
if k == 100:
    # NOTE: this overwrites the notebook-level CodeBook_file, action_centroids and model.
    CodeBook_file = Code_Book_dir + r"\CodeBook_k{}_D20.csv".format(k)
    action_centroids = pd.read_csv(CodeBook_file,index_col=0)
    ## load fastText model (we must use the same model to Making CodeBook)
    model = ft.load_model(r".\model\fastText_trainD-2020_100dim_30epoch.bin")

    ### Aggregation step
    # Loop-invariant values, read once instead of once per action.
    centroid_matrix = action_centroids.values
    n_centroids = len(action_centroids)

    # users_vecs: output students vector (one row of centroid counts per student)
    users_vecs = []
    # users actions
    users_data = user_df.values
    # aggregation step for one student
    for actions_one_user in users_data:
        # split each actions; empty and single-space entries are skipped
        actions_oneuser = [a for a in actions_one_user[0].split("\n") if a != "" and a != " "]
        if not actions_oneuser:
            # no usable action: keep an all-zero row so rows stay aligned with user_df.index
            users_vecs.append(np.zeros(n_centroids, dtype=np.intp))
            continue
        # embedding: one fastText sentence vector per action, stacked as (n_actions, dim)
        action_vecs = np.vstack([model.get_sentence_vector(a) for a in actions_oneuser])
        # cosine similarity between every action vec and every centroid, in one call
        sim_mat = cosine_similarity(action_vecs, centroid_matrix)
        # get the most similar centroids ID for each action
        max_index = sim_mat.argmax(axis=1)
        # count the most similar centroids
        user_vec = np.bincount(max_index, minlength=n_centroids)
        users_vecs.append(user_vec)
    boa_df = pd.DataFrame(users_vecs,index=user_df.index)

    # normalization: scale every student vector (row) to unit L2 norm
    norm_boa_df = pd.DataFrame(normalize(boa_df,norm="l2",axis=1),index=boa_df.index)
    norm_boa_df.to_csv(r".\data2\vectors\norm_Student_Vctors_course{}_{}dim_D20.csv".format(course,k))
else:
    # Make the skip visible: otherwise norm_boa_df silently keeps its previous value.
    print("Skipped D-2020 comparison: it only runs when k == 100 (current k = {})".format(k))

In [None]:
# Display the student vectors. NOTE: the D-2020 cell only reassigns norm_boa_df
# when k == 100; for any other k this still shows the previously computed frame.
norm_boa_df