### fastText Training

In [None]:
import fasttext as ft
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# --- Experiment configuration ---------------------------------------------
# fastText hyper-parameters passed to ft.train_unsupervised.
epoch, dim = 30, 100

# Course identifier; it is interpolated into action-file and model file names.
course = "ALL-2020"

# Locations, written as Windows-style paths relative to the working directory.
model_path = r".\model"
actions_txt_path = r".\data\actions_txt"
Edudata = r".\data\EduData_20221028"  # not referenced by any visible cell

In [None]:
# Train an unsupervised skip-gram fastText model on the action text of the
# configured course and save it under the model directory.
# The corpus file name is derived from `course` instead of being hard-coded,
# so the training file and the saved model's name always refer to the same
# course, even if `course` has been reassigned by another cell.
train_file = actions_txt_path + r"\actions_{}.txt".format(course)
model = ft.train_unsupervised(
    train_file,
    model="skipgram",
    minCount=1,  # keep every token, including those that occur only once
    dim=dim,
    epoch=epoch,
)
model.save_model(
    model_path + r"\fastText_train{}_{}dim_{}epoch.bin".format(course, dim, epoch)
)

In [None]:
# Load the pre-trained model that was used in our paper; from this point on
# `model` refers to this file's contents.
paper_model_file = r".\model\fasttext_ALL-2020_paperused.bin"
model = ft.load_model(paper_model_file)

# Section 5

### Table 6

In [None]:
# Table 6: nearest neighbours of the token "Nm" according to the loaded model.
# Left as the cell's final expression so the notebook displays the result.
model.get_nearest_neighbors("Nm")

### Table 7

In [None]:
# Table 7: nearest neighbours of the token "NNNNsNmNsNsPl" according to the
# loaded model. Left as the cell's final expression so the result is displayed.
model.get_nearest_neighbors("NNNNsNmNsNsPl")

### Figure 5 & 6

In [None]:
# Vocabulary for Figures 5 & 6: only the word list of this auxiliary model is
# kept here.
# The file is resolved through `model_path`, like the other artefacts of this
# notebook, instead of an absolute path inside one user's home directory.
# NOTE(review): assumes the file sits in the project's model directory — confirm.
words = ft.load_model(model_path + r"\forFig5_6_wordsD-2022.bin").words

In [None]:
# Build a table of word vectors: one row per entry of `words`, one column per
# embedding dimension, indexed by the word itself (index name "word").
# The vectors are collected first and converted to a DataFrame in one step;
# this avoids growing a frame with pd.concat inside the loop (quadratic, and
# concatenating onto an empty frame is deprecated) and no longer hard-codes
# the embedding width as 100.
word_vectors = [model.get_word_vector(word) for word in words]
word_df = pd.DataFrame(word_vectors, index=pd.Index(words, name="word"))

In [None]:
# Display the word-vector table (rows are indexed by word).
word_df

In [None]:
# Pairwise cosine similarity between all rows of word_df; entry [i][j] compares
# row i with row j.
# The variable name `sim_wrods` (sic) is kept because other cells refer to it.
sim_wrods = cosine_similarity(word_df)

In [None]:
# Histogram of the cosine similarities between the word in row 1 of the
# similarity matrix and every word in the table.
# Per the original note, that word is NNNNsNmNsNsPl, a sample of the words
# contained in the fastText training data.
nm_sim = sim_wrods[1]
plt.xlabel("Cosine similarity")
plt.ylabel("Frequency")
# NOTE(review): drawn with matplotlib's default binning; pass `bins=20` here
# if a 20-bin histogram is wanted.
plt.hist(nm_sim)

In [None]:
# Histogram of the cosine similarities between the word in the last row of the
# similarity matrix and every word in the table.
# Per the original note, this row is a sample of the words that are not in the
# fastText training data.
# NOTE(review): the original note names the token NNNNsNmNsNsPl, which is also
# described as an in-training sample elsewhere — confirm which word this is.
nm_sim = sim_wrods[-1]
plt.xlabel("Cosine similarity")
plt.ylabel("Frequency")
# NOTE(review): drawn with matplotlib's default binning; pass `bins=20` here
# if a 20-bin histogram is wanted.
plt.hist(nm_sim)

### generate ALL-2020's action vectors (for making Codebook)

In [None]:
# Embed every action sequence of the course (one sequence per line of the
# action text file) as a fastText sentence vector.
course = "ALL-2020"
actions_file = actions_txt_path + r"\actions_{}.txt".format(course)
# Decode explicitly as UTF-8 — the encoding fastText assumes for its text
# input — instead of relying on the platform's default encoding.
with open(actions_file, "r", encoding="utf-8") as f:
    # Only the trailing newline is stripped; get_sentence_vector is given one
    # line at a time.
    actions = [line.rstrip("\n") for line in f]
# vecs[i] is the sentence vector of actions[i].
vecs = [model.get_sentence_vector(action) for action in actions]

In [None]:
# Display the list of sentence vectors computed for the action sequences.
vecs

In [None]:
# Action-vector table: one row per entry of `vecs`, labelled with the matching
# action string from `actions`.
df = pd.DataFrame(vecs, index=actions)
# Final expression of the cell, so the notebook displays the table.
df

In [None]:
# Write the action-vector table to the code_book directory; the file name
# carries the current course identifier.
codebook_csv = r".\data\code_book\for_CodeBook_{}.csv".format(course)
df.to_csv(codebook_csv)

### For comparison: action text files from EventStream A-2020 and D-2020

In [None]:
# Same embedding step as for ALL-2020, but with the model and the action text
# of a single course, for comparison.
course = "D-2020" # A-2020
model = ft.load_model(r".\model\fasttext_{}_paperused.bin".format(course))
actions_file = actions_txt_path + r"\actions_{}.txt".format(course)
# Decode explicitly as UTF-8 — the encoding fastText assumes for its text
# input — instead of relying on the platform's default encoding.
with open(actions_file, "r", encoding="utf-8") as f:
    # Only the trailing newline is stripped; get_sentence_vector is given one
    # line at a time.
    actions = [line.rstrip("\n") for line in f]
# vecs[i] is the sentence vector of actions[i].
vecs = [model.get_sentence_vector(action) for action in actions]

In [None]:
# Action-vector table for the selected course: one row per entry of `vecs`,
# labelled with the matching action string from `actions`.
df = pd.DataFrame(vecs, index=actions)
# Final expression of the cell, so the notebook displays the table.
df

In [None]:
# Write the selected course's action-vector table to the code_book directory;
# the file name carries the current course identifier.
codebook_csv = r".\data\code_book\for_CodeBook_{}.csv".format(course)
df.to_csv(codebook_csv)