## fastText Training

In [None]:
import fasttext as ft
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide settings. `course`, `dim` and `epoch` are substituted into the
# model file name that is loaded further down.
epoch = 30  # number of training epochs (part of the model file name)
dim = 100  # embedding dimensionality (part of the model file name)
# Directory that holds the per-course action text files (actions_<course>.txt).
actions_txt_path = r".\data\actions_txt"
# NOTE(review): Edudata is not referenced by the cells visible here — confirm it is still needed.
Edudata = r'.\data\EduData_20221028'
# Directory that holds the trained fastText .bin models.
model_path = r".\model"
# Identifier of the data set whose model / action file is used.
course = "ALL-2020"

In [None]:
# Seed Python's built-in random number generator.
# NOTE(review): this only affects the `random` module; whether it has any
# influence on fastText training or on NumPy is not shown here — confirm.
import random
random.seed(42)

In [None]:
# training fastText model
##model = ft.train_unsupervised(actions_txt_path + r"\actions_{}.txt".format(course),  model='skipgram',minCount=1, dim=dim, epoch=epoch)
##model.save_model(model_path+r"\fastText_train{}_{}dim_{}epoch.bin".format(course,dim,epoch))

In [None]:
# Load the fastText model matching the current course / dim / epoch settings.
# The model that was used in the paper can be loaded instead with:
#model = ft.load_model(r".\model\fasttext_ALL-2020_paperused.bin")
model_file = model_path + r"\fastText_train{}_{}dim_{}epoch.bin".format(course, dim, epoch)
model = ft.load_model(model_file)

In [None]:
# Number of entries in the loaded model's vocabulary.
print(len(model.words))

In [None]:
# Show the vocabulary of the loaded model.
model.words

## Section 5

### Table 6

In [None]:
# Nearest neighbours of the unit "Nm" according to the loaded model.
model.get_nearest_neighbors("Nm")

### Table 7

In [None]:
# Nearest neighbours of the unit "NNNNsNmNsNsPl" according to the loaded model.
model.get_nearest_neighbors("NNNNsNmNsNsPl")

### Figure 3

In [None]:
# Load all units of D-2022: the D-2022 model is opened only to read its
# vocabulary (`.words`); the model object itself is not kept.
units = ft.load_model(r".\model\fastText_trainD-2022_100dim_30epoch.bin").words

In [None]:
# Build a DataFrame with one row per unit:
#   index ("word") = unit string, columns = components of the unit's vector
#   as returned by `model.get_word_vector`.
# The vectors are collected first and the frame is created in a single step:
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration. The row width is taken from the model instead of being
# hard-coded to 100, so other embedding sizes work as well.
unit_vectors = np.asarray([model.get_word_vector(unit) for unit in units])
unit_vectors = unit_vectors.reshape(len(units), model.get_dimension())
unit_df = pd.DataFrame(unit_vectors, index=pd.Index(units, name="word"))

In [None]:
# Display the unit-vector table (index = unit, columns = vector components).
unit_df

In [None]:
# sim_units : cosine similarity matrix of units.
# Computed between every pair of rows of unit_df, so row/column i of
# sim_units corresponds to unit_df.index[i].
sim_units = cosine_similarity(unit_df)
#print(sim_units.shape)
sim_units

In [None]:
# Nm (a sample of words in fastText Training data)
# Row 1 of sim_units holds the similarities between unit_df's second unit and
# every unit; the print shows which unit that is
# (expected to be "Nm" — confirm from the printed value).
nm_sim = sim_units[1]
print(unit_df.index[1])

# Histogram of that unit's cosine similarities to all units.
plt.xlabel("Cosine similarity")
plt.ylabel("Frequency")
plt.hist(nm_sim)

In [None]:
# NNNNsNmNsNsPl (a sample of words not in fastText Training data)
# The last row of sim_units holds the similarities between unit_df's last unit
# and every unit; the print shows which unit that is
# (expected to be "NNNNsNmNsNsPl" — confirm from the printed value).
# NOTE: the name `nm_sim` is kept from the previous cell even though the data
# now belongs to a different unit.
nm_sim = sim_units[-1]
print(unit_df.index[-1])

# Histogram of that unit's cosine similarities to all units.
plt.xlabel("Cosine similarity")
plt.ylabel("Frequency")
plt.hist(nm_sim)

In [None]:
# Overlay the cosine-similarity distributions of two units (rows 1 and -1 of
# sim_units) on a common set of 20 bins.
row_a, row_b = 1, -1
sims_a, sims_b = sim_units[row_a], sim_units[row_b]
bins = np.histogram_bin_edges(np.concatenate((sims_a, sims_b)), bins=20)

# Count both distributions on the shared bin edges so the bars line up.
n1, _ = np.histogram(sims_a, bins=bins)
n2, _ = np.histogram(sims_b, bins=bins)

# Draw the histograms from the pre-computed counts.
# The legend labels are read from unit_df's index so they always name the
# rows that are actually plotted (the previous hand-typed second label,
# "NNNsNmNsNsPl", did not match the unit "NNNNsNmNsNsPl" used elsewhere).
plt.hist(bins[:-1], bins, weights=n1, alpha=0.5, label=str(unit_df.index[row_a]))
plt.hist(bins[:-1], bins, weights=n2, alpha=0.5, label=str(unit_df.index[row_b]))
plt.legend()
# Fix the y-axis to the tallest bar of either distribution.
max_height = max(n1.max(), n2.max())
plt.ylim(0, max_height)
plt.xlabel('Cosine Similarity')
plt.ylabel('The number of units')
plt.show()

## generate ALL-2020's action vectors (for making Codebook)

In [None]:
# Turn every action (one per line of the ALL-2020 action file) into a
# sentence vector with the loaded fastText model.
course = "ALL-2020"
actions_file = actions_txt_path + r"\actions_{}.txt".format(course)
# fastText treats its input text as UTF-8, so the file is decoded as UTF-8
# here as well instead of relying on the platform's default encoding.
with open(actions_file, "r", encoding="utf-8") as f:
    # Each line of the text file is one action; only the trailing newline is
    # removed (get_sentence_vector is given the text without it).
    actions = [line.rstrip("\n") for line in f]
# One action vector per action, in file order (vecs[i] belongs to actions[i]).
vecs = [model.get_sentence_vector(action) for action in actions]
len(actions)

In [None]:
# Table of action vectors: one row per action, indexed by the action text.
# Rows labelled with the empty string '' are removed. Those '' entries were
# produced when A-2020 and D-2020 were concatenated into one text file, and
# they yield zero vectors.
df = pd.DataFrame(vecs, index=actions)
df = df[df.index != '']

In [None]:
# save all action vectors in ALL-2020
# The file name is derived from `course`; the index (action text) is written
# as the first CSV column.
# NOTE(review): the output goes to .\data2 while the inputs are read from
# .\data — confirm this directory is intended.
df.to_csv(r".\data2\code_book\for_CodeBook_{}.csv".format(course))

## for comparison: action text file from EventStream A-2020 and D-2020

In [None]:
# Same procedure for a single course, using the model trained on that
# course's own data.
course = "D-2020" # A-2020
# load fastText model trained by course data
model = ft.load_model(r".\model\fastText_train{}_100dim_30epoch.bin".format(course))
actions_file = actions_txt_path + r"\actions_{}.txt".format(course)
# fastText treats its input text as UTF-8, so the file is decoded as UTF-8
# here as well instead of relying on the platform's default encoding.
with open(actions_file, "r", encoding="utf-8") as f:
    # Each line is one action; only the trailing newline is removed.
    actions = [line.rstrip("\n") for line in f]
# One action vector per action, in file order (vecs[i] belongs to actions[i]).
vecs = [model.get_sentence_vector(action) for action in actions]

In [None]:
# Table of action vectors for `course`: one row per action, indexed by the
# action text.
# NOTE(review): unlike the ALL-2020 table, rows labelled '' are not removed
# here — confirm that this is intended.
df = pd.DataFrame(vecs, index=actions)
df

In [None]:
# Save the course's action vectors as CSV; the file name is derived from `course`.
df.to_csv(r".\data2\code_book\for_CodeBook_{}.csv".format(course))