# fasttext vectorize
Vectorize Scratch projects, encoded in a syntax-based language (using `scratch-textify`), with fastText.

### settings and setup

In [10]:
# Executed by system command line
# !pwd
!pip install -q fasttext
!pip install -q gensim
!pip install -q scikit-learn



In [11]:
import os
import warnings

import fasttext
import gensim
from sklearn.manifold import TSNE

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Silence every warning category for the remainder of the notebook.
warnings.simplefilter('ignore')

# Record the pandas version in use.
print(pd.__version__)

0.25.1


In [14]:
# Sample count; it is appended to both the dataset and the model file name.
NUM_SAMPLES = 1000

# Training corpus: absolute path stem, the "_<NUM_SAMPLES>.txt" suffix is added later.
DATASET = "./dataset"
TRAIN_TARGET = os.path.join(os.path.abspath(DATASET), 'train')

# Trained model: absolute path stem, the "_<NUM_SAMPLES>.bin" suffix is added later.
MODEL = "./model"
MODEL_TARGET = os.path.join(os.path.abspath(MODEL), 'vectorization')

### vectorize
Vectorize the dataset using fastText. The end products are "embeddings" for blocks / symbols.

In [50]:
# Build the concrete file names for the configured sample size:
# "<stem>_<NUM_SAMPLES>.txt" for the corpus and "<stem>_<NUM_SAMPLES>.bin" for the model.
train_target = "{}_{}.txt".format(TRAIN_TARGET, NUM_SAMPLES)
model_target = "{}_{}.bin".format(MODEL_TARGET, NUM_SAMPLES)

In [51]:
# Train unsupervised skip-gram embeddings on the training corpus.
model = fasttext.train_unsupervised(
    train_target,
    model="skipgram",
    # vocabulary / context
    minCount=5,
    wordNgrams=1,
    # sub-word n-gram length range
    minn=3,
    maxn=6,
    # size of each embedding vector
    dim=128,
    # optimisation schedule
    epoch=5,
    lr=0.05,
)

In [52]:
# The model folder may not exist yet (e.g. on a fresh checkout); create it first
# so that saving does not fail on a missing directory.
os.makedirs(os.path.dirname(model_target), exist_ok=True)
model.save_model(model_target)

### load model
Load the model into gensim for better evaluation below.

In [53]:
# Re-open the fastText binary saved above with gensim's Facebook-format loader;
# the evaluation cells below query this object.
gensim_model = gensim.models.fasttext.load_facebook_vectors(model_target)

-----

## evaluate

## word embeddings

### nearest neighbors

In [54]:
# `gensim_model` comes from `load_facebook_vectors`, i.e. it already is the set of
# keyed vectors. Query it directly: the extra `.wv` hop is deprecated in gensim 3.x
# and no longer available in gensim 4.
gensim_model.most_similar(positive=["motion_turnright"])

[('motion_turnleft', 0.8829981684684753),
 ('motion_movesteps', 0.6902536153793335),
 ('motion_pointindirection', 0.6546069383621216),
 ('motion_ifonedgebounce', 0.6008567214012146),
 ('motion_pointtowards', 0.5628019571304321),
 ('motion_direction', 0.5596492290496826),
 ('looks_changesizeby', 0.5229341387748718),
 ('motion_glideto', 0.5154552459716797),
 ('motion_setrotationstyle', 0.48891136050224304),
 ('pen_changePenHueBy', 0.4830555319786072)]

## project embeddings
Combine the word embeddings of all words in a project's text by averaging them. The resulting vector is the project embedding.

In [55]:
# read in the projects text file into df
def csv_to_df(filepath, columns=None):
    """Read a line-oriented text file into a single-column DataFrame.

    Every non-blank line of the file becomes one row. The file is read as plain
    text rather than through ``pd.read_csv(sep="\\n")``: newer pandas versions
    reject a newline separator, and CSV parsing would additionally interpret
    quote characters and NA-like tokens inside the text.

    Parameters
    ----------
    filepath : str
        Path of the UTF-8 text file to read.
    columns : list of str, optional
        Column names to assign. When omitted (or empty) the single column keeps
        the default label ``0``.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, one column holding the line text.

    Raises
    ------
    ValueError
        If ``columns`` does not contain exactly one name.
    """
    with open(filepath, encoding="utf-8") as handle:
        # Drop the line terminator only; blank lines are skipped (as read_csv did).
        lines = [line.rstrip("\r\n") for line in handle if line.strip()]

    # Build from a dict so that even an empty file yields exactly one column.
    frame = pd.DataFrame({0: lines})
    if columns:
        frame.columns = list(columns)
    return frame

# Load the training corpus into a one-column frame and peek at five random rows.
df = csv_to_df(train_target, ['project_text'])
print(df.shape)
display(df.sample(5))

(1000, 1)


Unnamed: 0,project_text
542,_STARTSTACK_ event_whenthisspriteclicked _NEXT...
492,_STARTSTACK_ event_whenkeypressed _MENU_ menu_...
649,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...
568,_STARTSTACK_ event_whenthisspriteclicked _NEXT...
116,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...


In [92]:
# Tokenize every project by splitting its text on single spaces (" ").
tokens_df = (
    df['project_text']
    .map(lambda text: text.split(" "))
    .to_frame('project_tokens')
)

In [100]:
# Spot check: how many tokens does the project at position 100 contain?
print("Number of tokens: ", len(tokens_df['project_tokens'].iloc[100]))

Number of tokens:  1051


In [94]:
def get_batch_word_embeddings(project_tokens):
    """Return the word vector of every token of a single project.

    Each token is looked up with ``get_word_vector`` on the module-level
    ``model``; the vectors are returned as a list in token order.
    """
    lookup = model.get_word_vector
    return [lookup(token) for token in project_tokens]
    
# Look up the word vectors of every token, project by project.
embeddings_df = (
    tokens_df['project_tokens']
    .map(get_batch_word_embeddings)
    .to_frame('embeddings')
)

In [95]:
# Turn each project's list of token vectors into a single numpy array.
embeddings_df = (
    embeddings_df['embeddings']
    .map(lambda vectors: np.array(vectors))
    .to_frame('embeddings')
)
display(embeddings_df)

Unnamed: 0,embeddings
0,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
1,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
2,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
3,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
4,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
...,...
995,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
996,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
997,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."
998,"[[0.21450824, -0.045012362, -0.13869916, 0.273..."


In [101]:
# Spot check: shape of the token-vector array of the project at position 100.
print(embeddings_df['embeddings'].iloc[100].shape)

(1051, 128)


In [102]:
# take the mean of those vectors to get the project embedding
project_embeddings = embeddings_df.apply(lambda row: np.mean(row['embeddings'], axis=0), axis=1)
project_embeddings = pd.DataFrame(project_embeddings)
project_embeddings.columns = ['embedding']
display(project_embeddings)

Unnamed: 0,embedding
0,"[0.023995442, 0.022763226, 0.0007133481, 0.036..."
1,"[-0.016639246, -0.0019163504, -0.0150304735, -..."
2,"[-0.05557948, -0.018262913, -0.03116151, -0.00..."
3,"[0.04061277, 0.031710193, -0.006383933, 0.0316..."
4,"[0.047298573, -0.009121757, -0.043928746, 0.06..."
...,...
995,"[-0.030901695, -0.03857236, -0.032687176, 0.02..."
996,"[0.06905353, 0.053826157, -0.0106923, 0.042574..."
997,"[0.06646415, -0.020444186, 0.011007728, 0.0525..."
998,"[0.08105424, 0.0006001053, -0.015349248, 0.066..."


In [105]:
# Spot check: shape of the first project's embedding vector.
print(project_embeddings['embedding'].iloc[0].shape)

(128,)


### nearest neighbors