# vectorize
Vectorizes Scratch projects that have been encoded in a syntax-based text language (using `scratch-textify`), with `fasttext` and `tf-idf`. The output is **word embeddings** (not project embeddings).

### settings and setup

In [3]:
# Executed by system command line
!pip install -q fasttext
!pip install -q gensim
!pip install -q scikit-learn



In [4]:
import os
import warnings

import fasttext
import gensim
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

print(pd.__version__)

0.25.1


In [5]:
# Folders, relative to the working directory, for the dataset and the models.
DATASET = "./dataset"
MODEL = "./model"

# Absolute path prefixes shared by every run; the per-run suffix and the file
# extension (.txt / .bin) are appended later.
TRAIN_TARGET = os.path.join(os.path.abspath(DATASET), 'train')
MODEL_TARGET = os.path.join(os.path.abspath(MODEL), 'vectorization')


### train on 1000 samples
Vectorize the dataset using fasttext. The end product is a set of "embeddings" for blocks / symbols.

In [6]:
# Sample count for this run; used as the numeric suffix of the dataset and model file names.
NUM_SAMPLES = 1_000

In [7]:
# Build this run's file names: <prefix>_<sample count><extension>.
train_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".txt"])      # training text file
train_ids_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".ids"])  # presumably the matching ids file — TODO confirm
model_target = "".join([MODEL_TARGET, "_", str(NUM_SAMPLES), ".bin"])      # where the trained model is saved

# Show the resolved paths, one per line.
print(train_target, train_ids_target, model_target, sep="\n")

/home/jovyan/shared/lena/dataset/train_1000.txt
/home/jovyan/shared/lena/dataset/train_1000.ids
/home/jovyan/shared/lena/model/vectorization_1000.bin


In [8]:
# Fit unsupervised fastText embeddings on the text file at `train_target`.
model = fasttext.train_unsupervised(
    train_target,
    model="skipgram",  # skip-gram training objective
    minCount=5,        # minimal number of occurrences for a token to be kept (per fastText docs)
    dim=128,           # size of each embedding vector
    minn=3,            # shortest subword n-gram
    maxn=6,            # longest subword n-gram
    epoch=5,           # passes over the training data
    lr=0.05,           # learning rate
)

In [9]:
# Persist the trained model at `model_target`.
# The target directory is created first: saving into a missing folder fails,
# which would throw away the training run that just finished.
os.makedirs(os.path.dirname(model_target), exist_ok=True)
model.save_model(model_target)

### train on 10,000 samples
Vectorize the dataset using fasttext. The end product is a set of "embeddings" for blocks / symbols.

In [10]:
# Sample count for this run; used as the numeric suffix of the dataset and model file names.
NUM_SAMPLES = 10_000

In [11]:
# Build this run's file names: <prefix>_<sample count><extension>.
train_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".txt"])      # training text file
train_ids_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".ids"])  # presumably the matching ids file — TODO confirm
model_target = "".join([MODEL_TARGET, "_", str(NUM_SAMPLES), ".bin"])      # where the trained model is saved

# Show the resolved paths, one per line.
print(train_target, train_ids_target, model_target, sep="\n")

/home/jovyan/shared/lena/dataset/train_10000.txt
/home/jovyan/shared/lena/dataset/train_10000.ids
/home/jovyan/shared/lena/model/vectorization_10000.bin


In [12]:
# Fit unsupervised fastText embeddings on the text file at `train_target`.
model = fasttext.train_unsupervised(
    train_target,
    model="skipgram",  # skip-gram training objective
    minCount=5,        # minimal number of occurrences for a token to be kept (per fastText docs)
    dim=128,           # size of each embedding vector
    minn=3,            # shortest subword n-gram
    maxn=6,            # longest subword n-gram
    epoch=5,           # passes over the training data
    lr=0.05,           # learning rate
)

In [13]:
# Persist the trained model at `model_target`.
# The target directory is created first: saving into a missing folder fails,
# which would throw away the training run that just finished.
os.makedirs(os.path.dirname(model_target), exist_ok=True)
model.save_model(model_target)

### train on 100,000 samples
Vectorize the dataset using fasttext. The end product is a set of "embeddings" for blocks / symbols.

In [14]:
# Sample count for this run; used as the numeric suffix of the dataset and model file names.
NUM_SAMPLES = 100_000

In [16]:
# Build this run's file names: <prefix>_<sample count><extension>.
train_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".txt"])      # training text file
train_ids_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".ids"])  # presumably the matching ids file — TODO confirm
model_target = "".join([MODEL_TARGET, "_", str(NUM_SAMPLES), ".bin"])      # where the trained model is saved

# Show the resolved paths, one per line.
print(train_target, train_ids_target, model_target, sep="\n")

/home/jovyan/shared/lena/dataset/train_100000.txt
/home/jovyan/shared/lena/dataset/train_100000.ids
/home/jovyan/shared/lena/model/vectorization_100000.bin


In [17]:
# Fit unsupervised fastText embeddings on the text file at `train_target`.
model = fasttext.train_unsupervised(
    train_target,
    model="skipgram",  # skip-gram training objective
    minCount=5,        # minimal number of occurrences for a token to be kept (per fastText docs)
    dim=128,           # size of each embedding vector
    minn=3,            # shortest subword n-gram
    maxn=6,            # longest subword n-gram
    epoch=5,           # passes over the training data
    lr=0.05,           # learning rate
)

In [18]:
# Persist the trained model at `model_target`.
# The target directory is created first: saving into a missing folder fails,
# which would throw away the training run that just finished.
os.makedirs(os.path.dirname(model_target), exist_ok=True)
model.save_model(model_target)

### train on 500,000 samples
Vectorize the dataset using fasttext. The end product is a set of "embeddings" for blocks / symbols.

In [22]:
# Sample count for this run; used as the numeric suffix of the dataset and model file names.
NUM_SAMPLES = 500_000

In [23]:
# Build this run's file names: <prefix>_<sample count><extension>.
train_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".txt"])      # training text file
train_ids_target = "".join([TRAIN_TARGET, "_", str(NUM_SAMPLES), ".ids"])  # presumably the matching ids file — TODO confirm
model_target = "".join([MODEL_TARGET, "_", str(NUM_SAMPLES), ".bin"])      # where the trained model is saved

# Show the resolved paths, one per line.
print(train_target, train_ids_target, model_target, sep="\n")

/home/jovyan/shared/lena/dataset/train_500000.txt
/home/jovyan/shared/lena/dataset/train_500000.ids
/home/jovyan/shared/lena/model/vectorization_500000.bin


In [24]:
# Fit unsupervised fastText embeddings on the text file at `train_target`.
model = fasttext.train_unsupervised(
    train_target,
    model="skipgram",  # skip-gram training objective
    minCount=5,        # minimal number of occurrences for a token to be kept (per fastText docs)
    dim=128,           # size of each embedding vector
    minn=3,            # shortest subword n-gram
    maxn=6,            # longest subword n-gram
    epoch=5,           # passes over the training data
    lr=0.05,           # learning rate
)

In [25]:
# Persist the trained model at `model_target`.
# The target directory is created first: saving into a missing folder fails,
# which would throw away the training run that just finished.
os.makedirs(os.path.dirname(model_target), exist_ok=True)
model.save_model(model_target)