# tfidf vectorize
TFIDF Vectorizing Scratch projects encoded in syntax-based language (using `scratch-textify`).

### settings and setup

In [5]:
# Executed by system command line
!pip install -q scikit-learn



In [6]:
import os
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

print(pd.__version__)

0.25.1


In [59]:
DATASET = "./dataset"
TRAIN_TARGET = os.path.abspath(os.path.join(DATASET, 'train')) # add the .txt later.

MODEL = "./model"
MODEL_TARGET = os.path.abspath(os.path.join(MODEL, 'tfidf_vectorization')) # add the .bin later.

NUM_SAMPLES = 1000

### prepare dataset for vectorization

In [60]:
train_target = TRAIN_TARGET + "_" + str(NUM_SAMPLES) + ".txt"
model_target = MODEL_TARGET + "_" + str(NUM_SAMPLES) + ".bin"

In [61]:
# read in the projects text file into df
def to_df(filepath, columns=[]):
    df = pd.read_csv(filepath, sep="\n", header=None)
    df.columns = columns
    return df

projects_df = to_df(train_target, columns=['project_text'])
print(projects_df.shape)
display(projects_df.sample(n=5))

ids_df = to_df(train_ids_target, columns=['project_id'])
print(ids_df.shape)
display(ids_df.sample(n=5))

# ids_df = ids_df.set_index('project_id')

(1000, 1)
                                          project_text
668  _STARTSTACK_ event_whenflagclicked _NEXT_ look...
228  _STARTSTACK_ event_whenflagclicked _NEXT_ look...
144  _STARTSTACK_ event_whenflagclicked _NEXT_ look...
569  _STARTSTACK_ event_whenflagclicked _NEXT_ cont...
534  _STARTSTACK_ event_whenflagclicked _NEXT_ moti...


### vectorize
Vectorize the (textified) dataset using sklearn's tfidfvectorizer. The end product is a single vector for  the entire project.

In [62]:
# tokenizer = lambda text: text_to_word_sequence(text, filters='', lower=True, split=",")
vectorizer = TfidfVectorizer(lowercase=False)
tfidf_matrix = vectorizer.fit_transform(df['project_text'])

In [63]:
features = vectorizer.get_feature_names()
print(features)
print("\n")
print("number of features: ", len(features))
print(type(tfidf_matrix))
print(tfidf_matrix.shape)

['_BOOLARG_', '_ENDINPUT_', '_ENDNEST_', '_ENDSTACK_', '_LIST_', '_MENU_', '_NEXT_', '_NUMTEXTARG_', '_STARTINPUT_', '_STARTNEST_', '_STARTSTACK_', '_VAR_', 'control_create_clone_of', 'control_delete_this_clone', 'control_forever', 'control_if', 'control_if_else', 'control_repeat', 'control_repeat_until', 'control_start_as_clone', 'control_stop', 'control_wait', 'control_wait_until', 'data_addtolist', 'data_changevariableby', 'data_deletealloflist', 'data_deleteoflist', 'data_hidelist', 'data_hidevariable', 'data_insertatlist', 'data_itemnumoflist', 'data_itemoflist', 'data_lengthoflist', 'data_listcontainsitem', 'data_replaceitemoflist', 'data_setvariableto', 'data_showlist', 'data_showvariable', 'event_broadcast', 'event_broadcastandwait', 'event_whenbackdropswitchesto', 'event_whenbroadcastreceived', 'event_whenflagclicked', 'event_whengreaterthan', 'event_whenkeypressed', 'event_whenthisspriteclicked', 'looks_backdropnumbername', 'looks_changeeffectby', 'looks_changesizeby', 'looks

In [64]:
# make it into vector with columns being the feature names
tfidf_df = pd.DataFrame(tfidf_matrix.toarray())
tfidf_df.columns = features
print(tfidf_df.sample(n=5))
print('\n')
print(tfidf_df.shape)

     _BOOLARG_  _ENDINPUT_  _ENDNEST_  _ENDSTACK_  _LIST_    _MENU_    _NEXT_  \
954        0.0    0.386938   0.100675    0.120986     0.0  0.355156  0.323274   
628        0.0    0.437191   0.170624    0.136698     0.0  0.000000  0.342430   
255        0.0    0.357278   0.083662    0.134054     0.0  0.442708  0.268645   
491        0.0    0.471015   0.097319    0.077969     0.0  0.400538  0.286458   
806        0.0    0.382848   0.000000    0.179560     0.0  0.000000  0.269879   

     _NUMTEXTARG_  _STARTINPUT_  _STARTNEST_  ...  sound_seteffectto  \
954           0.0      0.386938     0.100675  ...                0.0   
628           0.0      0.437191     0.170624  ...                0.0   
255           0.0      0.357278     0.083662  ...                0.0   
491           0.0      0.471015     0.097319  ...                0.0   
806           0.0      0.382848     0.000000  ...                0.0   

     sound_setvolumeto  sound_stopallsounds  sound_volume  \
954                

In [8]:
# save the vectorizer into disk (using pickle)?


-----

## evaluate

### nearest neighbors

In [65]:
nearest_neighbors = NearestNeighbors(metric="cosine")
nearest_neighbors.fit(tfidf_df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [66]:
k = 5
index = np.random.randint(0, tfidf_df.shape[0])
print("index: ", index)

index:  353


In [67]:
# choose a random project and find its nearest k neighbors
starting_vector = np.array([tfidf_df.iloc[index]])
print(starting_vector)

[[0.         0.46646538 0.00574892 0.05527016 0.         0.22308913
  0.37843479 0.         0.46646538 0.00574892 0.05527016 0.
  0.         0.         0.         0.0090913  0.         0.
  0.         0.         0.         0.18305679 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.05093286 0.         0.         0.08015697
  0.01989804 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.0546944
  0.         0.         0.         0.29367656 0.         0.
  0.02709232 0.         0.03117604 0.         0.04576315 0.
  0.03645538 0.11380898 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.4638717  0.
  0.         0.  

In [73]:
nearest_distances, nearest_vector_indices = nearest_neighbors.kneighbors(starting_vector, k)
print("nearest neighbors: ", list(zip(nearest_vector_indices.flatten().tolist(), nearest_distances.flatten().tolist())))


nearest neighbors:  [(353, 2.220446049250313e-16), (942, 0.012359907496883205), (602, 0.013966959018432812), (263, 0.015051012506195671), (117, 0.01689257182796844)]


In [74]:
# print the starting vector
print("============== STARTING VECTOR:\n")
print(projects_df.iloc[index]['project_text'])
print("\n")

print("============== NEAREST NEIGHBORS:\n")
# print out the project text for those nearest neighbors
nearest_vector_indices = nearest_vector_indices.flatten().tolist()
for i in nearest_vector_indices:
    if i == index:
        # this is just the starting vector's index
        continue
    print(projects_df.iloc[i]['project_text'])
    print("\n")


_STARTSTACK_ event_whenbroadcastreceived _MENU_ menu_option _MENU_ _NEXT_ looks_switchbackdropto _MENU_ menu_option _MENU_ _ENDSTACK_ _STARTSTACK_ event_whenflagclicked _NEXT_ looks_switchbackdropto _MENU_ menu_option _MENU_ _ENDSTACK_ _STARTSTACK_ event_whenbroadcastreceived _MENU_ menu_option _MENU_ _NEXT_ looks_switchbackdropto _MENU_ menu_option _MENU_ _ENDSTACK_ _STARTSTACK_ event_whenbroadcastreceived _MENU_ menu_option _MENU_ _ENDSTACK_ _STARTSTACK_ event_whenflagclicked _NEXT_ looks_show _NEXT_ looks_switchcostumeto _MENU_ menu_option _MENU_ _NEXT_ looks_sayforsecs _STARTINPUT_ numtext_input _ENDINPUT_ _STARTINPUT_ numtext_input _ENDINPUT_ _NEXT_ control_wait _STARTINPUT_ numtext_input _ENDINPUT_ _NEXT_ looks_sayforsecs _STARTINPUT_ numtext_input _ENDINPUT_ _STARTINPUT_ numtext_input _ENDINPUT_ _NEXT_ control_wait _STARTINPUT_ numtext_input _ENDINPUT_ _NEXT_ looks_switchcostumeto _MENU_ menu_option _MENU_ _NEXT_ control_wait _STARTINPUT_ numtext_input _ENDINPUT_ _NEXT_ looks_s

In [None]:
# (manually) extract project ids for those nearest neighbors.
