### settings and setup

In [121]:
import os
import shlex
import subprocess

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [122]:
# --- labelling -------------------------------------------------------------
# Annotations whose confidence is below this value are relabelled as "Other".
CONFIDENCE_THRESHOLD = 0.7

# --- locations -------------------------------------------------------------
# Every path is resolved to an absolute path relative to the notebook's
# working directory at the time this cell runs.
DATASET = "./dataset"
TEXTIFY_BATCHES_TARGET = os.path.abspath(os.path.join(DATASET, 'textified_batches'))
DATASET_TARGET = os.path.abspath(os.path.join(DATASET, 'dataset.csv'))
PROJECTS_TARGET = os.path.abspath(os.path.join(DATASET, 'project_ids.csv'))
TEXTIFY_TARGET = os.path.abspath("../../scratch-textify")
TEXTIFY_INDEX_TARGET = os.path.abspath(os.path.join(TEXTIFY_TARGET, 'index-pool.js'))
COMBINE_BATCHES_TARGET = os.path.abspath(os.path.join(TEXTIFY_TARGET, 'combine_data_files.js'))
RECOMMEND_TARGET = os.path.abspath("..")
CLASSIFICATION_TRAIN_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_train.txt'))
CLASSIFICATION_TEST_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_test.txt'))
CLASSIFICATION_TRAIN_TARGET_NO_LABELS = os.path.abspath(os.path.join(DATASET, 'classification_train_no_labels.txt'))
CLASSIFICATION_TEST_TARGET_NO_LABELS = os.path.abspath(os.path.join(DATASET, 'classification_test_no_labels.txt'))

# --- processing ------------------------------------------------------------
BATCH_SIZE = 100        # passed to the textify script as --batch_size
TEST_PERCENTAGE = 0.2   # fraction of projects held out for the test split

# --- label strings ---------------------------------------------------------
# All three labels share the same "__label__" prefix (two trailing underscores),
# which is also the prefix the export cells prepend to each label. The previous
# ANIMATION/OTHER values had a single trailing underscore and therefore did not
# match GAME_LABEL or the exported files.
LABEL_PREFIX = "__label__"
GAME_LABEL = LABEL_PREFIX + "game"
ANIMATION_LABEL = LABEL_PREFIX + "animation"
OTHER_LABEL = LABEL_PREFIX + "other"

print("scratch-textify path:", TEXTIFY_TARGET)
print("scratch-recommend path:", RECOMMEND_TARGET)
print("\n")
print("textified_batches path:", TEXTIFY_BATCHES_TARGET)
print("index-pool.js path:", TEXTIFY_INDEX_TARGET)
print("project_ids.csv path:", PROJECTS_TARGET)
print("dataset.csv path:", DATASET_TARGET)


scratch-textify path: /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-textify
scratch-recommend path: /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend


textified_batches path: /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/textified_batches
index-pool.js path: /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-textify/index-pool.js
project_ids.csv path: /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/project_ids.csv
dataset.csv path: /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/dataset.csv


### TODO: re-process the dataset in the Google Sheet, because some projects are duplicated (this looks like a golden / test-case issue where they are being added as additional rows). Maybe "golden" isn't the right thing to be looking at!

### read dataset into df
`dataset.csv` is the "report" that Figure Eight generated from the annotated projects. For each project it contains information such as `unit_state`, the number of trusted judgments (`num_trusted_judgments`), the `label`, its `confidence`, etc.

In [3]:
# read the dataset into dataframe
# Load the Figure Eight report found at DATASET_TARGET into `df`, then render it
# as the cell output (`display` is provided by the notebook environment).
df = pd.read_csv(DATASET_TARGET)
display(df)

Unnamed: 0,unit_state,num_trusted_judgments,label,confidence,id,project_description,project_instructions,project_title,project_url,what_category_best_describes_this_project_gold
0,finalized,3,Other,1.0000,119423992,Afraid-Of-Spiders,i dont roleplay usually in studios X3 just in ...,[THE LAND OF GEMS] RP SIGN-UP remix,https://scratch.mit.edu/projects/119423992,
1,finalized,3,Animation,1.0000,115208879,Credit to @Chewzers !\n\nI just had to do this...,,Baku Baku Nya Nya | MEME | Chat Noir,https://scratch.mit.edu/projects/115208879,
2,finalized,3,Game,1.0000,106301190,,,"Undertale Battle: ""The Creator""",https://scratch.mit.edu/projects/106301190,
3,finalized,3,Game,1.0000,119969509,Note: This isn't a game based off Cookie Click...,Welcome to Tycoon! A game where you make money...,Tycoon V.AlphA6,https://scratch.mit.edu/projects/119969509,
4,finalized,5,Other,0.6019,220271346,"Thanks to Gail Carson Levine's book ""Writing M...",It's the second WBC assignment! I hope the fir...,WBC Assignment #2,https://scratch.mit.edu/projects/220271346,
...,...,...,...,...,...,...,...,...,...,...
1051,golden,6,Game,1.0000,12394514,,Arrow keys: move paddle\nS: Show Achievements ...,bounce and points,https://scratch.mit.edu/projects/12394514,Game\nOther\nCould not answer the question for...
1052,golden,6,Animation,1.0000,108630780,little of the shaking part by @MrMister100\nHo...,Press Press Press the green flag!\nMY First ev...,When I shake my head!,https://scratch.mit.edu/projects/108630780,Animation\nOther\nCould not answer the questio...
1053,golden,5,Animation,1.0000,236477301,...,my first actual animation xp\nnew oc\nCrystal III,The Time Machine | Crystal III,https://scratch.mit.edu/projects/236477301,Animation\nSlideshow\nOther\nCould not answer ...
1054,golden,7,Game,1.0000,142103269,,two player game use wasd and arrows,POKEBATTLE!,https://scratch.mit.edu/projects/142103269,Game\nOther\nCould not answer the question for...


### assign final labels

In [4]:
def assign_final_labels(projects, threshold=CONFIDENCE_THRESHOLD):
    """
    Assign each project's final label based on its confidence level:
        - if confidence >= threshold: the label is taken as is.
        - if confidence < threshold: the label is changed to "Other",
                                     and the confidence is set to 1.

    Rows whose confidence is missing (NaN) compare as "not below the
    threshold" and are therefore left untouched.

    inputs:
        - projects: projects dataframe with 'label' and 'confidence' columns.
        - threshold: confidence threshold at or above which the category is
                    retained (and below which it is transformed into "Other").

    returns:
        - new projects dataframe; the input dataframe is not modified and the
          column dtypes are preserved.
    """
    relabeled = projects.copy()

    # One boolean mask for the whole column instead of a Python call per row.
    low_confidence = relabeled['confidence'] < threshold

    relabeled.loc[low_confidence, 'label'] = "Other"
    relabeled.loc[low_confidence, 'confidence'] = 1.0  # we are now certain it's "Other"

    return relabeled

In [5]:
# assign the final labels based on confidence threshold
# assign the final labels based on confidence threshold
# NOTE: this rebinds `df`, so the frame loaded from the CSV is no longer
# reachable under that name after this cell runs.
df = assign_final_labels(df, threshold=CONFIDENCE_THRESHOLD)
display(df)

Unnamed: 0,unit_state,num_trusted_judgments,label,confidence,id,project_description,project_instructions,project_title,project_url,what_category_best_describes_this_project_gold
0,finalized,3,Other,1,119423992,Afraid-Of-Spiders,i dont roleplay usually in studios X3 just in ...,[THE LAND OF GEMS] RP SIGN-UP remix,https://scratch.mit.edu/projects/119423992,
1,finalized,3,Animation,1,115208879,Credit to @Chewzers !\n\nI just had to do this...,,Baku Baku Nya Nya | MEME | Chat Noir,https://scratch.mit.edu/projects/115208879,
2,finalized,3,Game,1,106301190,,,"Undertale Battle: ""The Creator""",https://scratch.mit.edu/projects/106301190,
3,finalized,3,Game,1,119969509,Note: This isn't a game based off Cookie Click...,Welcome to Tycoon! A game where you make money...,Tycoon V.AlphA6,https://scratch.mit.edu/projects/119969509,
4,finalized,5,Other,1,220271346,"Thanks to Gail Carson Levine's book ""Writing M...",It's the second WBC assignment! I hope the fir...,WBC Assignment #2,https://scratch.mit.edu/projects/220271346,
...,...,...,...,...,...,...,...,...,...,...
1051,golden,6,Game,1,12394514,,Arrow keys: move paddle\nS: Show Achievements ...,bounce and points,https://scratch.mit.edu/projects/12394514,Game\nOther\nCould not answer the question for...
1052,golden,6,Animation,1,108630780,little of the shaking part by @MrMister100\nHo...,Press Press Press the green flag!\nMY First ev...,When I shake my head!,https://scratch.mit.edu/projects/108630780,Animation\nOther\nCould not answer the questio...
1053,golden,5,Animation,1,236477301,...,my first actual animation xp\nnew oc\nCrystal III,The Time Machine | Crystal III,https://scratch.mit.edu/projects/236477301,Animation\nSlideshow\nOther\nCould not answer ...
1054,golden,7,Game,1,142103269,,two player game use wasd and arrows,POKEBATTLE!,https://scratch.mit.edu/projects/142103269,Game\nOther\nCould not answer the question for...


In [6]:
# sanity check that there is no confidence less than CONFIDENCE_THRESHOLD anymore.
# Keep the printed (expected: empty) frame for the reader, and additionally
# fail loudly so a Run All cannot silently continue with unprocessed rows.
low_confidence_rows = df[df['confidence'] < CONFIDENCE_THRESHOLD]
print(low_confidence_rows)
assert low_confidence_rows.empty, (
    f"{len(low_confidence_rows)} row(s) still have confidence below {CONFIDENCE_THRESHOLD}"
)

Empty DataFrame
Columns: [unit_state, num_trusted_judgments, label, confidence, id, project_description, project_instructions, project_title, project_url, what_category_best_describes_this_project_gold]
Index: []


### lump categories 

In [7]:
def lump_categories(projects, from_label="Slideshow", to_label="Other"):
    """
    Lumps the projects in the "from_label" category into the "to_label" category.

    inputs:
        - projects: projects dataframe with a 'label' column.
        - from_label: "from" category; rows whose label equals it are relabelled.
        - to_label: "to" category; the label those rows receive.

    returns:
        - new projects dataframe; the input dataframe is not modified and the
          dtypes of the other columns are preserved.
    """
    lumped = projects.copy()

    # Single vectorised comparison instead of a Python call per row.
    lumped.loc[lumped['label'] == from_label, 'label'] = to_label

    return lumped

In [8]:
# lump "Slideshow" projects into "Other" category.
# Rebinds `df` to the frame returned by lump_categories, in which every
# "Slideshow" label has been replaced by "Other".
df = lump_categories(df, from_label="Slideshow", to_label="Other")

In [9]:
# sanity check that there are no "Slideshow" projects anymore.
# Keep the printed (expected: empty) frame for the reader, and additionally
# fail loudly if any "Slideshow" label survived the lumping step.
slideshow_rows = df[df['label'] == "Slideshow"]
print(slideshow_rows)
assert slideshow_rows.empty, f"{len(slideshow_rows)} row(s) are still labelled 'Slideshow'"

Empty DataFrame
Columns: [unit_state, num_trusted_judgments, label, confidence, id, project_description, project_instructions, project_title, project_url, what_category_best_describes_this_project_gold]
Index: []


### export project ids

In [10]:
# write out the project ids for the classification task to a csv file (useful to have)
# Write one project id per line to PROJECTS_TARGET.
project_ids = df['id'].tolist()

# 'w' is sufficient (the file is never read back through this handle), and
# writelines consumes the generator directly instead of building a throwaway
# list of write() return values.
with open(PROJECTS_TARGET, 'w') as file:
    file.writelines(f"{project_id}\n" for project_id in project_ids)

print(PROJECTS_TARGET)

/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/project_ids.csv


### textify projects

In [48]:
# Build the textify invocation as an argv list (it is executed without a shell).
command = [
    "node", TEXTIFY_INDEX_TARGET,
    "--projects_file", PROJECTS_TARGET,
    "--batch_size", str(BATCH_SIZE),
    "--write_target", TEXTIFY_BATCHES_TARGET,
    "--textify_directory", TEXTIFY_TARGET,
]
# Quote each argument for display so the echoed command can be pasted into a
# shell even when a path contains spaces; arguments without special characters
# are printed unchanged.
print("command: ", " ".join(shlex.quote(argument) for argument in command))
print("\n")

command:  node /Users/labdalla/Documents/Scratch MEng/Workspace/supervised-learning/scratch-textify/index-pool.js --projects_file /Users/labdalla/Documents/Scratch MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/project_ids.csv --batch_size 100 --write_target /Users/labdalla/Documents/Scratch MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/textified_batches --textify_directory /Users/labdalla/Documents/Scratch MEng/Workspace/supervised-learning/scratch-textify




In [None]:
# Run the command built in the previous cell. check_call raises
# subprocess.CalledProcessError on a non-zero exit status, so `code` is 0
# whenever this line completes.
code = subprocess.check_call(command)

### combine textified batches into single file

#### combine .ids

In [19]:
# Extension of the batch files to combine in this pass.
extension = ".ids"
# Passed as --write_target to the combine script in the next cell.
CLASSIFICATION_DATASET_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_dataset'))

In [14]:
# Build the combine invocation as an argv list (it is executed without a shell).
command = [
    "node", COMBINE_BATCHES_TARGET,
    "--extension", extension,
    "--directory", TEXTIFY_BATCHES_TARGET,
    "--write_target", CLASSIFICATION_DATASET_TARGET,
]
# Quote each argument for display so the echoed command stays copy-pasteable
# even when a path contains spaces.
print("command: ", " ".join(shlex.quote(argument) for argument in command))
print("\n")

command:  node /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-textify/combine_data_files.js --extension .ids --directory /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/textified_batches --write_target /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_dataset




In [15]:
# Run the combine command built in the previous cell; raises
# subprocess.CalledProcessError if the process exits with a non-zero status.
code = subprocess.check_call(command)

#### combine .txt

In [20]:
# Extension of the batch files to combine in this pass.
extension = ".txt"
# Same value as in the ".ids" pass; re-assigned so this section can be run on its own.
CLASSIFICATION_DATASET_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_dataset'))

In [21]:
# Build the combine invocation as an argv list (it is executed without a shell).
command = [
    "node", COMBINE_BATCHES_TARGET,
    "--extension", extension,
    "--directory", TEXTIFY_BATCHES_TARGET,
    "--write_target", CLASSIFICATION_DATASET_TARGET,
]
# Quote each argument for display so the echoed command stays copy-pasteable
# even when a path contains spaces.
print("command: ", " ".join(shlex.quote(argument) for argument in command))
print("\n")

command:  node /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-textify/combine_data_files.js --extension .txt --directory /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/textified_batches --write_target /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_dataset




In [22]:
# Run the combine command built in the previous cell; raises
# subprocess.CalledProcessError if the process exits with a non-zero status.
code = subprocess.check_call(command)

#### combine .err

In [23]:
# Extension of the batch files to combine in this pass.
extension = ".err"
# Same value as in the earlier passes; re-assigned so this section can be run on its own.
CLASSIFICATION_DATASET_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_dataset'))

In [24]:
# Build the combine invocation as an argv list (it is executed without a shell).
command = [
    "node", COMBINE_BATCHES_TARGET,
    "--extension", extension,
    "--directory", TEXTIFY_BATCHES_TARGET,
    "--write_target", CLASSIFICATION_DATASET_TARGET,
]
# Quote each argument for display so the echoed command stays copy-pasteable
# even when a path contains spaces.
print("command: ", " ".join(shlex.quote(argument) for argument in command))
print("\n")

command:  node /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-textify/combine_data_files.js --extension .err --directory /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/textified_batches --write_target /Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_dataset




In [25]:
# Run the combine command built in the previous cell; raises
# subprocess.CalledProcessError if the process exits with a non-zero status.
code = subprocess.check_call(command)

### add labels

In [123]:
# get the appropriate dataset targets
# Absolute paths of the combined ".ids" and ".txt" files inside DATASET.
# NOTE(review): presumably these are the files written by the combine steps
# (--write_target plus the extension) — confirm against combine_data_files.js.
CLASSIFICATION_IDS_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_dataset.ids'))
CLASSIFICATION_TXT_TARGET = os.path.abspath(os.path.join(DATASET, 'classification_dataset.txt'))
print(CLASSIFICATION_IDS_TARGET)
print(CLASSIFICATION_TXT_TARGET)



/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_dataset.ids
/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_dataset.txt


In [124]:
# read in the combined dataset files into dataframes
# Read the combined dataset files into dataframes, one row per line.
#
# The files are plain line-oriented text, not CSV, so they are read directly:
#   - pd.read_csv(..., sep="\n") is rejected with a ValueError by current pandas;
#   - a CSV parser would also interpret quote characters and NA-like tokens
#     inside the project text instead of keeping each line verbatim.
def _read_nonblank_lines(path):
    """Return the non-blank lines of the UTF-8 text file at `path`, without line endings."""
    with open(path, encoding="utf-8") as handle:
        stripped = (line.rstrip("\r\n") for line in handle)
        # Blank lines are skipped, as read_csv did by default.
        return [line for line in stripped if line.strip()]

ids_df = pd.DataFrame({'id': [int(line) for line in _read_nonblank_lines(CLASSIFICATION_IDS_TARGET)]})
txt_df = pd.DataFrame({'project_text': _read_nonblank_lines(CLASSIFICATION_TXT_TARGET)})

# The two files are joined by position further down, so they must line up.
assert len(ids_df) == len(txt_df), (
    f"id/text line count mismatch: {len(ids_df)} ids vs {len(txt_df)} texts"
)


In [125]:
# Number of rows in ids_df (one row per line read from the .ids file).
NUM_PROJECTS = ids_df.shape[0]

In [126]:
# Both frames must have the same number of rows, because they are joined by
# position in the next step. The two samples are drawn independently (and
# unseeded), so their row indices are not expected to match.
print(ids_df.shape)
print(txt_df.shape)
display(ids_df.sample(n=5))
display(txt_df.sample(n=5))

(1011, 1)
(1011, 1)


Unnamed: 0,id
308,12617990
329,174634032
688,147167467
235,20974745
38,102848870


Unnamed: 0,project_text
974,_STARTSTACK_ event_whenflagclicked _NEXT_ look...
849,_STARTSTACK_ event_whenthisspriteclicked _NEXT...
609,_STARTSTACK_ event_whenflagclicked _NEXT_ look...
823,_STARTSTACK_ event_whenflagclicked _NEXT_ moti...
141,_STARTSTACK_ event_whenbroadcastreceived _MENU...


In [127]:
# join these two dataframes together
# Attach the project text to the ids frame. The values are passed as a plain
# list, so rows are matched purely by position (row i of txt_df -> row i of ids_df).
projects_text_column = txt_df['project_text'].to_list()
ids_df = ids_df.assign(project_text=projects_text_column)

In [128]:
# Spot-check five random rows of the joined frame (unseeded: differs on every run).
ids_df.sample(n=5)

Unnamed: 0,id,project_text
186,12943384,_STARTSTACK_ procedures_definition _NEXT_ cont...
22,12391492,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...
591,165884570,_STARTSTACK_ event_whenthisspriteclicked _NEXT...
585,108636510,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...
727,201278690,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...


In [129]:
# initialize labels column to OTHER_LABEL
# Assigning a scalar broadcasts it to every row, so this no longer depends on
# NUM_PROJECTS (computed in another cell) still matching the length of ids_df.
ids_df['label'] = OTHER_LABEL

In [130]:
# apply the labels using the full df from above
# apply the labels using the full df from above
#
# `df` may contain several rows for the same project id (duplicate projects in
# the FE dataset). The FIRST occurrence wins, matching the previous
# `matching_df.iloc[0]` behaviour.
# TODO: revisit once the duplicates bug is fixed.
_first_rows = df.drop_duplicates(subset='id', keep='first')
label_by_id = dict(zip(_first_rows['id'], _first_rows['label']))

def get_label(row):
    """Return `row` with its 'label' set to the first label recorded in `df` for row['id'].

    Raises KeyError if the id does not occur in `df`.
    """
    row['label'] = label_by_id[row['id']]
    return row

# Fail with a readable message (instead of an out-of-bounds IndexError deep
# inside an apply) when a textified project has no annotation row.
unknown_ids = [project_id for project_id in ids_df['id'] if project_id not in label_by_id]
if unknown_ids:
    raise KeyError(
        f"{len(unknown_ids)} textified project id(s) are missing from df, e.g. {unknown_ids[:5]}"
    )

# One dictionary lookup per row instead of one scan of `df` per row; building a
# new frame with `assign` also keeps the dtypes of the other columns intact.
ids_df = ids_df.assign(label=ids_df['id'].map(label_by_id))

In [131]:
# Spot-check five random labelled rows (unseeded: differs on every run).
ids_df.sample(n=5)

Unnamed: 0,id,project_text,label
829,12789243,_STARTSTACK_ event_whenbroadcastreceived _MENU...,Animation
870,12141553,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Game
750,12513031,_STARTSTACK_ event_whenbroadcastreceived _MENU...,Game
748,12219166,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Game
497,225884070,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Game


### shuffle and split dataset

In [132]:
# Fixed seed so that Restart & Run All reproduces the same train/test partition
# (and therefore the same exported files) on every run.
SPLIT_RANDOM_STATE = 42
dataframes = train_test_split(
    ids_df,
    shuffle=True,
    test_size=TEST_PERCENTAGE,
    random_state=SPLIT_RANDOM_STATE,
)

In [133]:
# train_test_split returned [train, test] for the single input frame.
train_df, test_df = dataframes

In [134]:
# Report the (rows, columns) shape of each partition.
print(f"train: {train_df.shape}")
print(f"test: {test_df.shape}")

train: (808, 3)
test: (203, 3)


In [135]:
# Training rows labelled "Game".
train_df.loc[train_df['label'].eq("Game")]

Unnamed: 0,id,project_text,label
94,115317599,_STARTSTACK_ event_whenflagclicked _NEXT_ moti...,Game
767,107385342,_STARTSTACK_ event_whenbroadcastreceived _MENU...,Game
137,12487113,_STARTSTACK_ event_whenbroadcastreceived _MENU...,Game
359,122207592,_STARTSTACK_ event_whenkeypressed _MENU_ menu_...,Game
294,12682709,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Game
...,...,...,...
50,213423941,_STARTSTACK_ event_whenflagclicked _NEXT_ even...,Game
865,12791495,_STARTSTACK_ event_whenbroadcastreceived _MENU...,Game
5,12504018,_STARTSTACK_ event_whenbroadcastreceived _MENU...,Game
989,12204110,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Game


In [136]:
# Training rows labelled "Animation".
train_df.loc[train_df['label'].eq("Animation")]

Unnamed: 0,id,project_text,label
866,236477301,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Animation
743,215517495,_STARTSTACK_ event_whenflagclicked _NEXT_ soun...,Animation
401,229847307,_STARTSTACK_ event_whenflagclicked _NEXT_ soun...,Animation
986,152404976,_STARTSTACK_ event_whenflagclicked _NEXT_ soun...,Animation
528,162561728,_STARTSTACK_ event_whenflagclicked _NEXT_ moti...,Animation
...,...,...,...
375,147108702,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Animation
516,11796918,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Animation
576,12566151,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...,Animation
972,229857679,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Animation


In [137]:
# Training rows labelled "Other".
train_df.loc[train_df['label'].eq("Other")]

Unnamed: 0,id,project_text,label
705,91724853,_STARTSTACK_ event_whenbackdropswitchesto _MEN...,Other
712,196396672,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...,Other
371,288010892,_STARTSTACK_ event_whenflagclicked _NEXT_ moti...,Other
746,141206552,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...,Other
907,86714101,_STARTSTACK_ event_whenkeypressed _MENU_ menu_...,Other
...,...,...,...
852,122752480,_STARTSTACK_ event_whenflagclicked _NEXT_ cont...,Other
276,12494887,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Other
513,12978908,_STARTSTACK_ event_whenflagclicked _NEXT_ look...,Other
884,137605037,_STARTSTACK_ event_whenflagclicked _ENDSTACK_ ...,Other


### export datasets

In [138]:
# write both labels and project text to the first file
# Export the training split, one project per line.
# Files are opened write-only with an explicit UTF-8 encoding so the result
# does not depend on the platform's default encoding.

# write both labels and project text to the first file
with open(CLASSIFICATION_TRAIN_TARGET, 'w', encoding='utf-8') as file:
    file.writelines(
        f'__label__{label.lower()} {project_text}\n'
        for label, project_text in zip(train_df['label'], train_df['project_text'])
    )

# write only project text to the second file (same row order as the first file)
with open(CLASSIFICATION_TRAIN_TARGET_NO_LABELS, 'w', encoding='utf-8') as file:
    file.writelines(f'{project_text}\n' for project_text in train_df['project_text'])

print(CLASSIFICATION_TRAIN_TARGET)
print(CLASSIFICATION_TRAIN_TARGET_NO_LABELS)

/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_train.txt
/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_train_no_labels.txt


In [139]:
# Export the test split, one project per line.
# Files are opened write-only with an explicit UTF-8 encoding so the result
# does not depend on the platform's default encoding.

# write both labels and project text to the first file
with open(CLASSIFICATION_TEST_TARGET, 'w', encoding='utf-8') as file:
    file.writelines(
        f'__label__{label.lower()} {project_text}\n'
        for label, project_text in zip(test_df['label'], test_df['project_text'])
    )

# write only project text to the second file (same row order as the first file)
with open(CLASSIFICATION_TEST_TARGET_NO_LABELS, 'w', encoding='utf-8') as file:
    file.writelines(f'{project_text}\n' for project_text in test_df['project_text'])

print(CLASSIFICATION_TEST_TARGET)
print(CLASSIFICATION_TEST_TARGET_NO_LABELS)

/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_test.txt
/Users/labdalla/Documents/Scratch-MEng/Workspace/supervised-learning/scratch-recommend/scratch-classify/dataset/classification_test_no_labels.txt
