In [4]:
import re
from tqdm.notebook import tqdm
from builtins import any as b_any
import enchant
import pandas as pd

In [5]:
# Column names applied to the raw log CSV, in file order (passed to
# ``pd.read_csv(..., names=col_names)``).
col_names = [
    "session",
    "date",
    "level",
    "code",
    "server_error",
    "client_error",
    "version",
    "submission_id",
    "lang",
    "email",
    "username",
    "is_test",
    "demo",
    "start",
]

def banned_word_check(banned, word):
    """Return True when ``word`` is an exact member of ``banned``.

    The comparison is whatever ``in`` does for the given container, so for a
    collection of strings it is an exact, case-sensitive match.
    """
    return word in banned
    
def personal_word_check(d, word):
    """Decide whether ``word`` should be treated as a personal/unknown token.

    Three variants of the word are considered: the word itself, the word
    without its first character and the word without its last character
    (this tolerates one leading or trailing punctuation mark).

    Args:
        d: spell-checker object exposing ``check(str) -> bool``
            (an ``enchant.Dict`` in this file).
        word: the token to inspect.

    Returns:
        True only when no variant is accepted by ``d.check`` AND no variant
        is purely alphabetic; False otherwise, including for an empty word.
    """
    if not word:
        return False

    # Drop empty variants: for a one-character word both slices are "" and
    # pyenchant's ``Dict.check`` rejects the empty string with ValueError.
    # An empty string is never alphabetic either, so skipping it does not
    # change the outcome of the second test.
    variants = [v for v in (word, word[1:], word[:-1]) if v]

    if any(d.check(v) for v in variants):
        return False
    return not any(v.isalpha() for v in variants)

def process_code(code, level, banned, d):
    """Mask every non-keyword token of each program with ``%``.

    Each program is split on whitespace and its tokens are handled in order:

    * a token that is a keyword for ``level`` (see ``get_concepts``) is kept;
    * a token found in ``banned`` rejects the whole program and increments
      the offensive counter;
    * a token flagged by ``personal_word_check`` rejects the whole program
      and increments the personal counter;
    * any other token has the characters matched by the mask pattern
      replaced with ``%``.

    Afterwards every token that contains a ``%`` is collapsed to a single
    ``%``. Tokens are emitted separated by (and followed by) one space.

    Args:
        code: iterable of program source strings.
        level: level number used to select the keywords kept verbatim.
        banned: iterable of banned words (hashable, compared exactly).
        d: spell-checker object forwarded to ``personal_word_check``.

    Returns:
        A tuple ``(processed, offensive_counter, personal_counter)`` where
        ``processed`` has one entry per input program: the masked program,
        or ``"-"`` when the program was rejected or had no tokens.
    """
    concepts = get_concepts(level)
    # Built once: membership in a set is O(1), whereas the word list would
    # be scanned linearly for every token of every program.
    banned_words = frozenset(banned)
    # Compiled once instead of being looked up per token.
    # NOTE(review): inside a character class ``|`` is a literal pipe, not
    # alternation, so pipes are masked as well; the pattern is kept as-is
    # so the output does not change.
    mask = re.compile(r"[a-z|A-Z|0-9|!?,''{}]")

    offensive_counter = 0
    personal_counter = 0
    processedCode = []

    for program in tqdm(code):
        tokens = []
        rejected = False
        for word in program.split():
            if word in concepts:
                tokens.append(word)
            elif banned_word_check(banned_words, word):
                offensive_counter += 1
                rejected = True
                break
            elif personal_word_check(d, word):
                personal_counter += 1
                rejected = True
                break
            else:
                tokens.append(mask.sub("%", word))

        if rejected or not tokens:
            # "-" is the marker for "no usable program".
            processedCode.append("-")
            continue

        # Any token holding at least one "%" collapses to a lone "%"; every
        # token, including the last, is followed by a single space.
        processedCode.append(
            "".join("% " if "%" in token else token + " " for token in tokens)
        )

    return processedCode, offensive_counter, personal_counter

def get_concepts(level):
    """Return the list of keywords that are kept verbatim at ``level``.

    Levels 1 through 10 are known; any other value yields an empty list.
    A new list is returned on every call.
    """
    core = ['print', 'is', 'ask', 'at', 'random']
    branching = core + ['if', 'else']
    operators = ['+', '-', '*']

    keywords_by_levels = (
        ((1,), ['print', 'ask', 'echo']),
        ((2,), ['print', 'ask', 'at', 'random']),
        ((3,), core),
        ((4,), branching),
        ((5,), branching + ['repeat', 'times']),
        ((6, 7), branching + ['repeat', 'times'] + operators),
        ((8, 9, 10), branching + ['for', 'in', 'range'] + operators),
    )

    for levels, keywords in keywords_by_levels:
        if level in levels:
            return list(keywords)
    return []

def get_files(language):
    """Clean and de-duplicate the logged programs of one language, per level.

    The log CSV is filtered down to rows without a server error, not marked
    as demo or start, and written in ``language``. For each level 1-10 the
    distinct programs are passed through ``process_code``; rejected programs
    and programs whose masked form is a duplicate are dropped, and the
    counts are printed.

    Args:
        language: ``"nl"`` or ``"en"``; selects the spell-check dictionary,
            the ``lang`` value to keep, and the ``bad-words-<language>.txt``
            word list.

    Raises:
        SystemExit: with status 1 when ``language`` is not supported.
    """
    # language code -> (enchant dictionary tag, name used in the report)
    supported = {"nl": ("nl_NL", "dutch"), "en": ("en_US", "english")}
    if language not in supported:
        print("Unfortunately your selected language is currently not supported...")
        # Raise explicitly rather than calling the interactive ``exit``
        # helper, which is provided by ``site``/IPython and is not
        # guaranteed to stop execution inside a notebook kernel.
        raise SystemExit(1)
    dictionary_tag, language_name = supported[language]
    d = enchant.Dict(dictionary_tag)

    # The log file and the level-independent filters do not change between
    # levels, so load and filter once instead of once per level.
    logs = pd.read_csv("../../Data/2-logs-plain0605.csv", names=col_names)
    logs = logs.iloc[1:]  # the first row is dropped (presumably the file's own header)
    logs = logs.loc[(logs['server_error'] == "None") | (logs['server_error'] == "-")]
    # NOTE(review): if these columns are read as strings ("True"/"False"),
    # comparing against the boolean True filters nothing - confirm dtype.
    logs = logs.loc[logs['demo'] != True]
    logs = logs.loc[logs['start'] != True]
    # Explicit copy so the column assignment below does not write to a slice.
    logs = logs.loc[logs['lang'] == language].copy()
    logs['level'] = pd.to_numeric(logs['level'], errors='coerce')

    with open("bad-words-" + language + ".txt") as f:
        banned = f.read().splitlines()  #https://www.cs.cmu.edu/~biglou/resources/

    for level in range(1, 11):
        df = logs.loc[logs['level'] == level]
        df = df.drop_duplicates(subset=['level', 'code'], keep='first')
        df = df[["code", "level"]].copy()
        code = df['code'].values.astype('U').tolist()
        processedCode, offensive_counter, personal_counter = process_code(code, level, banned, d)

        print("We've deleted " + str(offensive_counter) + " correct but offensive programs")
        print("We've also deleted " + str(personal_counter) + " programs containing non-" + language_name + " words")

        df['processed_code'] = processedCode
        # "-" is the marker process_code uses for rejected/empty programs.
        df = df.loc[df['processed_code'] != "-"]
        current_amount = df.shape[0]
        df = df.drop_duplicates(subset=['level', 'processed_code'], keep='first')
        after_duplicate_drop_amount = df.shape[0]
        print("We've also deleted " + str(current_amount - after_duplicate_drop_amount) + " duplicate programs")
        df = df.drop(['level'], axis=1)
        # NOTE: writing the result is currently disabled, so the message
        # below reports a file that is not actually written.
        #df.to_csv('level' + str(level) + '.csv', encoding='utf-8', index=False)

        print("We've saved a total of " + str(df.shape[0]) + " unique programs to: level" + str(level) + ".csv")

# Run the clean-up for the English ("en") logs; prints one report per level.
get_files("en")

HBox(children=(FloatProgress(value=0.0, max=22598.0), HTML(value='')))


We've deleted 1881 correct but offensive programs
We've also deleted 6113 programs containing non-english words
We've also deleted 10873 duplicate programs
We've saved a total of 3731 unique programs to: level1.csv


HBox(children=(FloatProgress(value=0.0, max=51909.0), HTML(value='')))


We've deleted 2315 correct but offensive programs
We've also deleted 11442 programs containing non-english words
We've also deleted 26360 duplicate programs
We've saved a total of 11792 unique programs to: level2.csv


HBox(children=(FloatProgress(value=0.0, max=16226.0), HTML(value='')))


We've deleted 955 correct but offensive programs
We've also deleted 5888 programs containing non-english words
We've also deleted 6213 duplicate programs
We've saved a total of 3170 unique programs to: level3.csv


HBox(children=(FloatProgress(value=0.0, max=11721.0), HTML(value='')))


We've deleted 882 correct but offensive programs
We've also deleted 7077 programs containing non-english words
We've also deleted 1910 duplicate programs
We've saved a total of 1852 unique programs to: level4.csv


HBox(children=(FloatProgress(value=0.0, max=9805.0), HTML(value='')))


We've deleted 947 correct but offensive programs
We've also deleted 5312 programs containing non-english words
We've also deleted 1819 duplicate programs
We've saved a total of 1727 unique programs to: level5.csv


HBox(children=(FloatProgress(value=0.0, max=7957.0), HTML(value='')))


We've deleted 202 correct but offensive programs
We've also deleted 4792 programs containing non-english words
We've also deleted 1526 duplicate programs
We've saved a total of 1437 unique programs to: level6.csv


HBox(children=(FloatProgress(value=0.0, max=39902.0), HTML(value='')))


We've deleted 618 correct but offensive programs
We've also deleted 28906 programs containing non-english words
We've also deleted 6126 duplicate programs
We've saved a total of 4252 unique programs to: level7.csv


HBox(children=(FloatProgress(value=0.0, max=920.0), HTML(value='')))


We've deleted 10 correct but offensive programs
We've also deleted 565 programs containing non-english words
We've also deleted 138 duplicate programs
We've saved a total of 207 unique programs to: level8.csv


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))


We've deleted 23 correct but offensive programs
We've also deleted 234 programs containing non-english words
We've also deleted 91 duplicate programs
We've saved a total of 152 unique programs to: level9.csv


HBox(children=(FloatProgress(value=0.0, max=463.0), HTML(value='')))


We've deleted 6 correct but offensive programs
We've also deleted 256 programs containing non-english words
We've also deleted 71 duplicate programs
We've saved a total of 130 unique programs to: level10.csv
