In [1]:
import csv, os
from gensim.models import Word2Vec
from gensim.models import FastText

import re
import jsonlines


In [2]:
# Random seed, presumably intended to make the run reproducible.
# NOTE(review): no cell visible in this notebook reads `seed` -- confirm whether
# it is meant to be passed to the gensim models.
seed = 123

In [3]:
# Hyperparameters for the embedding training run (passed to embVectors).

# size of each embedding vector (forwarded to gensim as vector_size)
dim = 200
# embedding algorithm: "w2v" trains Word2Vec, "ft" trains FastText
method = "w2v" # ft
# forwarded to gensim as min_count
min_count = 10
# number of training epochs
epochs = 25

In [4]:
# Read the embeddings training corpus: the 'code' field of every record in each
# .jsonl file found in <root>/data/python_train.
root_path = os.path.join('..', '..', '..')

corpus = []
directory_path = os.path.join(root_path, 'data', 'python_train')
# os.listdir() returns entries in arbitrary order; sort them so the corpus is
# assembled in the same order on every run and on every machine.
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith('.jsonl'):
        file_path = os.path.join(directory_path, filename)
        with jsonlines.open(file_path) as reader:
            for obj in reader:
                corpus.append(obj['code'])

In [5]:
# Sanity check: number of code snippets loaded into the corpus.
print(len(corpus))

412178


In [6]:
def dropEmpty(tokens0):
    """Return a new list with every element of ``tokens0`` that is not an empty list.

    The relative order of the kept elements is preserved and ``tokens0`` itself
    is left unmodified.
    """
    return [entry for entry in tokens0 if entry != []]

def listToString(s):
    """Concatenate the string elements of ``s``, in order, into one string.

    An empty input yields the empty string.
    """
    return "".join(s)

def stringToList(string):
    """Split ``string`` at line boundaries and return the lines as a list.

    Line terminators are not included in the returned items.
    """
    return list(string.splitlines())

def remove_comments(input_str):
    """Strip comments and docstrings from Python source text with regular expressions.

    Four substitutions are applied one after another, in this order:

    1. everything from a ``#`` up to the end of its line is removed
       (this also hits a ``#`` that sits inside a string literal);
    2. a triple-double-quoted span that opens and closes on the same line is removed;
    3. a triple-quoted block that is followed, after optional whitespace, by the
       text ``class`` is removed;
    4. a triple-quoted block starting on the line right after a ``def ...:``
       header is removed while the header itself is kept.

    Returns the cleaned text.
    """
    doc_reg_1 = r'("""|\'\'\')([\s\S]*?)(\1\s*)(?=class)'
    doc_reg_2 = r'(\s+def\s+.*:\s*)\n(\s*"""|\s*\'\'\')([\s\S]*?)(\2[^\n\S]*)'

    # (pattern, replacement) pairs; the order matters because every step works
    # on the output of the previous one.
    substitutions = (
        ('#.*', ''),
        ('""".*"""', ''),
        (doc_reg_1, ''),
        (doc_reg_2, r'\1'),
    )

    cleaned = input_str
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
    
def dropHeaders(lines):
    """Return the entries of ``lines`` that are not import statements.

    A line is treated as an import header when, after optional leading
    whitespace, it starts with ``import <name>`` or ``from <module> import``.
    Earlier versions dropped every line that merely contained the substring
    ``import`` or ``from`` (e.g. ``value = important_thing`` or
    ``def from_json(data):``), which discarded ordinary code.

    Parameters:
        lines: iterable of single source lines (strings).

    Returns:
        A new list with the non-import lines, in their original order.
    """
    # Anchored at the start of the line so identifiers that only contain the
    # words "import" / "from" are kept.
    header_re = re.compile(r'^\s*(?:import\s+\S|from\s+\S+\s+import\b)')
    return [line for line in lines if not header_re.match(line)]

def dropBlank(tokens0):
    """Return a new list with every element of ``tokens0`` that is not the empty string.

    Order is preserved; the input is not modified.
    """
    return [token for token in tokens0 if token != '']

def tokenizeLines(codeLinesList):
    """Split every line into tokens at punctuation/whitespace delimiters.

    Each single delimiter character is a split point, so two adjacent
    delimiters produce an empty-string token; callers that do not want those
    must filter them out afterwards.

    Parameters:
        codeLinesList: iterable of source lines (strings).

    Returns:
        One flat list with the tokens of all lines, in order.
    """
    # Written as a raw string: the previous non-raw literal relied on invalid
    # escape sequences (\., \[, \], \s), which newer Python versions report
    # with a warning. The set of delimiter characters is unchanged.
    delimiters = re.compile(r'''[\.,\[\];:(\s)?\\!\t{}"<>+=~*&^%/|\-']''')

    codeTokens = []
    for line in codeLinesList:
        codeTokens.extend(delimiters.split(line))
    return codeTokens

def dataTokenization(corpus, filename, comments, strings, numericals, headers):
    """Turn every source snippet in ``corpus`` into a list of lower-cased tokens.

    Each of the four switches enables its clean-up step when it equals 0; any
    other value skips that step:

    * ``numericals`` -- replace numbers with the placeholder ``<numId$>``
    * ``strings``    -- replace double-quoted, then single-quoted, literals
      with the placeholder ``<strId$>``
    * ``comments``   -- strip comments/docstrings via ``remove_comments``
    * ``headers``    -- drop header lines via ``dropHeaders``

    ``filename`` is accepted but not used by the current implementation.

    Returns a list with one token list per snippet, in corpus order.
    """
    number_pattern = r"$\d+\W+|\b\d+\b|\W+\d+$"
    double_quoted_pattern = r'(["])(?:(?=(\\?))\2.)*?\1'
    single_quoted_pattern = r"(['])(?:(?=(\\?))\2.)*?\1"

    allTokens = []
    for snippet in corpus:
        text = snippet

        # placeholders are inserted before comment removal and line splitting
        if numericals == 0:
            text = re.sub(number_pattern, "<numId$>", text)

        if strings == 0:
            text = re.sub(double_quoted_pattern, "<strId$>", text)
            text = re.sub(single_quoted_pattern, "<strId$>", text)

        if comments == 0:
            text = remove_comments(text)

        # from here on the snippet is handled line by line
        lines = stringToList(text)
        if headers == 0:
            lines = dropHeaders(lines)

        # split into words, discard empty tokens, normalise case
        words = dropBlank(tokenizeLines(lines))
        allTokens.append([word.lower() for word in words])

    return allTokens

In [7]:
def embVectors(dim, epochs, min_count, method, corpus, comments, strings, numericals, headers, seed=None):
    """Train word embeddings on ``corpus`` and save them in word2vec text format.

    Parameters:
        dim: embedding size, forwarded to gensim as ``vector_size``.
        epochs: number of training epochs.
        min_count: forwarded to gensim as ``min_count``.
        method: ``"w2v"`` (Word2Vec) or ``"ft"`` (FastText).
        corpus: list of source-code strings.
        comments, strings, numericals, headers: preprocessing switches handed
            unchanged to ``dataTokenization``.
        seed: optional random seed. When given it is forwarded to the gensim
            model as ``seed``; when ``None`` (the default) no seed argument is
            passed, exactly as before this parameter existed.
            NOTE(review): gensim's documentation states that a fully
            reproducible run also needs a single worker thread; Word2Vec is
            still trained with ``workers=4`` here.

    Returns:
        The name of the written embeddings file, ``<method>_embeddings.txt``,
        created in the current working directory.

    Raises:
        ValueError: if ``method`` is neither ``"w2v"`` nor ``"ft"``.
    """
    # Validate first: an unknown method used to run the whole (expensive)
    # preprocessing and then fail with UnboundLocalError on ``fileEmb``.
    if method not in ("w2v", "ft"):
        raise ValueError(
            "unknown embedding method %r; expected 'w2v' or 'ft'" % (method,)
        )

    # preprocess corpus and discard snippets that produced no tokens
    data = dataTokenization(corpus, "corpus.csv", comments, strings, numericals, headers)
    data = dropEmpty(data)

    seed_kwargs = {} if seed is None else {"seed": seed}

    if method == "w2v":
        model = Word2Vec(data, vector_size=dim, workers=4, epochs=epochs,
                         min_count=min_count, **seed_kwargs)
    else:
        model = FastText(vector_size=dim, min_count=min_count, **seed_kwargs)
        model.build_vocab(corpus_iterable=data)
        model.train(corpus_iterable=data, total_examples=len(data), epochs=epochs)

    fileEmb = method + '_embeddings.txt'
    model.wv.save_word2vec_format(fileEmb, binary=False)

    return fileEmb

In [8]:
# Train with every preprocessing step switched on (0 enables a step in
# dataTokenization) and keep the name of the written embeddings file.
fileEmb = embVectors(
    dim,
    epochs,
    min_count,
    method,
    corpus,
    comments=0,
    strings=0,
    numericals=0,
    headers=0,
)