# Environment

In [1]:
import csv
import psycopg2
from psycopg2 import Error

import sys
import subprocess

from sklearn.model_selection import train_test_split
import pandas as pd
import ast
import os

# Helper Function

In [15]:
def create_folders(folder_path='csv files', additional_folders=None):
    """
    This function will check if a folder exists, if it does not it will create one with the path from the input, if a
    tuple of additional folders is defined, they are also created inside the main folder.

    :param folder_path: Path to main folder to be created
    :param additional_folders: Tuple of strings with the name of the sub-folders to be created, if none are defined none
    are created
    """
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        if additional_folders is not None:
            for additional_folder in additional_folders:
                additional_folder_path = os.path.join(folder_path, additional_folder)
                os.mkdir(additional_folder_path)
        print("Folder %s created!" % folder_path)
    else:
        print("Folder %s already exists" % folder_path)
        if additional_folders is not None:
            for additional_folder in additional_folders:
                additional_folder_path = os.path.join(folder_path, additional_folder)
                os.mkdir(additional_folder_path)


# Classify cases based on bug and smells flags

In [3]:
# check whether directory already exists and if it does not, create it
create_folders('csv files',('tokenizer data', 'tokenized'))

Folder csv files already exists


FileExistsError: [Errno 17] File exists: 'csv files/tokenizer data'

In [16]:
# Connect to an existing database
connection = psycopg2.connect(host='127.0.0.1',
                              user='transfer_learning',
                              password='transfer_learning',
                              dbname='transfer_learning')
# Create a cursor to perform database operations
cursor = connection.cursor()

try:
    # Check if the table exists
    cursor.execute("SELECT 1 FROM public.class LIMIT 1;")
    print("Table 'class' exists and is accessible.")

    # Fetch all cases that are a bug fix
    postgreSQL_select_Query = 'SELECT * FROM public.class WHERE bug_fix = %s'
    cursor.execute(postgreSQL_select_Query, ('true',))
    cases = cursor.fetchall()

    with open('csv files/tokenizer data/harmful_tokenizer_data.csv', 'w', encoding="utf-8", newline='') as csvfile:
        fieldnames = ['id', 'language', 'text', 'smells', 'project', 'metrics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        counter = 0

        for case in cases:
            db_id = case[0]
            db_language = case[2]
            db_content = case[8]
            db_smells = case[13]
            db_project = case[3]
            db_metrics = case[9]
            # If there are any invalid values in that row skip it
            if db_id < 0 or db_language == '' or db_content == '' or db_smells == {}:
                continue
            # If there are no smells in row, skip it
            if not any(db_smells.values()):
                continue
            writer.writerow({'id': db_id, 'language': db_language, 'text': db_content, 'smells': db_smells, 'project': db_project, 'metrics': db_metrics})
            counter += 1
        else:
            print(' ')
            print('All harmful cases sorted, ' + str(counter) + ' total cases.')

    # Fetch all cases that are not a bug fix for each language
    postgreSQL_select_Query = 'SELECT * FROM public.class WHERE bug_fix = %s'
    cursor.execute(postgreSQL_select_Query, ('false',))
    cases = cursor.fetchall()

    with open('csv files/tokenizer data/clean_tokenizer_data.csv', 'w', encoding="utf-8", newline='') as csvfile:
        fieldnames = ['id', 'language', 'text', 'smells', 'project', 'metrics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        counter = 0

        for case in cases:
            db_id = case[0]
            db_language = case[2]
            db_content = case[8]
            db_smells = case[13]
            db_project = case[3]
            db_metrics = case[9]
            # If there are any invalid values in that row skip it
            if db_id < 0 or db_language == '' or db_content == '' or db_smells == {}:
                continue
            # If there are smells in row, skip it
            if any(db_smells.values()):
                continue
            writer.writerow({'id': db_id, 'language': db_language, 'text': db_content, 'smells': db_smells, 'project': db_project, 'metrics': db_metrics})
            counter += 1
        else:
            print(' ')
            print('All clean code cases sorted, ' + str(counter) + ' total cases.')

except (Exception, Error) as error:
    print(' ')
    print('Error while connecting to PostgreSQL', error)

finally:
    if connection:
        cursor.close()
        print(' ')
        print('PostgreSQL connection is closed')
        connection.close()


Table 'class' exists and is accessible.
 
All harmful cases sorted, 339 total cases.
 
All clean code cases sorted, 8813 total cases.
 
PostgreSQL connection is closed


# Tokenizer constants
### A tokenizer executable is available on the project repository

In [17]:
# Set the string bellow to the path to a tokenizer executable file in Linux format
TOKENIZER_BIN = r"./tokenizer"

# File names to run on tokenizer
FILE_NAMES = ['harmful', 'clean']

# Tokenizer Helper Functions

In [18]:
def get_tokens(language, file):
    tokens = ''
    cmd = [TOKENIZER_BIN, '-l', language, file]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)

    try:
        output, error = process.communicate()
        tokens = output.decode('utf8')
        tokens = tokens.replace('\t', ' ')
        tokens = tokens.replace('\n', '')
    except Exception as e:
        print('Unexpected Error on Get Tokens', e)

    return tokens


def create_tmp_file(code_text):
    try:
        with open('experiment.tmp', 'w+') as file:
            file.write(code_text)
            return 'experiment.tmp'
    except Exception as e:
        print('Unexpected Error on Create Tmp File', e)
        return None

# Run Tokenizer to get text tokens for each file

In [19]:
# Increase the CSV field size limit
csv.field_size_limit(sys.maxsize)


for file in FILE_NAMES:
    tokenizer_file_path = 'csv files/tokenizer data/' + file + '_tokenizer_data.csv'
    # Open csv with data to create tokens and save them all in a new file
    with open(tokenizer_file_path, encoding="utf-8", newline='') as csvfile1:
        reader = csv.DictReader(csvfile1)

        tokenized_file_path = 'csv files/tokenized/' + file + '_tokenized.csv'
        with open(tokenized_file_path, 'w', encoding="utf-8", newline='') as csvfile2:
            fieldnames = ['id', 'language', 'text', 'smells', 'tokens', 'metrics']
            writer = csv.DictWriter(csvfile2, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                csv_id = row['id']
                csv_language = row['language']
                csv_text = row['text']
                csv_smells = row['smells']
                csv_metrics = row['metrics']
                temp_file = create_tmp_file(csv_text)
                result_tokens = get_tokens(csv_language, temp_file)
                writer.writerow({'id': csv_id, 'language': csv_language, 'text': csv_text, 'smells': csv_smells,
                                 'tokens': result_tokens, 'metrics': csv_metrics})

# Create Train and Test files for each case

In [21]:
# check whether directory already exists and if it does not, create it
harmful_clean_path = os.path.join('csv files', 'harmful-clean')
create_folders(harmful_clean_path, ('all', 'train', 'test'))

# TODO: Get max value from each case to set padding_tokens value in perceptron
# Get data from csv file
print(' ')
print('Largest token size:')
print(' ')

print('harmful code:')
df1 = pd.read_csv('csv files/tokenized/harmful_tokenized.csv')
split_tokens = df1.tokens.str.split(' ')
print(split_tokens.str.len().max())

print('clean code:')
df2 = pd.read_csv('csv files/tokenized/clean_tokenized.csv')
split_tokens = df2.tokens.str.split(' ')
print(split_tokens.str.len().max())

# Check how many languages there are
languages = df1['language'].unique()

# Separate in individual Data Frames for each language
for language in languages:
    lang_df1 = df1.loc[df1['language'] == language]
    lang_df2 = df2.loc[df2['language'] == language]

    print(' ')
    print('[' + language + ']')
    print('harmful code cases:')
    print(len(lang_df1))
    print('clean code cases:')
    print(len(lang_df2))

    file_name1 = language + '_' + 'Harmful' + '.csv'

    # Open file inside new directory
    with open((harmful_clean_path + '/all/' + file_name1), 'w', encoding="utf-8", newline='') as csvfile:
        fieldnames = ['id', 'language', 'text', 'smell', 'tokens', 'metrics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for index, row in lang_df1.iterrows():
            csv_id1 = row['id']
            text1 = row['text']
            tokens1 = row['tokens']
            smells1 = ast.literal_eval(row['smells'])
            metrics1 = row['metrics']

            if not any(smells1.values()):
                print('Error: row in data for harmful code without at least one smell')
                break
            smell_val1 = 1

            writer.writerow(
                {'id': csv_id1, 'language': language, 'text': text1, 'smell': smell_val1, 'tokens': tokens1, 'metrics': metrics1})

    file_name2 = language + '_' + 'Clean' + '.csv'

    # Open file inside new directory
    with open((harmful_clean_path + '/all/' + file_name2), 'w', encoding="utf-8", newline='') as csvfile:
        fieldnames = ['id', 'language', 'text', 'smell', 'tokens', 'metrics']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for index, row in lang_df2.iterrows():
            csv_id2 = row['id']
            text2 = row['text']
            tokens2 = row['tokens']
            smells2 = ast.literal_eval(row['smells'])
            metrics2 = row['metrics']

            if any(smells2.values()):
                print('Error: row in data for clean code with at least one smell')
                break
            smell_val2 = 0

            writer.writerow(
                {'id': csv_id2, 'language': language, 'text': text2, 'smell': smell_val2, 'tokens': tokens2, 'metrics': metrics2})

for language in languages:
    harmful_name = language + '_' + 'Harmful'
    open_path = harmful_clean_path + '/all/' + harmful_name + '.csv'
    harmful_df = pd.read_csv(open_path)

    clean_name = language + '_' + 'Clean'
    open_path = harmful_clean_path + '/all/' + clean_name + '.csv'
    clean_df = pd.read_csv(open_path)

    # Check witch case is smaller and use its length
    if len(harmful_df) < len(clean_df):
        clean_df_harmful_vs_clean = clean_df[clean_df.index < len(harmful_df)]
        harmful_df_harmful_vs_clean = harmful_df
    else:
        harmful_df_harmful_vs_clean = harmful_df[harmful_df.index < len(clean_df)]
        clean_df_harmful_vs_clean = clean_df

    train1_harmful_clean, test1_harmful_clean = train_test_split(clean_df_harmful_vs_clean, test_size=0.2)
    train2_harmful_clean, test2_harmful_clean = train_test_split(harmful_df_harmful_vs_clean, test_size=0.2)

    concat_train_harmful_clean = pd.concat([train1_harmful_clean, train2_harmful_clean])
    concat_test_harmful_clean = pd.concat([test1_harmful_clean, test2_harmful_clean])

    file_name = language + '_' + 'HarmfulVsClean'
    header = ['id', 'language', 'text', 'smell', 'tokens', 'metrics']
    train_path = harmful_clean_path + '/train/' + file_name + '_Train_1.csv'
    concat_train_harmful_clean.to_csv(train_path, header=header, encoding='utf-8', index=False)
    test_path = harmful_clean_path + '/test/' + file_name + '_Test_1.csv'
    concat_test_harmful_clean.to_csv(test_path, header=header, encoding='utf-8', index=False)


Folder csv files/harmful-clean created!
 
Largest token size:
 
harmful code:
52224
clean code:
7360
 
[Java]
harmful code cases:
106
clean code cases:
2617
 
[C#]
harmful code cases:
36
clean code cases:
91
 
[C++]
harmful code cases:
72
clean code cases:
1308
 
[Python]
harmful code cases:
125
clean code cases:
4797
