In [2]:
# Using Hugging Face transformers for performance bug prediction with codellama/CodeLlama-7b-hf

import os
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, CodeLlamaTokenizer
import torch

In [None]:
# Set up Hugging Face authentication.
# SECURITY: a personal access token used to be hardcoded in this cell. A token
# that has been saved inside a notebook must be treated as leaked and should be
# revoked in the Hugging Face account settings. The token is now read from the
# HF_TOKEN environment variable, with a hidden interactive prompt as fallback,
# so it never appears in the notebook source.
print("Setting up Hugging Face authentication...")
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass  # local import: only needed when prompting

    # Later cells read os.environ["HF_TOKEN"], so the prompted value is stored there.
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token (input hidden): ")

# Hugging Face checkpoint, used later for 1- the tokenizer, 2- model initialization
model_name = "codellama/CodeLlama-7b-hf"

# Paths used (all relative to the notebook's working directory)
input_dir = 'data/input'
src_files_dir = 'data/src_files-sampled'
output_dir = 'data/output'
sampled_input_csv = os.path.join(input_dir, 'sampled_files_codelama_experiment.csv')
results_csv = os.path.join(output_dir, 'codellama_classification_results.csv')

In [None]:
from transformers import AutoConfig

# Load the model configuration for the checkpoint selected in the setup cell.
# `model_name` is intentionally not re-assigned here, so the checkpoint has a
# single source of truth; the access token is passed the same way as in the
# other `from_pretrained` calls of this notebook.
config = AutoConfig.from_pretrained(model_name, token=os.environ["HF_TOKEN"])

# Print the model's maximum token limit
print(f"Max token length for {model_name}: {config.max_position_embeddings}")


In [None]:
# Load the CSV file that lists the sampled source files
print("Loading CSV file...")
sampled_files = pd.read_csv(sampled_input_csv)

# Fail fast when a column read by the dataset-preparation loop is missing,
# instead of hitting a KeyError part-way through that loop.
required_columns = {'Project_name', 'github_path', 'label'}
missing_columns = required_columns.difference(sampled_files.columns)
if missing_columns:
    raise ValueError(
        f"{sampled_input_csv} is missing required column(s): {sorted(missing_columns)}"
    )
print("CSV file loaded successfully.")

In [6]:
# Function to open files safely, handling long paths and normalization
def open_file(project_name, github_path, base_dir=None):
    r"""Read one sampled source file and return its text.

    Args:
        project_name: Name of the project folder below the source root.
        github_path: Path of the file inside the project. Any leading ``/`` or
            ``\`` characters are ignored so the path is always treated as
            relative to the project folder.
        base_dir: Root directory that contains the project folders. When
            omitted, the notebook-level ``src_files_dir`` is used.

    Returns:
        The file content decoded as UTF-8, or ``None`` when the file does not
        exist or cannot be read/decoded. A message is printed in both failure
        cases.
    """
    root_dir = src_files_dir if base_dir is None else base_dir

    # Strip every leading separator in a single pass. Two chained lstrip calls
    # leave a separator behind for mixed prefixes such as "\/x", and a leading
    # "/" would make os.path.join discard the root directory entirely.
    github_path_clean = github_path.lstrip('/\\')

    # Construct the full, normalized, absolute file path
    file_path = os.path.normpath(os.path.join(root_dir, project_name, github_path_clean))
    abs_file_path = os.path.abspath(file_path)

    # Apply the \\?\ prefix for long paths on Windows
    if os.name == 'nt' and len(abs_file_path) >= 260:
        abs_file_path = f"\\\\?\\{abs_file_path}"

    # Guard clause: nothing to read
    if not os.path.exists(abs_file_path):
        print(f"File not found: {abs_file_path}")
        return None

    try:
        with open(abs_file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError) as e:
        # OSError: permissions, directories, I/O failures.
        # ValueError: includes UnicodeDecodeError for files that are not UTF-8.
        print(f"Error reading file {abs_file_path}: {e}")
        return None

In [None]:
# Prepare the Dataset
print("Preparing the dataset...")

# Column-wise accumulators: entry i of every list describes the same file
codes, labels, project_names, github_paths = [], [], [], []

# Walk the CSV row by row and keep only the files that could be read
for row_idx, csv_row in sampled_files.iterrows():
    project_name = csv_row['Project_name']
    github_path = csv_row['github_path']
    label = csv_row['label']  # Assuming label is 0 or 1

    # open_file returns None (after printing the reason) when the file is unusable
    java_code = open_file(project_name, github_path)
    if java_code is not None:
        codes.append(java_code)
        labels.append(label)
        project_names.append(project_name)
        github_paths.append(github_path)
    else:
        print(f"Failed to read the Java file at {github_path}. Skipping.")

# Only the code text and its label go into the Hugging Face dataset; the
# project/path lists are kept separately.
data = {'code': codes, 'label': labels}
dataset = Dataset.from_dict(data)
print(f"Dataset prepared successfully with {len(codes)} code snippets.")

In [None]:
# Tokenize the dataset without truncation to find the longest file and how many
# files are longer than 16,384 tokens
print("Checking the maximum number of tokens in the dataset...")

# Initialize the tokenizer
tokenizer = CodeLlamaTokenizer.from_pretrained(model_name, token=os.environ["HF_TOKEN"])

# Add a padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def get_token_lengths(examples):
    """Return the untruncated token count of every code string in a batch.

    Args:
        examples: Batch mapping supplied by ``dataset.map(..., batched=True)``;
            only its ``'code'`` entry is read.

    Returns:
        dict: ``{"token_length": [...]}`` with one length per code string.
    """
    tokenized = tokenizer(examples['code'], truncation=False)
    return {"token_length": [len(ids) for ids in tokenized["input_ids"]]}

# Apply the function to the dataset
token_lengths = dataset.map(get_token_lengths, batched=True)

# Extract the column once and reuse it for both statistics. The empty case
# (every file was skipped) is guarded: max() of an empty sequence raises
# ValueError, and the 'token_length' column may not exist on an empty dataset.
token_length_values = list(token_lengths["token_length"]) if len(token_lengths) > 0 else []

# Find the maximum token length (0 when there is nothing to measure)
max_tokens = max(token_length_values, default=0)
print(f"The maximum number of tokens in the dataset is: {max_tokens}")

# Count how many examples exceed the token limit of 16,384
token_limit = 16384
exceeding_tokens_count = sum(1 for length in token_length_values if length > token_limit)
print(f"Number of code snippets exceeding {token_limit} tokens: {exceeding_tokens_count}")


In [None]:
# Tokenize the Dataset
print("Tokenizing the dataset...")
tokenizer = CodeLlamaTokenizer.from_pretrained(
    model_name,
    token=os.environ["HF_TOKEN"],
)

# Fall back to the end-of-sequence token when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of code strings to a fixed length of 512 tokens.

    Longer inputs are truncated and shorter ones are padded up to 512.
    """
    code_batch = examples['code']
    encoded = tokenizer(code_batch, max_length=512, truncation=True, padding="max_length")
    return encoded

tokenized_datasets = dataset.map(tokenize_function, batched=True)
print("Tokenization completed.")

In [None]:
# Step 6: Initialize the Model
# NOTE(review): this checkpoint is published as a causal language model, so the
# 2-class classification head added here is presumably newly (randomly)
# initialized - transformers normally prints a warning about this on load.
# Confirm before interpreting the "zero-shot" predictions as meaningful.
print("Initializing the model...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    # `use_auth_token` is deprecated; `token` matches the tokenizer calls in this notebook
    token=os.environ["HF_TOKEN"],
)
print("Model initialized successfully.")


In [None]:
# Step 7: Zero-Shot Classification
print("Performing zero-shot classification...")

def zero_shot_classification(code_snippet):
    """Classify one code snippet with the loaded model.

    Args:
        code_snippet: Source code text; it is truncated to at most 512 tokens.

    Returns:
        int: Index of the largest logit for the snippet.
    """
    inputs = tokenizer(code_snippet, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Place the input tensors on the model's device so the forward pass also
    # works when the model has been moved off the CPU.
    inputs = inputs.to(model.device)
    # Inference only: without no_grad() every call records an autograd graph,
    # which wastes memory on a model of this size.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.item()

print("Performing zero-shot classification on all code snippets...")
predictions = []
total_snippets = len(codes)
# enumerate() yields the real position of each snippet. Looking the position up
# with codes.index() rescans the list on every iteration and reports the first
# match, which is wrong when two files have identical content.
for snippet_number, code_snippet in enumerate(codes, start=1):
    # Add some interactivity to show progress
    print(f"Processing code snippet {snippet_number} of {total_snippets}")
    prediction = zero_shot_classification(code_snippet)
    predictions.append(prediction)
print("Zero-shot classification completed.")

In [None]:
# Step 8: Save Results (optional)
print("Saving results...")
predictions_df = pd.DataFrame({
    'project_name': project_names,
    'github_path': github_paths,
    'label': labels,
    'prediction': predictions,
})

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
results_dir = os.path.dirname(results_csv)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

predictions_df.to_csv(results_csv, index=False)
print(predictions_df.shape)
print("Results saved to codellama_classification_results.csv")


In [None]:
# plot the confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Pin the class order so the matrix is always 2x2 (classes 0 and 1), even when
# one class never appears among the labels or the predictions; otherwise the
# matrix shrinks and its cells can no longer be read by position.
cm = confusion_matrix(predictions_df['label'], predictions_df['prediction'], labels=[0, 1])

# Explicit figure/axes instead of the implicit pyplot state
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()
print("Confusion matrix generated successfully.")


In [None]:
# Compute all the metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Extract the two columns once; every metric compares the same pair
y_true = predictions_df['label']
y_pred = predictions_df['prediction']

accuracy = accuracy_score(y_true, y_pred)
# zero_division=0 makes the undefined case explicit (e.g. the model never
# predicts class 1): the score is reported as 0 without an
# UndefinedMetricWarning.
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("All metrics computed successfully.")