In [None]:
#Process all the files in the sampled dataset
import os
import pandas as pd
import requests

# Hugging Face Inference API endpoint used for zero-shot classification
api_url = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the source. Any token that was previously committed
# in this file should be treated as leaked and revoked.
hf_key = os.environ.get("HF_TOKEN", "")

def handle_long_path(file_path):
    r"""Return a form of *file_path* that Windows can open even when it is long.

    On Windows, absolute paths of 260 characters or more are rewritten with
    the extended-length prefix (``\\?\`` for drive paths, ``\\?\UNC\`` for
    network shares). In every other case -- non-Windows systems, short paths,
    or paths that already carry the prefix -- *file_path* is returned
    unchanged.
    """
    abs_file_path = os.path.abspath(file_path)
    if os.name != 'nt' or len(abs_file_path) < 260:
        return file_path

    extended_prefix = "\\\\?\\"
    if abs_file_path.startswith(extended_prefix):
        # Already in extended-length form; prefixing again would corrupt it.
        return file_path
    if abs_file_path.startswith("\\\\"):
        # UNC share (\\server\share\...): the extended form replaces the two
        # leading backslashes with \\?\UNC\.
        return f"{extended_prefix}UNC\\{abs_file_path[2:]}"
    return f"{extended_prefix}{abs_file_path}"

def process_file(file_path):
    """Read a text file, trying UTF-8 first and latin-1 as a fallback.

    Args:
        file_path: Path of the file to read; it is passed through
            ``handle_long_path`` before being opened.

    Returns:
        A ``(content, status)`` tuple: ``(text, 1)`` on success, or
        ``(None, -1)`` when the file does not exist or cannot be read.
        Failures are reported with ``print`` rather than raised.
    """
    file_path = handle_long_path(file_path)  # Handle long paths

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None, -1

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read(), 1
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the latin-1 attempt below.
        pass
    except OSError as e:
        # E.g. the path is a directory or is not readable; report it using the
        # same failure convention instead of aborting the caller.
        print(f"Error reading file {file_path}: {e}")
        return None, -1

    try:
        # latin-1 maps every byte to a character, so decoding cannot fail;
        # only I/O errors are possible here.
        with open(file_path, 'r', encoding='latin-1') as f:
            file_content = f.read()
    except OSError as e:
        print(f"Error reading file {file_path}: {e}")
        return None, -1

    print(f"Processed with latin-1 encoding: {file_path}")
    return file_content, 1

def run_zero_shot_classification(file_content, timeout=60):
    """Classify *file_content* with the Hugging Face zero-shot endpoint.

    Args:
        file_content: Text sent as the ``inputs`` field of the request.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the whole run.

    Returns:
        A ``(predicted_label, error_message)`` tuple. On success the label is
        the first entry of the response's ``labels`` list and the error is
        ``None``; on any failure the label is ``None`` and the error is a
        descriptive string.
    """
    headers = {
        "Authorization": f"Bearer {hf_key}"
    }

    payload = {
        "inputs": file_content,
        "parameters": {
            "candidate_labels": ["1", "0"],  # Labels for classification
            "multi_label": False
        }
    }

    try:
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=timeout
        )
    except requests.RequestException as e:
        # Connection errors and timeouts are reported like HTTP errors.
        return None, f"Error: request failed, {e}"

    if response.status_code != 200:
        return None, f"Error: {response.status_code}, {response.text}"

    try:
        result = response.json()
        # The first label is taken as the prediction (the highest-scoring one,
        # assuming the API returns labels sorted by score).
        return result['labels'][0], None
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # The body was not JSON or did not have the expected shape.
        return None, f"Error: unexpected response format, {e!r}"



# Paths used by the experiment
src_dir = 'data/src_files-sampled'
input_dir = 'data/input'
output_dir = 'data/output'
batch_dir = 'data/output/batches'
input_csv = os.path.join(input_dir, 'sampled_files_codelama_experiment.csv')
combined_output_csv = os.path.join(output_dir, 'samples_codelama_api_output.csv')

# Load the sampled dataset CSV
sampled_files = pd.read_csv(input_csv)

# Rows are processed and saved in fixed-size batches so that partial results
# are on disk if the run is interrupted.
batch_size = 25  # Number of rows per batch
batch_number = 0  # Index of the next batch file to write

# Get total number of rows
total_rows = sampled_files.shape[0]
print(f"Total number of files: {total_rows}")

# The batch directory is the same for every batch, so create it once.
os.makedirs(batch_dir, exist_ok=True)

# Loop through the dataset in batches
for start_idx in range(0, total_rows, batch_size):
    end_idx = min(start_idx + batch_size, total_rows)
    batch = sampled_files.iloc[start_idx:end_idx].copy()

    # One prediction and one error message per row of the current batch
    predictions = []
    error_messages = []

    for _, row in batch.iterrows():
        project_name = row['Project_name']
        github_path = row['github_path']

        # A missing cell is read as NaN, which has no string methods; record
        # the problem for this row instead of aborting the whole run.
        if pd.isna(project_name) or pd.isna(github_path):
            predictions.append(None)
            error_messages.append("Missing Project_name or github_path")
            continue

        # Strip leading slashes so the path is joined below src_dir rather
        # than being treated as absolute.
        relative_path = str(github_path).lstrip('/')
        file_path = os.path.normpath(
            os.path.join(src_dir, str(project_name), relative_path)
        )

        # Read the file with encoding handling
        file_content, status = process_file(file_path)
        if status == -1:
            predictions.append(None)
            error_messages.append(f"File processing failed for {file_path}")
            continue

        # Run Hugging Face API inference on the file content
        try:
            prediction, error_message = run_zero_shot_classification(file_content)
        except Exception as e:
            print(f"Error during Hugging Face API inference for file {file_path}: {e}")
            prediction, error_message = None, str(e)
        predictions.append(prediction)
        error_messages.append(error_message)  # None when there was no error

    # Add the predictions and error messages to the batch DataFrame
    batch['api_prediction'] = predictions
    batch['api_error_message'] = error_messages

    # Save the current batch to its own CSV file inside the batch directory
    batch_file_name = os.path.join(batch_dir, f'batch_{batch_number}.csv')
    batch.to_csv(batch_file_name, index=False)
    print(f"Batch {batch_number} saved as {batch_file_name}")

    # Increment batch number for the next iteration
    batch_number += 1

# Paths of every batch file written above
csv_files = [os.path.join(batch_dir, f'batch_{i}.csv') for i in range(batch_number)]

# Combine all batch CSVs into one DataFrame. pd.concat raises on an empty
# list, so an empty input produces an empty result with the expected columns.
if csv_files:
    combined_df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
else:
    combined_df = sampled_files.assign(api_prediction=None, api_error_message=None)

# Save the combined result into a final CSV file
os.makedirs(os.path.dirname(combined_output_csv), exist_ok=True)
combined_df.to_csv(combined_output_csv, index=False)
print(f"All batches combined and saved as {combined_output_csv}")


In [None]:
# Delete the intermediate batch files listed in csv_files. A file that is
# already gone is reported and skipped, so this cell can be re-run safely.
for f in csv_files:
    try:
        os.remove(f)
    except FileNotFoundError:
        print(f"Already removed, skipping {f}")
    else:
        print(f"Deleted {f}")