## Selecting a repo


In [3]:
!git clone https://github.com/Byaidu/PDFMathTranslate.git

fatal: destination path 'PDFMathTranslate' already exists and is not an empty directory.


In [56]:
# Despite the name, this holds the local directory name of the cloned checkout
# (the target of the `git clone` cell), not a remote URL. It is the value later
# passed to `Repository(...)`.
repo_url = 'PDFMathTranslate'

## Bug-Fixing Commit identification

In [57]:
import pandas as pd
from pydriller import Repository

In [58]:
# naive approach of simple word matching

# Keywords whose presence anywhere in a commit message marks the commit as a
# potential bug fix. Matching is by plain substring, so short stems such as
# "freez" or "hang" also hit longer words that contain them.
bug_keywords = [
    "fixed", "bug", "fixes", "fix", "crash", "solves",
    "resolves", "issue", "regression", "fall back",
    "assertion", "coverity", "reproducible", "stack-wanted",
    "steps-wanted", "testcase", "failure", "fail", "npe",
    "except", "broken", "differential testing", "error",
    "hang", "test fix", "steps to reproduce", "leak",
    "stack trace", "heap overflow", "freez", "problem",
    "overflow", "avoid", "workaround", "break", "stop",
]


def is_bug_commit_naive(commit):
    """Return True if the lower-cased commit message contains any bug keyword.

    Args:
        commit: Object exposing the commit message as ``commit.msg``.

    Returns:
        bool: True on the first keyword found as a substring, else False.
    """
    lowered = commit.msg.lower()
    for keyword in bug_keywords:
        if keyword in lowered:
            return True
    return False

# regex based matching
import re

# Build one regex from all keywords
# Escape each keyword, join them into a single alternation, and require a word
# boundary on both sides of the match.
_keyword_alternation = "|".join(map(re.escape, bug_keywords))
pattern = re.compile(rf"\b({_keyword_alternation})\b", re.IGNORECASE)

# not used in final version
def is_bug_commit_regex(commit):
    """Return True if the compiled keyword ``pattern`` matches anywhere in ``commit.msg``."""
    match = pattern.search(commit.msg)
    return match is not None


def is_merge_commit(commit):
    """Return True when ``commit.parents`` holds more than one entry."""
    parent_count = len(commit.parents)
    return parent_count >= 2


# Walk the repository history and record every commit whose message matches a
# bug keyword, together with the concatenated diff of all files it touched.
commit_data = []

for commit in Repository(repo_url).traverse_commits():
    # Check the message first: building the diff is the expensive part, so it
    # is skipped for commits that would not be recorded anyway.
    if not is_bug_commit_naive(commit):
        continue

    modified_files = commit.modified_files
    commit_data.append({
        "hash": commit.hash,
        "message": commit.msg,
        "parents": commit.parents,
        "is_merge": commit.merge,
        # `or ""` guards against a file without a textual diff, matching the
        # `file.diff or ""` handling used in the later cells.
        "diff": "".join(file.diff or "" for file in modified_files),
        "files_modified": [mod.filename for mod in modified_files],
    })


df = pd.DataFrame(commit_data)
csv_path = "potential_bug_fix_commits.csv"
df.to_csv(csv_path, index=False)

print(f"\nSaved {len(df)} potential bug-fixing commits to {csv_path}")



Saved 335 potential bug-fixing commits to potential_bug_fix_commits.csv


## Diff extraction and analysis

In [59]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are loaded from the same checkpoint name.
_CHECKPOINT = "mamiksik/CommitPredictorT5"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)

In [60]:
def infer(file, max_input_tokens=512):
    """Generate a commit-message prediction from a file's before/after source.

    Args:
        file: Object exposing ``source_code_before`` and ``source_code``.
        max_input_tokens: Upper bound on the tokenized input length; longer
            inputs are truncated instead of being passed to the model whole.

    Returns:
        str: The decoded model output with special tokens removed.
    """
    # Either side can be missing (presumably for added or deleted files —
    # confirm against pydriller); substitute "" so the literal text "None"
    # never ends up in the model input.
    before = file.source_code_before or ""
    after = file.source_code or ""
    input_text = f"{before};{after}"

    # Tokenize (with truncation) and generate
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=max_input_tokens,
        truncation=True
    )
    outputs = model.generate(**inputs)

    # Decode the generated tokens
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pydriller import Repository
import pandas as pd
import torch
import gc

# Load model once
# Use CUDA when torch reports it as available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

_CHECKPOINT = "mamiksik/CommitPredictorT5"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
model = model.to(device)
model.eval()

# Token caps passed as `max_length` to the tokenizer and to `generate`.
MAX_INPUT_TOKENS, MAX_OUTPUT_TOKENS = 512, 512

def safe_infer(diff_text):
    """Run model inference on a single diff string.

    Args:
        diff_text: The diff text; may be None or empty.

    Returns:
        str: The decoded model output, or "" when there is no diff content
        (None, empty, or whitespace-only) to summarise.
    """
    # Whitespace-only diffs carry no content either, so they are treated like
    # empty ones (same check as the later `infer(diff_text)`).
    if not diff_text or not diff_text.strip():
        return ""

    # Tokenize & truncate to the configured input budget
    inputs = tokenizer(
        diff_text,
        return_tensors='pt',
        max_length=MAX_INPUT_TOKENS,
        truncation=True
    ).to(device)

    # No gradients are needed for generation
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=MAX_OUTPUT_TOKENS)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def collect_commit_data(repo_url, limit=1_000_000_000):
    """Collect one record per modified file of every keyword-matched commit.

    Args:
        repo_url: Repository location handed to ``Repository``.
        limit: Maximum number of commits to traverse (matched or not).
            ``None`` disables the cap. The default keeps the previous
            hard-coded bound.

    Returns:
        list[dict]: Records with keys ``hash``, ``filename``, ``diff``,
        ``llm_inference`` and an empty ``rectified_msg`` placeholder.
    """
    commit_data = []

    for index, commit in enumerate(Repository(repo_url).traverse_commits()):
        if limit is not None and index >= limit:
            break
        if not is_bug_commit_naive(commit):
            continue

        for file in commit.modified_files:
            try:
                inference = safe_infer(file.diff)

                commit_data.append({
                    "hash": commit.hash,
                    "filename": file.filename,
                    "diff": file.diff or "",
                    "llm_inference": inference,
                    "rectified_msg": ""
                })

            except Exception as e:
                # Best effort: report the file and keep going.
                print(f"Skipped file {file.filename} in commit {commit.hash}: {e}")

            # Free memory after each file
            gc.collect()
            torch.cuda.empty_cache()

    return commit_data



# Run per-file inference over the repository; each list entry is one record
# for one modified file of a keyword-matched commit.
commit_data = collect_commit_data(repo_url)

# Persist the records; `commit_data`, `df` and `csv_path` are rebound here and
# reused by other cells under the same names.
df = pd.DataFrame(commit_data)
csv_path = "llm_inference.csv"
df.to_csv(csv_path, index=False)

# The count printed is the number of rows, i.e. files, not commits.
print(f"\nSaved {len(df)} potential bug-fixing files to {csv_path}")



Saved 510 potential bug-fixing commits to llm_inference.csv


In [62]:
!pip install groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




## Rectifier Formulation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pydriller import Repository
import pandas as pd
import torch
import gc
from groq import Groq  
import os

import secret.api_key

# `import secret.api_key` binds only the package name `secret`; the bare name
# `api_key` is not defined by any import above, so using it raises NameError
# on a fresh kernel. Read the key from the imported module instead and fall
# back to the GROQ_API_KEY environment variable.
# NOTE(review): assumes the `secret.api_key` module defines a string attribute
# named `api_key` — confirm against that module.
_groq_api_key = getattr(secret.api_key, "api_key", None) or os.environ.get("GROQ_API_KEY")
groq_client = Groq(api_key=_groq_api_key)


# Use CUDA when torch reports it as available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

_CHECKPOINT = "mamiksik/CommitPredictorT5"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
model = model.to(device)
model.eval()

# Token caps passed as `max_length` to the tokenizer and to `generate`.
MAX_INPUT_TOKENS, MAX_OUTPUT_TOKENS = 512, 128


def infer(diff_text):
    """Summarise a diff string with the loaded seq2seq model.

    Returns "" without touching the model when ``diff_text`` is falsy or
    whitespace-only; otherwise returns the decoded first generated sequence.
    """
    is_blank = not diff_text or not diff_text.strip()
    if is_blank:
        return ""

    encoded = tokenizer(
        diff_text,
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    # Generation only; gradients are not tracked.
    with torch.no_grad():
        generated = model.generate(**encoded, max_length=MAX_OUTPUT_TOKENS)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def safe_truncate(text, max_chars=4000):
    """Cut ``text`` to ``max_chars`` characters and append a truncation marker.

    Text no longer than ``max_chars`` is returned unchanged. The marker is
    added after the cut, so a truncated result is longer than ``max_chars``.
    """
    if len(text) <= max_chars:
        return text
    return f"{text[:max_chars]}\n...[TRUNCATED]..."

def groq_commit_message(prompt):
    """Send ``prompt`` to the Groq chat endpoint and return the stripped reply.

    The prompt is first cut to 4000 characters via ``safe_truncate``. The
    request uses a fixed system instruction, temperature 0.3 and a 150-token
    completion cap; the content of the first choice is returned.
    """
    bounded_prompt = safe_truncate(prompt, max_chars=4000)  # ~4000 chars ≈ 1500–2000 tokens

    conversation = [
        {
            "role": "system",
            "content": "You are an expert software engineer skilled at writing precise commit messages.",
        },
        {
            "role": "user",
            "content": bounded_prompt,
        },
    ]

    response = groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=conversation,
        temperature=0.3,
        max_tokens=150,
    )
    first_choice = response.choices[0]
    return first_choice.message.content.strip()


def collect_commit_data(repo_url, limit=100):
    """Build one rectified commit message per keyword-matched commit.

    For each matched commit, every modified file's diff is summarised with
    ``infer``; the per-file summaries and the original message are combined
    into a prompt that ``groq_commit_message`` turns into a single message.

    Args:
        repo_url: Repository location handed to ``Repository``.
        limit: Maximum number of commits to traverse (matched or not).
            May be an int or a float; ``None`` disables the cap.

    Returns:
        list[dict]: Records with keys ``hash``, ``developer_msg`` and
        ``rectified_commit_msg``. If the Groq call fails, the last field
        holds an ``[ERROR: ...]`` marker instead of a message.
    """
    commit_data = []
    remaining = limit

    for commit in Repository(repo_url).traverse_commits():
        # `<= 0` rather than `== 0`: an exact-equality test never fires for a
        # negative or fractional limit, which would silently remove the cap.
        if remaining is not None:
            if remaining <= 0:
                break
            remaining -= 1

        if not is_bug_commit_naive(commit):
            continue

        file_tags = []

        for file in commit.modified_files:
            try:
                llm_tag = infer(file.diff or "")
                file_tags.append(f"{file.filename}: {llm_tag}")
            except Exception as e:
                # Best effort: report the file and keep going.
                print(f"Skipped file {file.filename} in commit {commit.hash}: {e}")

            # Free memory after each file
            gc.collect()
            torch.cuda.empty_cache()

        combined_prompt = (
            f"Original commit message:\n{safe_truncate(commit.msg, 500)}\n\n" +
            f"Changes in files in format <file name>:<change message>\n" +
            safe_truncate("\n".join(file_tags), 3000) +
            "\nTask: Write a concise, accurate commit message summarizing all changes. Respond in a single line only commit message."
        )

        try:
            rectified_commit_msg = groq_commit_message(combined_prompt)
        except Exception as e:
            # Keep the row, but mark it so failures are visible in the CSV.
            rectified_commit_msg = f"[ERROR: {e}]"

        commit_data.append({
            "hash": commit.hash,
            "developer_msg": commit.msg,
            "rectified_commit_msg": rectified_commit_msg
        })

    return commit_data


# The limit is passed as a float (1e8); `collect_commit_data` only compares it
# with 0 and decrements it.
commit_data = collect_commit_data(repo_url, limit=1e8)


# One row per keyword-matched commit; `commit_data`, `df` and `csv_path` are
# rebound here and reused by other cells under the same names.
df = pd.DataFrame(commit_data)
csv_path = "rectified_msg.csv"
df.to_csv(csv_path, index=False)

print(f"\nSaved {len(df)} commit-level entries to {csv_path}")
print("Columns:", df.columns.tolist())



Saved 335 commit-level entries to rectified_msg1.csv
Columns: ['hash', 'developer_msg', 'rectified_commit_msg']


In [89]:
import pandas as pd
# Load the three CSVs written by the earlier cells, in the same order.
file1, file2, file3 = (
    pd.read_csv(csv_name)
    for csv_name in ("rectified_msg.csv", 'llm_inference.csv', 'potential_bug_fix_commits.csv')
)

In [90]:
# Print a label followed by the column index of each loaded frame.
for _label, _frame in (
    ('file 01 colums', file1),
    ("file 2 colums", file2),
    ('file3 colums', file3),
):
    print(_label)
    print(_frame.columns)

file 01 colums
Index(['hash', 'developer_msg', 'rectified_commit_msg'], dtype='object')
file 2 colums
Index(['hash', 'filename', 'diff', 'llm_inference', 'rectified_msg'], dtype='object')
file3 colums
Index(['hash', 'message', 'parents', 'is_merge', 'diff', 'files_modified'], dtype='object')


In [103]:
import pandas as pd


# Creating the master csv file

file1 = pd.read_csv("rectified_msg.csv")
file2 = pd.read_csv("llm_inference.csv")
file3 = pd.read_csv("potential_bug_fix_commits.csv")


def _join_present(values):
    """Space-join a group's non-missing values as strings.

    Empty cells come back from the CSV as NaN; dropping them first keeps the
    literal text "nan" out of the joined result.
    """
    return " ".join(values.dropna().astype(str))


# Collapse the per-file rows of llm_inference.csv to one row per commit hash.
agg_file2 = file2.groupby("hash").agg({
    "filename": _join_present,
    "llm_inference": _join_present,
    "diff": _join_present,
    "rectified_msg": _join_present,
}).reset_index()


# Left-join onto the commit-level frame. Both right-hand frames have a `diff`
# column, so the merged frame carries `diff_x` (per-file, joined) and `diff_y`
# (from potential_bug_fix_commits.csv).
master_df = (
    file1
    .merge(agg_file2, on="hash", how="left")
    .merge(file3, on="hash", how="left")
)


master_df.to_csv("master_commits.csv", index=False)

print("Master DataFrame created and saved to master_commits.csv")
print("Columns:", master_df.columns.tolist())
master_df["rectified_commit_msg"].head()


Master DataFrame created and saved to master_commits.csv
Columns: ['hash', 'developer_msg', 'rectified_commit_msg', 'filename', 'llm_inference', 'diff_x', 'rectified_msg', 'message', 'parents', 'is_merge', 'diff_y', 'files_modified']


0    Add setup file, enhance text converter with co...
1     Add command line argument options and setup file
2    Refactor text converter and PDF interpreter to...
3    Bump version and add support for Chinese chara...
4    Enhance debugging capabilities and fix errors ...
Name: rectified_commit_msg, dtype: object

In [92]:
import pandas as pd
# Reload the master file and drop columns that are entirely empty.
df = pd.read_csv("master_commits.csv").dropna(axis=1, how='all')
print(df.columns)
df.head()


Index(['hash', 'developer_msg', 'rectified_commit_msg', 'filename',
       'llm_inference', 'diff_x', 'message', 'parents', 'is_merge', 'diff_y',
       'files_modified'],
      dtype='object')


Unnamed: 0,hash,developer_msg,rectified_commit_msg,filename,llm_inference,diff_x,message,parents,is_merge,diff_y,files_modified
0,f719b6115b9638a8d5c6789ab29caae7e163e145,fix regex,"Add setup file, enhance text converter with co...",converter.py setup.py,add more comments to text converter add missin...,"@@ -364,7 +364,7 @@ class TextConverter(PDFCon...",fix regex,['1c84f1fe75f18caa55c0ff40f2fdaca1825f03c0'],False,"@@ -364,7 +364,7 @@ class TextConverter(PDFCon...","['converter.py', 'setup.py']"
1,1ef06a7fd3ab366ebe6c8b11d5008211d87e3efb,fix args,Add command line argument options and setup file,pdf2zh.py setup.py,add more options to the create_parser function...,"@@ -103,26 +103,26 @@ def create_parser() -> a...",fix args,['f719b6115b9638a8d5c6789ab29caae7e163e145'],False,"@@ -103,26 +103,26 @@ def create_parser() -> a...","['pdf2zh.py', 'setup.py']"
2,270c0e200d1fe1666e6057ec94bedb6e0bc434fb,fix lines and indent,Refactor text converter and PDF interpreter to...,converter.py pdfinterp.py,add more examples to text converter add missin...,"@@ -359,8 +359,11 @@ class TextConverter(PDFCo...",fix lines and indent,['1478a0eecbb8933410cedeff48ac9f844acd3ae4'],False,"@@ -359,8 +359,11 @@ class TextConverter(PDFCo...","['converter.py', 'pdfinterp.py']"
3,291eebd8dcb1206e7e2e5187b13a96d138a6b1b5,fix rt,Bump version and add support for Chinese chara...,__init__.py converter.py,update version add support for 公公公公公公公公公公公公公公公...,"@@ -1,2 +1,2 @@\n-__version__ = ""1.0.1""\n+__ve...",fix rt,['0fa56a1c75d1dfc6fb33d46a0baa5d69ea047eb5'],False,"@@ -1,2 +1,2 @@\n-__version__ = ""1.0.1""\n+__ve...","['__init__.py', 'converter.py']"
4,ac2e14192cdfee974bf8333688270c24453a8ebe,debug,Enhance debugging capabilities and fix errors ...,__init__.py cmapdb.py converter.py encodingdb....,update version add missing classes to cmap add...,"@@ -1,2 +1,2 @@\n-__version__ = ""1.0.4""\n+__ve...",debug,['ad7bcc919f064489de1e217cfbcb34bad5b44e5e'],False,"@@ -1,2 +1,2 @@\n-__version__ = ""1.0.4""\n+__ve...","['__init__.py', 'cmapdb.py', 'converter.py', '..."


## Evaluation Research Questions

In [93]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn.functional as F

# Load CodeBERT
# NOTE: this rebinds the notebook-level names `tokenizer` and `model`, which
# earlier cells bound to the CommitPredictorT5 checkpoint. Functions that read
# those globals (e.g. `infer`, `safe_infer`) will use CodeBERT if they are
# called after this cell has run.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")

In [None]:
def get_embedding(text):
    """Embed ``text`` with the loaded model.

    The text is tokenized with truncation at 256 tokens, and the model's last
    hidden state at sequence position 0 is returned.
    """
    encoded = tokenizer(text, truncation=True, max_length=256, return_tensors='pt')
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Keep only position 0 along the second axis.
    return hidden_states[:, 0, :]


def similarity(emb1, emb2):
    """Return the cosine similarity of two embeddings as a Python number."""
    cosine = F.cosine_similarity(emb1, emb2)
    return cosine.item()

def normalize_text(val):
    """Coerce a cell value to text.

    Strings are returned unchanged, lists are space-joined after converting
    each item with ``str``, and anything else becomes "".
    """
    if isinstance(val, str):
        return val
    if isinstance(val, list):
        return " ".join(map(str, val))
    return ""

### Developer Eval

In [108]:
import pandas as pd

df = pd.read_csv("master_commits.csv")

# One cosine-similarity score per row: the `diff_y` text against the
# developer's original message.
scores = [
    similarity(
        get_embedding(normalize_text(record["diff_y"])),
        get_embedding(normalize_text(record["developer_msg"])),
    )
    for _, record in df.iterrows()
]

# A row counts as a hit when its score exceeds 0.95.
total = len(scores)
hits = sum(score > 0.95 for score in scores)

print(f"Total: {total}, Hits: {hits}, Accuracy: {hits/total*100:.2f}%")


Total: 335, Hits: 130, Accuracy: 38.81%


### LLM Eval

In [107]:
import pandas as pd

df = pd.read_csv("master_commits.csv")


def normalize_text(val):
    """Coerce a cell value to text: str as-is, list space-joined, anything else ""."""
    if isinstance(val, str):
        return val
    if isinstance(val, list):
        return " ".join(map(str, val))
    return ""


# One cosine-similarity score per row: the `diff_y` text against the joined
# per-file model summaries.
scores = [
    similarity(
        get_embedding(normalize_text(record["diff_y"])),
        get_embedding(normalize_text(record["llm_inference"])),
    )
    for _, record in df.iterrows()
]

# A row counts as a hit when its score exceeds 0.95.
total = len(scores)
hits = sum(score > 0.95 for score in scores)

print(f"Total: {total}, Hits: {hits}, Accuracy: {hits/total*100:.2f}%")


Total: 335, Hits: 142, Accuracy: 42.39%


### Rectifier Eval

In [106]:
import pandas as pd

df = pd.read_csv("master_commits.csv")


def normalize_text(val):
    """Coerce a cell value to text: str as-is, list space-joined, anything else ""."""
    if isinstance(val, str):
        return val
    if isinstance(val, list):
        return " ".join(map(str, val))
    return ""


# One cosine-similarity score per row: the `diff_y` text against the
# rectified commit message.
scores = [
    similarity(
        get_embedding(normalize_text(record["diff_y"])),
        get_embedding(normalize_text(record["rectified_commit_msg"])),
    )
    for _, record in df.iterrows()
]

# A row counts as a hit when its score exceeds 0.95.
total = len(scores)
hits = sum(score > 0.95 for score in scores)

print(f"Total: {total}, Hits: {hits}, Accuracy: {hits/total*100:.2f}%")


Total: 335, Hits: 217, Accuracy: 64.78%
