In [1]:
import os
%pwd

'c:\\testcasegen\\research'

In [5]:
# Move up to the project root only while still inside research/, so that
# re-running this cell does not climb one directory further on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")
os.getcwd()

c:\testcasegen


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [70]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation / embedding stage.

    Instances are built by ConfigManager.get_data_transformation_config from
    the ``data_transformation`` section of the loaded configuration.
    """
    # Location of the CSV that DataTransformation.create_embeddings reads.
    data_path: Path
    # Embedding model identifier taken from the configuration.
    embedding_model: str
    # URL handed to download_file when the config is built.
    model_url: str

In [71]:
from llmRAGtestcasegen.constants import *
from llmRAGtestcasegen.utils.common import read_yaml, create_directories, download_file

In [55]:
from dotenv import load_dotenv
import pinecone
from llmRAGtestcasegen.logging import logger
import time
import torch
from tqdm import tqdm
import pandas as pd
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)
import wget

In [None]:
class ConfigManager:
    """Load the project configuration and hand out typed config objects.

    On construction both YAML files are read and the artifacts root
    directory named in the main config is created.
    """

    def __init__(
            self,
            config__filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config__filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config__filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def download_file(self, url):
        """Download ``url`` with wget unless ``unixcoder.py`` is already in the cwd.

        Download errors are reported on stdout and not re-raised.

        Args:
            url: Address of the file to fetch.
        """
        # Join directory and file name as separate components; concatenating
        # them produced a path that never exists, so the guard never matched.
        target_path = os.path.join(os.getcwd(), "unixcoder.py")
        if os.path.exists(target_path):
            print("File already exists")
            return

        try:
            filename = wget.download(url)
            print(f"\nDownloaded: {filename}")
        except Exception as e:
            print(f"An error occurred: {e}")

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the DataTransformationConfig and fetch the model source file.

        Returns:
            A DataTransformationConfig populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        config = self.config.data_transformation
        dataconfig = DataTransformationConfig(
            data_path=config.data_path,
            embedding_model=config.embedding_model,
            model_url=config.model_url
        )
        # NOTE(review): this calls the module-level download_file imported from
        # llmRAGtestcasegen.utils.common, not self.download_file defined above.
        # Confirm which one is intended.
        download_file(config.model_url)
        return dataconfig


In [74]:
# API credentials come from the process environment; each is None when unset.
open_ai_key, pinecone_api_key = (
    os.environ.get(variable_name)
    for variable_name in ("OPENAI_API_KEY", "PINECONE_API_KEY")
)

In [75]:
from unixcoder import UniXcoder


In [76]:
class DataTransformation:
    """Chunk Java source, embed every chunk with UniXcoder and store it in Pinecone."""

    def __init__(self, config: "DataTransformationConfig"):
        """Set up the Java splitter, pick the torch device and load the model.

        Args:
            config: Settings for this stage; ``config.data_path`` is read by
                create_embeddings.
        """
        self.config = config
        self.java_splitter = RecursiveCharacterTextSplitter.from_language(
            language=Language.JAVA, chunk_size=500, chunk_overlap=150
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): the checkpoint name is hard-coded and
        # config.embedding_model is not consulted here. Confirm that is intended.
        self.embedding_model = UniXcoder("microsoft/unixcoder-base")
        self.embedding_model.to(self.device)

    def get_embeddings_codebert(self, code_corpus):
        """Embed each snippet and return the L2-normalised vectors.

        Args:
            code_corpus: Iterable of source-code strings.

        Returns:
            A list with one entry per snippet, each a plain list of floats.
        """
        embeddings = []
        # Inference only: disabling autograd avoids building a graph per chunk.
        with torch.no_grad():
            for code in code_corpus:
                tokens_ids = self.embedding_model.tokenize([code], mode="<encoder-only>")
                source_ids = torch.tensor(tokens_ids).to(self.device)
                # The second element of the model output is used as the embedding;
                # presumably shaped (1, dim) since row 0 is taken below. TODO confirm.
                _, nl_embedding = self.embedding_model(source_ids)
                norm_nl_embedding = torch.nn.functional.normalize(nl_embedding, p=2, dim=1)
                embeddings.append(norm_nl_embedding.detach().cpu().numpy()[0].tolist())
        return embeddings

    def create_embeddings(self):
        """Create the Pinecone index if needed, then embed and upsert all chunks.

        Reads the CSV at ``config.data_path``, keeps rows from the
        'commons-jxpath' and 'commons-csv' repositories, splits each
        'Methods' entry into chunks, embeds them and upserts them in small
        batches. A failing batch is logged and skipped; the number of failed
        batches is reported at the end.
        """
        # pinecone_api_key is a notebook-level global read from the environment.
        pinecone.init(api_key=pinecone_api_key, environment='gcp-starter')
        index_name = 'd4jcodebertembeds'

        if index_name not in pinecone.list_indexes():
            pinecone.create_index(index_name, dimension=768, metric='cosine')
            # Wait for index to initialize
            while not pinecone.describe_index(index_name).status['ready']:
                time.sleep(1)
        index = pinecone.Index(index_name)
        logger.info(f"Index {index_name} created successfully with info: {index.describe_index_stats()}")

        # Load and preprocess data
        df = pd.read_csv(Path(self.config.data_path), usecols=['Class', 'Methods', 'Repository', 'TestCase'])
        df = df[df['Repository'].isin(['commons-jxpath', 'commons-csv'])].reset_index(drop=True)

        # Split each code file into chunks; every chunk keeps its class and repository.
        new_rows = []
        for class_name, methods, repository in zip(df['Class'], df['Methods'], df['Repository']):
            java_chunks = self.java_splitter.create_documents([methods])
            new_rows.extend({'Class': class_name, 'Methods': chunk.page_content, 'Repository': repository}
                            for chunk in java_chunks)

        new_df = pd.DataFrame(new_rows)
        logger.info(f"Dataframe shape: {new_df.shape}")

        # Process and upload in batches
        batch_size = 4
        failed_batches = 0
        for i in tqdm(range(0, len(new_df), batch_size)):
            batch = new_df.iloc[i:i + batch_size]
            # The row position in new_df doubles as the vector id.
            ids = [str(j) for j in batch.index]
            embeddings = self.get_embeddings_codebert(batch['Methods'].tolist())
            metadata = batch[['Class', 'Methods', 'Repository']].to_dict('records')
            try:
                index.upsert(vectors=list(zip(ids, embeddings, metadata)))
            except Exception as e:
                # Best effort: keep uploading the remaining batches.
                failed_batches += 1
                logger.error(f"Error upserting batch {i}: {e}")
                continue

        if failed_batches:
            logger.warning(f"{failed_batches} batch(es) failed to upsert and were skipped")
        logger.info(f"Uploading completed: {index.describe_index_stats()}")


In [77]:

# Run the stage end to end: load config, build the transformer, embed and upload.
# Errors propagate as-is; wrapping this in `except Exception as e: raise e`
# added nothing but an extra traceback frame.
config = ConfigManager()
data_trans_config = config.get_data_transformation_config()
data_transformer = DataTransformation(data_trans_config)
data_transformer.create_embeddings()


[2024-10-31 11:40:46,944: INFO: common: Reading yaml file from config\config.yaml loaded suucessfully]
[2024-10-31 11:40:46,945: INFO: common: Reading yaml file from params.yaml loaded suucessfully]
[2024-10-31 11:40:46,946: INFO: common: Directory created at artifacts]

Downloaded: unixcoder.py
[2024-10-31 11:40:47,304: INFO: common: File downloaded from https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py and saved]
[2024-10-31 11:40:51,449: INFO: 3570973218: Index d4jcodebertembeds created successfully with info: {'dimension': 768,
 'index_fullness': 0.00258,
 'namespaces': {'': {'vector_count': 258}},
 'total_vector_count': 258}]
[2024-10-31 11:40:51,643: INFO: 3570973218: Dataframe shape: (258, 3)]


100%|██████████| 65/65 [00:41<00:00,  1.57it/s]

[2024-10-31 11:41:33,125: INFO: 3570973218: Uploading completed: {'dimension': 768,
 'index_fullness': 0.00258,
 'namespaces': {'': {'vector_count': 258}},
 'total_vector_count': 258}]



