# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import sys

from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer

  _torch_pytree._register_pytree_node(


# Importing loading_data class from data_loader.py and evaluation class from model_evaluation.py  in the src folder

In [2]:
# Retrieving the current working directory 
current_directory = os.getcwd()

# getting the parent directory of the current working directory
parent_directory = os.path.dirname(current_directory)

# Constructing a path to the "Data" directory located inside a directory named "src"
src_data_directory = os.path.join(parent_directory, "src", "Data")

src_model_directory = os.path.join(parent_directory, "src", "model")

# allowing Python to search for modules in this directory.
sys.path.append(src_data_directory)
sys.path.append(src_model_directory)

#Importing the data_preprocessor class from the data_cleaner module located in the src -> Data.
from data_loader import loading_data

#Importing the evaluation class from the model_evaluation module located in the src -> model.
from model_evaluation import evaluation

# Creating an object of loading_data class

In [3]:
load_object=loading_data()

#checking object is created and getting the address of it
load_object

<data_loader.loading_data at 0x15417ec90>

# Creating an object of evaluation class

In [4]:
eval_object=evaluation()

#checking object is created and getting the address of it
eval_object

<model_evaluation.evaluation at 0x154221150>

# Loading all Embeddings  from Models folder using loading_data class object

In [5]:
#getting the path where we have saved the embeddings of model_1
model_1_path =load_object.get_file_path("embeddings_all_mpnet_base_v2.npy","Models")

#Loading the embeddings of model_1 in notebook
embeddings_model_1 = np.load(model_1_path)

#getting the path where we have saved the embeddings of model_2
model_2_path =load_object.get_file_path("embeddings_multilingual_e5_large_instruct.npy","Models")

#Loading the embeddings of model_2 in notebook
embeddings_model_2 = np.load(model_2_path)

#getting the path where we have saved the embeddings of model_3
model_3_path =load_object.get_file_path("embeddings_intfloat_e5_base_v2.npy","Models")

#Loading the embeddings of model_3 in notebook
embeddings_model_3 = np.load(model_3_path)

#getting the path where we have saved the embeddings of model_4
model_4_path =load_object.get_file_path("embeddings_mixedbread_ai_mxbai_embed_2d_large_v1.npy","Models")

#Loading the embeddings of model_4 in notebook
embeddings_model_4 = np.load(model_4_path)

# Loading all the models

In [6]:
model_1 = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

model_2 = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

model_3 = SentenceTransformer('intfloat/e5-base-v2')

model_4=SentenceTransformer("mixedbread-ai/mxbai-embed-2d-large-v1")

  _torch_pytree._register_pytree_node(


# Loading the processed_data and converting into the list

In [7]:
# getting the path of processed data
processed_file_path=load_object.get_file_path("processed_data.csv","Data")

# reading the processed_data
df=pd.read_csv(processed_file_path)

# converting the tokenized_docstring into list
list_data = df['tokenized_docstring'].tolist()

# Loading the testing data

In [8]:
query_path=load_object.get_file_path("query.csv","Data","testing_data")
queries=pd.read_csv(query_path)


In [9]:
len(queries)

57

# Key for using Claude API

In [10]:
# key will be provided by the you here for using the claude api
key='Claude_api_key'

# function for checking the response form the Claude 

In [11]:
def check_response(Questions, top_match_code):
    
    # converting the top_match_code in the dataframe
    data=pd.DataFrame(top_match_code)
    
    # Questions is the response from the human for the claude
    human =Questions
    
    # Initialize the ChatAnthropic object
    chat = ChatAnthropic(anthropic_api_key=key ,temperature=0, model_name="claude-3-opus-20240229")

    # Defining system message with task description and data
    system = (
    """ Your task is to provide a response of only 'YES' if there is a 75 percentage matching of human input in the data,
        or only 'No' if there isn't,
        when comparing the data to human input.
        
    data: {data}
    human: {human}
    """
    )
    
    # Creating the ChatPromptTemplate
    prompt = ChatPromptTemplate.from_messages([("system", system), ("human",human)])

    # Creating the chain combining prompt and chat
    chain = prompt | chat
    
    # Invoking the chain with data and human input
    response=chain.invoke(
    {
         "data": data,
        "human": human,
    }
    )
    return response
    

# Evaluting the model_1

In [22]:
# Taking count of how many yes is generated by claude api for embeddings of model_1
yes_model_1=0

for i in range(len(queries)):
    #getting the query 
    query=queries.loc[i].Questions
    
    # getting the top 10 code that matches with embeddings of query and embeddings of docstring of model
    top_10_code=eval_object.get_top_10_code(query,embeddings_model_1,model_1,list_data,df)
    print(i+1)
    
    # printing the reponse from the claude api using the key
    print(check_response(query,top_10_code))
    
    # increasing the count only if the response from the claude is "YES"
    if 'YES' in check_response(query,top_10_code).content:
        yes_model_1+=1

1
content='YES'
2
content='YES'
3
content='YES'
4
content='YES'
5
content='YES'
6
content='No'
7
content='No'
8
content='YES'
9
content='NO'
10
content='YES'
11
content='YES'
12
content='YES'
13
content='NO'
14
content='NO'
15
content='YES'
16
content='YES'
17
content='YES'
18
content='No'
19
content='YES'
20
content='YES'
21
content='YES'
22
content='YES'
23
content='YES'
24
content='NO'
25
content='YES'
26
content='YES'
27
content='YES'
28
content='YES'
29
content='YES'
30
content='YES'
31
content='YES'
32
content='YES'
33
content='YES'
34
content='YES'
35
content='YES'
36
content='YES'
37
content='YES'
38
content='YES'
39
content='YES'
40
content='YES'
41
content='YES'
42
content='YES'
43
content='YES'
44
content='YES'
45
content='YES'
46
content='YES'
47
content='YES'
48
content='YES'
49
content='No'
50
content='No'
51
content='YES'
52
content='YES'
53
content='YES'
54
content='YES'
55
content='NO'
56
content='YES'
57
content='YES'


# Evaluating the model_2

In [23]:
# Taking count of how many yes is generated by claude api for embeddings of model_2
yes_model_2=0

for i in range(len(queries)):
    #getting the query 
    query=queries.loc[i].Questions
    
    # getting the top 10 code that matches with embeddings of query and embeddings of docstring of model
    top_10_code=eval_object.get_top_10_code(query,embeddings_model_2,model_2,list_data,df)
    
    print(i+1)
    
    # printing the reponse from the claude api using the key
    print(check_response(query,top_10_code))
    
    # increasing the count only if the response from the claude is "YES"
    if 'YES' in check_response(query,top_10_code).content:
        yes_model_2+=1

1
content='No'
2
content='YES'
3
content='YES'
4
content='No'
5
content='YES'
6
content='YES'
7
content='No'
8
content='YES'
9
content='NO'
10
content='NO'
11
content='YES'
12
content='YES'
13
content='NO'
14
content='No'
15
content='YES'
16
content='YES'
17
content='YES'
18
content='YES'
19
content='YES'
20
content='No'
21
content='YES'
22
content='YES'
23
content='No'
24
content='YES'
25
content='YES'
26
content='YES'
27
content='No'
28
content='YES'
29
content='YES'
30
content='YES'
31
content='YES'
32
content='YES'
33
content='YES'
34
content='YES'
35
content='YES'
36
content='YES'
37
content='YES'
38
content='YES'
39
content='YES'
40
content='YES'
41
content='YES'
42
content='YES'
43
content='YES'
44
content='YES'
45
content='YES'
46
content='YES'
47
content='No'
48
content='YES'
49
content='No'
50
content='No'
51
content='YES'
52
content='YES'
53
content='YES'
54
content='YES'
55
content='NO'
56
content='YES'
57
content='YES'


# Evaluating the model_3

In [24]:
# Taking count of how many yes is generated by claude api for embeddings of model_3
yes_model_3=0

for i in range(len(queries)):
    #getting the query
    query=queries.loc[i].Questions
    
    # getting the top 10 code that matches with embeddings of query and embeddings of docstring of model
    top_10_code=eval_object.get_top_10_code(query,embeddings_model_3,model_3,list_data,df)
    print(i+1)
    
    # printing the reponse from the claude api using the key
    print(check_response(query,top_10_code))
    
    # increasing the count only if the response from the claude is "YES"
    if 'YES' in check_response(query,top_10_code).content:
        yes_model_3+=1

1
content='YES'
2
content='YES'
3
content='YES'
4
content='No'
5
content='YES'
6
content='No'
7
content='No'
8
content='YES'
9
content='NO'
10
content='YES'
11
content='YES'
12
content='YES'
13
content='NO'
14
content='NO'
15
content='YES'
16
content='YES'
17
content='YES'
18
content='NO'
19
content='YES'
20
content='YES'
21
content='YES'
22
content='YES'
23
content='YES'
24
content='YES'
25
content='YES'
26
content='YES'
27
content='YES'
28
content='YES'
29
content='NO'
30
content='YES'
31
content='YES'
32
content='YES'
33
content='YES'
34
content='YES'
35
content='YES'
36
content='YES'
37
content='YES'
38
content='YES'
39
content='YES'
40
content='YES'
41
content='YES'
42
content='YES'
43
content='YES'
44
content='YES'
45
content='YES'
46
content='YES'
47
content='YES'
48
content="Here is a Python function to detect if a file is in CSV format:\n\ndef is_csv_file(file_path):\n    try:\n        with open(file_path, 'r') as file:\n            sample = file.read(1024)  # Read a sample of

# Evaluating the model_4


In [25]:
# Taking count of how many yes is generated by claude api for embeddings of model_4
yes_model_4=0
for i in range(len(queries)):
    #getting the query
    query=queries.loc[i].Questions
    
    # getting the top 10 code that matches with embeddings of query and embeddings of docstring of model
    top_10_code=eval_object.get_top_10_code(query,embeddings_model_4,model_4,list_data,df)
    print(i+1)
    
    # printing the reponse from the claude api using the key
    print(check_response(query,top_10_code))
    
    # increasing the count only if the response from the claude is "YES"
    if 'YES' in check_response(query,top_10_code).content:
        yes_model_4+=1

1
content='YES'
2
content='YES'
3
content='YES'
4
content='YES'
5
content='YES'
6
content='YES'
7
content='YES'
8
content='YES'
9
content='NO'
10
content='YES'
11
content='No'
12
content='YES'
13
content='YES'
14
content='No'
15
content='YES'
16
content='YES'
17
content='YES'
18
content='YES'
19
content='YES'
20
content='YES'
21
content='YES'
22
content='YES'
23
content='No'
24
content='YES'
25
content='YES'
26
content='YES'
27
content='YES'
28
content='YES'
29
content='YES'
30
content='YES'
31
content='YES'
32
content='NO'
33
content='YES'
34
content='YES'
35
content='YES'
36
content='YES'
37
content='YES'
38
content='YES'
39
content='NO'
40
content='YES'
41
content='YES'
42
content='YES'
43
content='YES'
44
content='YES'
45
content='YES'
46
content='YES'
47
content='YES'
48
content='YES'
49
content='YES'
50
content='No'
51
content='YES'
52
content='YES'
53
content='YES'
54
content='YES'
55
content='NO'
56
content='YES'
57
content='YES'


In [26]:
print(yes_model_1)
print(yes_model_2)
print(yes_model_3)
print(yes_model_4)

45
43
45
50


# Accuracy for model_1

In [27]:
print(f"Accuracy of model_1 : {(yes_model_1/len(queries))*100}")

Accuracy of model_1 : 78.94736842105263


# Accuracy for model_2

In [28]:
print(f"Accuracy of model_2 : {(yes_model_2/len(queries))*100}")

Accuracy of model_2 : 75.43859649122807


# Accuracy for model_3

In [29]:
print(f"Accuracy of model_3 : {(yes_model_3/len(queries))*100}")

Accuracy of model_3 : 78.94736842105263


# Accuracy for model_4

In [30]:
print(f"Accuracy of model_4 : {(yes_model_4/len(queries))*100}")

Accuracy of model_4 : 87.71929824561403
