# Section 1: Pre-processing & EDA

In [2]:
!pip3 install --upgrade nltk




In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup # Used for separating text and code
from nltk.stem import SnowballStemmer
import re

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
!wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

# Reading the files
answers_df = pd.read_csv("shortend_answers_final.csv", encoding='ISO-8859-1')
merged_questions_tags_df = pd.read_csv("qns.csv",dtype={"Id": int}, encoding='ISO-8859-1')
merged_questions_tags_df = merged_questions_tags_df;

# Dropping the rows that have empty values
merged_questions_tags_df = merged_questions_tags_df.dropna(subset=['Tag'])
merged_questions_tags_df.reset_index(drop=True, inplace=True)
merged_questions_tags_df.dropna(inplace=True)

# Preprocessing functions

In [6]:
# Function Explanation: To split the body into code and text
def preprocess_data(df, body_column_name, code_column_name, body_words_column_name):
    df[code_column_name] = ''
    df[body_words_column_name] = ''


    for index, row in df.iterrows():
        row[body_column_name] = re.sub(r'[\n\t\r\f]', '', row[body_column_name])

        soup = BeautifulSoup(row[body_column_name], 'html.parser')

        code_snippets = []
        text_without_code = ''

        for pre in soup.find_all('pre'):
            code_snippets.extend(code.get_text() for code in pre.find_all('code'))
            pre.decompose()

        text_without_code = soup.get_text()

        df.at[index, code_column_name] = code_snippets
        df.at[index, body_words_column_name] = text_without_code

    return df

# Function Explanation: To transform it all into lower case and remove stopw words
def preprocessing_task2(text):

  text = text.lower()

  words = word_tokenize(text)

  stop_words = set(stopwords.words('english'))

  filtered_words = [word for word in words if word not in stop_words]

  stemmed_words = [stemmer.stem(word) for word in words]

  stemmed_text = ' '.join(stemmed_words)

  return stemmed_text

# Preparing the dataframe for the training and testing

In [7]:
# Rename the column in df
merged_questions_tags_df = merged_questions_tags_df.rename(columns={'Id': 'question_id','code_snippets':'code_snippets_question','weight_title_body_preprocessed_task2': 'weighted_title_body_question'})
merged_questions_tags_df = merged_questions_tags_df.drop(columns=['CreationDate','Score','Body','Tag','body_words_preprocessed_task1','title_preprocessed_task1','weight_title_body_preprocessed_task1','body_words_preprocessed_task2','title_preprocessed_task2'])

# Merge answer df and question_tag df
df = pd.merge(merged_questions_tags_df,
              answers_df,
              how='inner',
              left_on='question_id',
              right_on='ParentId',)

# Rename the columns after merging with answer df
df.rename(columns={"body_words":"body_words_question","Body": "body_answer","Id":"answer_id","Score":"score_answer"},inplace= True )
df.drop(columns=['Title','ParentId',"CreationDate"],inplace=True)
df.rename(columns= {}, inplace=True)

# Preprocess the answer columns
stemmer = SnowballStemmer("english")
df = preprocess_data(df, "body_answer", "code_snippets_answer", "body_words_answer")
df["body_words_answer_preprocess"] = \
df["body_words_answer"].apply(preprocessing_task2)

In [8]:
df.head()

Unnamed: 0,question_id,code_snippets_question,body_words_question,weighted_title_body_question,answer_id,OwnerUserId,score_answer,body_answer,code_snippets_answer,body_words_answer,body_words_answer_preprocess
0,19280,[],I could only find the function confirm() that ...,code to ask yes/no question in javascript code...,19284,1965.0,2,<p>No.</p>\n\n<p>Instead you could use a in br...,[],No.Instead you could use a in browser modal po...,no.instead you could use a in browser modal po...
1,19280,[],I could only find the function confirm() that ...,code to ask yes/no question in javascript code...,19290,565.0,6,"<p>Javascript offers 3 modal boxes. Prompt, c...",[],"Javascript offers 3 modal boxes. Prompt, conf...","javascript offer 3 modal box . prompt , confir..."
2,19280,[],I could only find the function confirm() that ...,code to ask yes/no question in javascript code...,19293,1377.0,1,"<p>No, but there are JavaScript libraries that...",[],"No, but there are JavaScript libraries that ca...","no , but there are javascript librari that can..."
3,19280,[],I could only find the function confirm() that ...,code to ask yes/no question in javascript code...,19307,1414.0,2,"<p>Like everyone else above says, you're stuck...",[],"Like everyone else above says, you're stuck wi...","like everyon els abov say , you re stuck with ..."
4,19280,[],I could only find the function confirm() that ...,code to ask yes/no question in javascript code...,22914957,1139830.0,0,"<p>I'm a fan of <a href=""https://jqueryui.com/...","[<script> $(function() { $( ""#dialog-confi...",I'm a fan of jQuery UI Dialog for this sort of...,i 'm a fan of jqueri ui dialog for this sort o...


In [9]:
# To understand how to split the scoring
q1 = df['score_answer'].quantile(0.25)
q2 = df['score_answer'].quantile(0.5)
q3 = df['score_answer'].quantile(0.75)
print("Q1:", q1)
print("Q2 (median):", q2)
print("Q3:", q3)

Q1: 0.0
Q2 (median): 1.0
Q3: 2.0


# Feature engineering to include the hand crafted features

In [10]:
# Add presence/ absent of code
def presence_of_code_answer(code_snippets):
    if len(code_snippets) > 0:
        return 1
    else:
        return 0

# Apply the function to create the new column "presence_of_code_answer"
df['presence_of_code_answer'] = df['code_snippets_answer'].apply(lambda x: presence_of_code_answer(x))

# length of answer: in terms of words not letters
df['answer_length'] = df['body_words_answer'].apply(lambda x: len(x.split()))

# Convert the scores into bins
def map_to_category(score):
    if score <= q1:
        return 0  # Low
    elif score <= q2:
        return 1  # Medium
    else:
        return 2  # High

df['score_category'] = df['score_answer'].apply(lambda x: map_to_category(x))


In [11]:
df.shape

(126046, 14)

In [12]:
# !pip3 install transformers

In [13]:
# !pip3 install torch

In [14]:
# !pip3 install pytorch

In [15]:
# !pip3 install torch torchvision torchaudio

In [16]:
############# Preparing the similarity score for  Approach 1 ###########################################

# Convert the answers into TF-df
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Vectorize answers
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(df['body_words_answer_preprocess'])

# Vectorize the queries
queries_tfidf_matrix = vectorizer.transform(df['weighted_title_body_question'])

# Iterate over each row in the DataFrame & Calculate the similarity scores between each answer and each query
for i in range(len(df)):
    answer_tfidf_vector = tfidf_matrix[i]
    query_tfidf_vector = queries_tfidf_matrix[i]

    similarity_score = cosine_similarity(answer_tfidf_vector, query_tfidf_vector)[0][0]
    df.at[i, "cosine_similarity"] = similarity_score

# Display the DataFrame
df.head()

############# Preparing the embeddings for  Approach 2 ###########################################
# Bert embeding for the answer and query
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()

df['bert_embeddings_answers'] = df['body_words_answer_preprocess'].apply(get_bert_embeddings)
df['bert_embeddings_questions'] = df['weighted_title_body_question'].apply(get_bert_embeddings)

# Code embedding for the answers and the query
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from ast import literal_eval
import torch

from unixcoder import UniXcoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base-nine")
model.to(device)

def get_embeddings(text):
  tokens_ids = model.tokenize([text],max_length=512,mode="<encoder-only>")
  source_ids = torch.tensor(tokens_ids).to(device)
  tokens_embeddings,nl_embedding = model(source_ids)
  norm_nl_embedding = torch.nn.functional.normalize(nl_embedding, p=2, dim=1)
  norm_nl_embedding = norm_nl_embedding.detach().cpu().numpy()[0]
  return norm_nl_embedding

##########################################################################
code_embeddings_q = []

for code in df['code_snippets_question']:
  vector_database = []
  if code != []:
    for i in range(len(code)): #if there are multiple chunks of codes in one question
      vector_database.append(get_embeddings(code[i]))
  code_embeddings_q.append(vector_database)

df['code_snippets_question'] = code_embeddings_q

max_length = max(len(embedding) for embedding in df['code_snippets_question'])
max_inner_length = max(len(inner_array) for embedding in df['code_snippets_question'] for inner_array in embedding)
flattened_max_length = max_length * max_inner_length


##########################################################################
padded_embeddings_q = []

for i, row in df.iterrows():
    embedding = row['code_snippets_question']
    if embedding != []:  # Check if the embedding list is not empty
        padded_inner_arrays = [np.pad(inner_array, (0, max_inner_length - len(inner_array)), 'constant', constant_values=0) for inner_array in embedding]
        flattened_embedding = np.concatenate(padded_inner_arrays).ravel()
    else:
        flattened_embedding = np.zeros(flattened_max_length)

    final_padded_embedding = np.pad(flattened_embedding, (0, flattened_max_length - len(flattened_embedding)), 'constant', constant_values=0)
    padded_embeddings_q.append(final_padded_embedding)

code_embeddings_q = np.array(padded_embeddings_q)
code_embeddings_q = code_embeddings_q.reshape(len(code_embeddings_q), -1)

df['code_embeddings_padded_question'] = code_embeddings_q.tolist()

df.head()

# Code embedding for the answers and the query

##########################################################################

code_embeddings_a = []

for code in df['code_snippets_answer']:
  vector_database = []
  if code != []:
    for i in range(len(code)): #if there are multiple chunks of codes in one question
      vector_database.append(get_embeddings(code[i]))
  code_embeddings_a.append(vector_database)

df['code_snippets_answer'] = code_embeddings_a

max_length = max(len(embedding) for embedding in df['code_snippets_answer'])
max_inner_length = max(len(inner_array) for embedding in df['code_snippets_answer'] for inner_array in embedding)
flattened_max_length = max_length * max_inner_length


##########################################################################

padded_embeddings_a = []

for i, row in df.iterrows():
    embedding = row['code_snippets_answer']
    if embedding != []:  # Check if the embedding list is not empty
        padded_inner_arrays = [np.pad(inner_array, (0, max_inner_length - len(inner_array)), 'constant', constant_values=0) for inner_array in embedding]
        flattened_embedding = np.concatenate(padded_inner_arrays).ravel()
    else:
        flattened_embedding = np.zeros(flattened_max_length)

    final_padded_embedding = np.pad(flattened_embedding, (0, flattened_max_length - len(flattened_embedding)), 'constant', constant_values=0)
    padded_embeddings_a.append(final_padded_embedding)

code_embeddings_a = np.array(padded_embeddings_a)
code_embeddings_a = code_embeddings_a.reshape(len(code_embeddings_a), -1)

df['code_embeddings_padded_answers'] = code_embeddings_a.tolist()


df.head()



############# Split the data into training and test dataframe #########################

# To Split the data into training and test dataframe
split_index = int(len(df) * 0.7)  # 70% for training, 30% for testing

# Split the DataFrame into training and testing sets
train_df = df.iloc[:split_index]
test_df = df.iloc[split_index:]

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
test_df.head()

In [None]:
print(test_df.shape)
print(train_df.shape)

# Final solution: Approach 1

## Training the model

In [None]:
################## Parsing into the Approach 1 model #############################
from keras.layers import Input, Concatenate, Dense, Reshape, Flatten
from keras.models import Model
import numpy as np

# Define input data: similarity_scores_query_question = df['similarity_scores_query_question'].values --> to take from DF
similarity_scores_query_answer = np.array(train_df["similarity_scores_query_answer"].values)
code_presence = np.array(train_df["presence_of_code_answer"].values)
labels = np.array(train_df["score_category"].values)  # 0: Low, 1: Med, 2: High
num_classes = 3

# Define input layers
input1 = Input(shape=(1,))
input2 = Input(shape=(1,))

# Concatenate inputs if needed
concatenated_inputs = Concatenate()([input1, input2])

# Additional layers
dense1 = Dense(64, activation='relu')(concatenated_inputs)
dense2 = Dense(32, activation='relu')(dense1)
output = Dense(num_classes, activation='softmax')(dense2)

# Define the model
model = Model(inputs=[input1, input2], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(x=[similarity_scores_query_answer, code_presence],
          y=labels,
          epochs=10,
          batch_size=32)

# Plot the graph
import matplotlib.pyplot as plt

# Plot the accuracy
plt.plot(history.history['accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train'], loc='upper left')
plt.show()


## Testing the model

In [None]:
test_similarity_scores_query_answer = np.array(test_df["similarity_scores_query_answer"].values)
test_code_presence = np.array(test_df["presence_of_code_answer"].values)
test_labels = np.array(test_df["score_category"].values)  # 0: Low, 1: Med, 2: High

loss, accuracy = model.evaluate(x=[test_similarity_scores_query_answer, test_code_presence], y=test_labels)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

######### Plotting the graph #############################################
# Plot the loss and accuracy
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.title('Training Loss and Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.legend()
plt.show()



## Receive the query from xf's part and return the prediction ---- havent not done (Need to run and edit)

In [None]:
################################## ToDo : edit this part and link with xf's portion ###################
# Get the input data for prediction
query_id = random.choice(df['question_id'].tolist())
query = df[df["question_id"] == query_id]["weighted_title_body_question"].values[0]

# Convert query into TF-IDF vector
query_tfidf_vector = vectorizer.transform([query])

# Calculate similarity scores between query and answers
similarity_scores_query_answer = cosine_similarity(tfidf_matrix, query_tfidf_vector)

# Reshape the data to match model input shape
similarity_scores_query_answer = similarity_scores_query_answer.reshape(-1, 1)

# Make prediction using the model
predictions = model.predict([similarity_scores_query_answer, code_presence])

# Get the predicted class (index with highest probability)
predicted_class_index = np.argmax(predictions)

# Map predicted class index to actual score category (e.g., "Low", "Medium", "High")
score_category_map = {0: "Low", 1: "Medium", 2: "High"}
predicted_score_category = score_category_map[predicted_class_index]

# Print the prediction
print("Predicted Score Category:", predicted_score_category)


# Final Solution Approach 2: Code embeddings

## Training the model -- havent edit

In [None]:
# Using embeddings and allowing the NN model to learn -----------------------------------------> To edit this
from keras.layers import Input, Concatenate, Dense
from keras.models import Model
from tensorflow.keras.layers import Reshape
import numpy as np

query_bert_embeddings = np.array(train_df["bert_embeddings_questions"].values.tolist())
query_code_embeddings = np.array(train_df["code_embeddings_padded_question"].values.tolist())
answer_bert_embeddings = np.array(train_df["bert_embeddings_answers"].values.tolist())
answer_code_embeddings = np.array(train_df["code_embeddings_padded_answers"].values.tolist())
labels = np.array(train_df["score_category"].values)

# Define input shapes
query_bert_input_shape = query_bert_embeddings.shape[1:]
query_code_input_shape = query_code_embeddings.shape[1:]
answer_bert_input_shape = answer_bert_embeddings.shape[1:]
answer_code_input_shape = answer_code_embeddings.shape[1:]

# Define input layers
query_bert_input = Input(shape=query_bert_input_shape)
query_code_input = Input(shape=query_code_input_shape)
answer_bert_input = Input(shape=answer_bert_input_shape)
answer_code_input = Input(shape=answer_code_input_shape)

# Reshape inputs if necessary
query_bert_input_reshaped = Reshape((-1,))(query_bert_input)
answer_bert_input_reshaped = Reshape((-1,))(answer_bert_input)

# Concatenate query and answer embeddings
concatenated_inputs = Concatenate()([query_bert_input_reshaped, query_code_input, answer_bert_input_reshaped, answer_code_input])

# Additional layers
dense1 = Dense(64, activation='relu')(concatenated_inputs)
dense2 = Dense(32, activation='relu')(dense1)
output = Dense(3, activation='softmax')(dense2)  # Assuming 3 output classes: "low", "medium", "high"

# Define the model
model = Model(inputs=[query_bert_input, query_code_input, answer_bert_input, answer_code_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(x=[query_bert_embeddings, query_code_embeddings, answer_bert_embeddings, answer_code_embeddings],
                    y=labels,
                    epochs=10,
                    batch_size=32)

# Plot the accuracy
plt.plot(history.history['accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train'], loc='upper left')
plt.show()


## Testing the model -- havent edit

In [None]:
test_df.head()

In [None]:
# Assuming you already have your test data prepared in similar format
test_query_bert_embeddings = np.array(test_df["bert_embeddings_questions"].values.tolist())
test_query_code_embeddings = np.array(test_df["code_embeddings_padded_question"].values.tolist())
test_answer_bert_embeddings = np.array(test_df["bert_embeddings_answers"].values.tolist())
test_answer_code_embeddings = np.array(test_df["code_embeddings_padded_answers"].values.tolist())
test_labels = np.array(test_df["score_category"].values)

# Evaluate the model on test data
evaluation_metrics = model.evaluate(x=[test_query_bert_embeddings, test_query_code_embeddings, test_answer_bert_embeddings, test_answer_code_embeddings],
                                    y=test_labels)

# Print evaluation metrics
print("Test Loss:", evaluation_metrics[0])
print("Test Accuracy:", evaluation_metrics[1])

# Extracting test loss and accuracy
test_loss = evaluation_metrics[0]
test_accuracy = evaluation_metrics[1]

# Print evaluation metrics
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Plotting the evaluation metrics
plt.plot(test_loss, label='Test Loss', color='red', marker='o')
plt.plot(test_accuracy, label='Test Accuracy', color='blue', marker='o')
plt.title('Test Loss and Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.legend()
plt.show()
