In [None]:
import os
import textstat
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, LabelEncoder

from xgboost import XGBClassifier  #Does not work with sklearn version>=1.6

In [None]:
# Read the answer data set.
train_set = pd.read_csv("../../data/answerList_data.csv")
file_path = "../../data/"
# Collect the text of every file in the data directory whose name starts with
# "HIT", keyed by the part of the file name before the first ".".
# (The original note describes these as Java files; that is not checked here.)
files = {}
for filename in os.listdir(file_path):
    if filename.startswith("HIT"):
        # Read inside a context manager so the handle is closed immediately
        # instead of being left open until garbage collection.
        with open(os.path.join(file_path, filename), "r") as source_file:
            files[filename.split(".")[0]] = source_file.read()

In [None]:
# Display the first 50 rows of the freshly loaded data for a visual sanity check.
train_set.head(50)

In [None]:
# Shuffle the rows reproducibly (fixed seed) and renumber the index from 0.
train_set = train_set.sample(frac=1, random_state=20).reset_index(drop=True)

# Replace the textual "Answer.option" values with integer codes.
replace_dict = {"NO": 0, "YES": 1, "IDK": 2}
# Series.replace used to downcast the resulting object column to a numeric
# dtype implicitly; that downcast is deprecated in recent pandas. Calling
# infer_objects() makes the conversion explicit, so the column is still
# selected as numeric by select_dtypes further down. Values not listed in
# replace_dict are left untouched, exactly as before.
train_set["Answer.option"] = (
    train_set["Answer.option"].replace(replace_dict).infer_objects()
)

In [None]:
# Derive two features from the free-text explanation before discarding it:
#   * Answer.explanation_length - number of characters in the explanation
#   * Flesch_reading_ease       - textstat's Flesch reading ease score
# Rows without an explanation get None for both features.
train_set["Answer.explanation_length"] = train_set["Answer.explanation"].apply(
    lambda text: None if pd.isnull(text) else len(str(text)))
train_set["Flesch_reading_ease"] = train_set["Answer.explanation"].apply(
    lambda text: None if pd.isnull(text) else textstat.flesch_reading_ease(text))

# The raw text is no longer needed once the features exist.
train_set = train_set.drop(columns=["Answer.explanation"])

In [None]:
# Drop the identifier columns and the FP/FN/TP/TN columns; they are not used
# as model features. (Scaling of the numerical columns happens in a later cell.)
train_set = train_set.drop(
    columns=["Answer.ID", "Question.ID", "FP", "FN", "TP", "TN", "Worker.ID"])


In [None]:

# Standardise the numerical (float64 / int64) columns with StandardScaler and
# write the scaled values back over the originals. The ground truth, the answer
# code and the two explanation-derived features are excluded and keep their raw
# values. Note that the scaler is fitted on every row of train_set.
numerical_cols = train_set.select_dtypes(include=['float64', 'int64']).columns.drop(
    ["Answer.explanation_length", "Flesch_reading_ease", "GroundTruth", "Answer.option"])
scaler = StandardScaler()
train_set[numerical_cols] = scaler.fit_transform(train_set[numerical_cols])


In [None]:

# Label-encode every object-dtype column in place, except the two multi-select
# columns listed in multi_cols (those are one-hot encoded separately).
multi_cols = ["Worker.whereLearnedToCode", "Worker.programmingLanguage"]
label_encoder = LabelEncoder()
for column in train_set.columns:
    if column not in multi_cols and train_set[column].dtype == "object":
        if column == "FailingMethod":
            # Keep the raw distinct values before the column is overwritten.
            failing_methods = train_set[column].unique()
        # Values are cast to str first, so missing values are encoded as "nan".
        train_set[column] = label_encoder.fit_transform(train_set[column].astype(str))
        if column == "FailingMethod":
            # Map each integer code back to its original (string) value.
            # LabelEncoder numbers the *sorted* classes, not the order in which
            # values first appear, so the mapping must come from classes_;
            # enumerating unique() would pair codes with the wrong names.
            failing_methods_dict = {
                code: str(label) for code, label in enumerate(label_encoder.classes_)
            }



In [None]:
# One-hot encode the two multi-select columns. Each cell holds several labels
# separated by ";"; every distinct label becomes its own 0/1 column and the
# original column is removed.
# NOTE(review): the new columns are named after the bare labels, so a label
# occurring in both source columns would yield duplicate column names - confirm
# that the label sets are disjoint.
multi_cols = ["Worker.whereLearnedToCode", "Worker.programmingLanguage"]
for column in multi_cols:
    # Lower-case, split on ";" and trim each label in a single pass; missing
    # cells become an empty list. Building the cleaned lists here (instead of
    # assigning through train_set[column][i]) avoids chained assignment, which
    # pandas warns about and which does not write back under Copy-on-Write.
    train_set[column] = train_set[column].str.lower().apply(
        lambda cell: [label.strip() for label in cell.split(";")] if pd.notnull(cell) else [])
    mlb = MultiLabelBinarizer()

    # Transform the multi-selection column into a one-hot encoded DataFrame
    one_hot_encoded = pd.DataFrame(mlb.fit_transform(train_set[column]),
                                   columns=mlb.classes_,
                                   index=train_set.index)

    # Append the indicator columns and drop the original list-valued column
    train_set = pd.concat([train_set, one_hot_encoded], axis=1).drop(columns=[column])



In [None]:
# Display the first 20 rows of the fully encoded feature table.
train_set.head(20)

In [None]:
# Fit an XGBoost classifier built from depth-1 trees on the full table.
# Features: every column except GroundTruth.
# Target:   True where the worker's answer code equals GroundTruth.
xgb_model = XGBClassifier(max_depth=1, random_state=40)
xgb_model.fit(
    train_set.drop(columns=["GroundTruth"]),
    train_set["GroundTruth"] == train_set["Answer.option"],
)


In [None]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate
model = xgb_model
# Features: every column except GroundTruth.
X = train_set.drop(labels=["GroundTruth"], axis=1)
# Target: True where the worker's answer code equals GroundTruth.
Y = train_set["GroundTruth"] == train_set["Answer.option"]

# Basic K-Fold Cross Validation (5 shuffled folds, fixed seed)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate all three metrics in a single pass: cross_validate fits the model
# once per fold and scores it with every metric, instead of refitting the same
# folds separately for precision, recall and F1.
cv_results = cross_validate(model, X, Y, cv=kfold, scoring=["precision", "recall", "f1"])
precision_scores = cv_results["test_precision"]
recall_scores = cv_results["test_recall"]
f1_scores = cv_results["test_f1"]

# Report the per-fold scores, their mean and twice their standard deviation.
print("K-Fold Cross Validation Results:")
print(f"Precision Scores: {precision_scores}")
print(f"Average Precision: {precision_scores.mean():.4f} (+/- {precision_scores.std() * 2:.4f})")
print(f"\nRecall Scores: {recall_scores}")
print(f"Average Recall: {recall_scores.mean():.4f} (+/- {recall_scores.std() * 2:.4f})")
print(f"\nF1 Scores: {f1_scores}")
print(f"Average F1: {f1_scores.mean():.4f} (+/- {f1_scores.std() * 2:.4f})")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart: percentage of correct answers per Flesch reading ease bin.
fig, ax = plt.subplots(figsize=(8, 6))

# create bins: 11 evenly spaced edges between the smallest and largest score,
# i.e. 10 equal-width bins
complexity_bins = np.linspace(train_set['Flesch_reading_ease'].min(),
                              train_set['Flesch_reading_ease'].max(),
                              11)

# include_lowest=True closes the first bin on the left. Without it the bins are
# (left, right], so the rows holding the minimum score fall outside every bin
# and are silently dropped from the chart.
# observed=False keeps empty bins in the result so the x axis stays complete.
complexity_groups = train_set.groupby(
    pd.cut(train_set['Flesch_reading_ease'], bins=complexity_bins, include_lowest=True),
    observed=False)
complexity_correct = complexity_groups.apply(lambda x: (x['GroundTruth'] == x["Answer.option"]).mean() * 100)

# create plot
bars = ax.bar(range(len(complexity_correct)), complexity_correct, color='lightgreen')
ax.set_title('Distribution of Correct Labels by Reading Complexity', pad=20)
ax.set_xlabel('Flesch Reading Ease Score', labelpad=10)
ax.set_ylabel('Percentage Correct (%)', labelpad=10)

# add labels; pd.cut nudges the first left edge slightly below the minimum when
# include_lowest=True, so clamp it back to the real lower bound for display
ax.set_xticks(range(len(complexity_correct)))
ax.set_xticklabels([f'{max(bin.left, complexity_bins[0]):.1f}-{bin.right:.1f}'
                    for bin in complexity_correct.index],
                   rotation=45)

for bar in bars:
    height = bar.get_height()
    # An empty bin has no percentage; skip it rather than placing a "nan%"
    # label at a non-finite position.
    if np.isnan(height):
        continue
    ax.text(bar.get_x() + bar.get_width() / 2., height + 1,
            f'{height:.1f}%',
            ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt  # imported here too so the cell does not rely on an earlier cell
import numpy as np

# Bar chart: percentage of correct answers per explanation-length bin.
fig, ax = plt.subplots(figsize=(8, 6))

# create bins: 11 evenly spaced edges between the shortest and longest
# explanation, i.e. 10 equal-width bins
length_bins = np.linspace(train_set['Answer.explanation_length'].min(),
                          train_set['Answer.explanation_length'].max(),
                          11)

# include_lowest=True closes the first bin on the left. Without it the bins are
# (left, right], so the rows holding the minimum length fall outside every bin
# and are silently dropped from the chart.
# observed=False keeps empty bins in the result so the x axis stays complete.
length_groups = train_set.groupby(
    pd.cut(train_set['Answer.explanation_length'], bins=length_bins, include_lowest=True),
    observed=False)
length_correct = length_groups.apply(lambda x: (x['GroundTruth'] == x["Answer.option"]).mean() * 100)

# create plot
bars = ax.bar(range(len(length_correct)), length_correct, color='skyblue')
ax.set_title('Distribution of Correct Labels by Explanation Length', pad=20)
ax.set_xlabel('Explanation Length (characters)', labelpad=10)
ax.set_ylabel('Percentage Correct (%)', labelpad=10)

# add labels; pd.cut nudges the first left edge slightly below the minimum when
# include_lowest=True, so clamp it back to the real lower bound for display
ax.set_xticks(range(len(length_correct)))
ax.set_xticklabels([f'{int(max(bin.left, length_bins[0]))}-{int(bin.right)}'
                    for bin in length_correct.index],
                   rotation=45)

for bar in bars:
    height = bar.get_height()
    # An empty bin has no percentage; skip it rather than placing a "nan%"
    # label at a non-finite position.
    if np.isnan(height):
        continue
    ax.text(bar.get_x() + bar.get_width() / 2., height + 1,
            f'{height:.1f}%',
            ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Plot the fitted model's feature importances, limited to at most 10 features.
xgboost.plot_importance(xgb_model, max_num_features=10)

In [None]:
from xgboost import plot_tree
# Draw one tree of the fitted model on a large (30x30 inch) canvas.
# Relies on `plt` having been imported by an earlier plotting cell.
fig, ax = plt.subplots(figsize=(30, 30))
# NOTE(review): num_trees=1 presumably selects the tree at index 1 (the second
# boosted tree, if indexing is zero-based) - confirm this is the intended tree.
plot_tree(xgb_model, num_trees=1, ax=ax)
plt.show()