In [None]:
import os
import textstat
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, LabelEncoder

from xgboost import XGBClassifier  #Does not work with sklearn version>=1.6

In [None]:
# Load the answer list; the following cells derive every feature from this frame.
full_set = pd.read_csv("../../data/answerList_data.csv")
file_path = "../../data/"

# Collect the text of every file in the data directory whose name starts with
# "HIT", keyed by the part of the file name before its first dot.
files = {}
for filename in os.listdir(file_path):
    if filename.startswith("HIT"):
        # The context manager closes the handle as soon as the content is read
        # instead of leaving open descriptors behind for the garbage collector.
        with open(os.path.join(file_path, filename), "r") as source_file:
            files[filename.split(".")[0]] = source_file.read()

In [None]:
# Preview the first 50 rows of the data set (rendered as this cell's output).
full_set.head(50)

In [None]:
# Integer codes that stand in for the textual answer options.
replace_dict = {"NO": 0, "YES": 1, "IDK": 2}

# Shuffle all rows with a fixed seed, then renumber the index from zero.
full_set = (
    full_set
    .sample(frac=1, random_state=20)
    .reset_index(drop=True)
)

# Swap each answer option for its integer code.
full_set["Answer.option"] = full_set["Answer.option"].replace(to_replace=replace_dict)

In [None]:
# Derive two features from the free-text explanation: its length in characters
# and its Flesch reading-ease score (computed by textstat). Rows without an
# explanation get None for both features.
explanations = full_set["Answer.explanation"]
full_set["Answer.explanation_length"] = explanations.apply(
    lambda text: None if pd.isnull(text) else len(str(text)))
full_set["Flesch_reading_ease"] = explanations.apply(
    lambda text: None if pd.isnull(text) else textstat.flesch_reading_ease(text))

# The raw text column is dropped once the derived features exist.
full_set = full_set.drop(columns=["Answer.explanation"])

In [None]:
# Drop the columns that the remaining cells do not need.
unneeded_columns = ["Answer.ID", "Question.ID", "FP", "FN", "TP", "TN", "Worker.ID"]
full_set = full_set.drop(columns=unneeded_columns)


In [None]:

# Standardise every int64/float64 column in place, except the two label columns
# and the two explanation-derived features listed below.
unscaled_cols = ["Answer.explanation_length", "Flesch_reading_ease", "GroundTruth", "Answer.option"]
numerical_cols = (
    full_set
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(unscaled_cols)
)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(full_set[numerical_cols])
full_set[numerical_cols] = scaled_values


In [None]:
# Label-encode every object-typed column except the multi-select columns in
# multi_cols, replacing the original column with its integer codes.
multi_cols = ["Worker.whereLearnedToCode", "Worker.programmingLanguage"]
label_encoder = LabelEncoder()
# For FailingMethod and Worker.profession: original label -> integer code.
mapping_dict = {}
for column in full_set.columns:
    if column in multi_cols or full_set[column].dtype != "object":
        continue

    # Encode exactly once. Encoding an already-encoded column again would sort
    # the stringified codes lexicographically ("10" before "2") and scramble
    # them as soon as a column has more than ten distinct values.
    full_set[column] = label_encoder.fit_transform(full_set[column].astype(str))

    if column in ("FailingMethod", "Worker.profession"):
        # LabelEncoder assigns each label its position in the sorted classes_
        # array, so the lookup must be built from classes_. Enumerating
        # Series.unique() follows order of appearance and yields codes that do
        # not match the values now stored in the column.
        mapping_dict[column] = {
            str(label): code for code, label in enumerate(label_encoder.classes_)
        }


In [None]:
# One-hot encode the multi-select columns: each cell holds ";"-separated labels.
multi_cols = ["Worker.whereLearnedToCode", "Worker.programmingLanguage"]
for column in multi_cols:
    # Lower-case, split on ";" and strip surrounding whitespace in one pass;
    # missing cells become an empty label list. Building the lists here avoids
    # the chained assignment full_set[column][i] = ..., which pandas does not
    # guarantee to write back to the frame.
    label_lists = full_set[column].str.lower().apply(
        lambda cell: [label.strip() for label in cell.split(";")] if pd.notnull(cell) else [])

    mlb = MultiLabelBinarizer()

    # Transform the label lists into a one-hot encoded DataFrame, one column
    # per distinct label, aligned with full_set's index.
    # NOTE(review): a label that occurs in both multi-select columns would
    # produce two columns with the same name - confirm this cannot happen.
    one_hot_encoded = pd.DataFrame(mlb.fit_transform(label_lists),
                                   columns=mlb.classes_,
                                   index=full_set.index)

    # Replace the original column with its one-hot encoded columns.
    full_set = pd.concat([full_set.drop(columns=[column]), one_hot_encoded], axis=1)



In [None]:
# Split full_set by the encoded Worker.profession value, looking up each
# profession's integer code in mapping_dict.
profession_codes = mapping_dict["Worker.profession"]
student_codes = [profession_codes["Undergraduate_Student"], profession_codes["Graduate_Student"]]
is_student = full_set["Worker.profession"].isin(student_codes)

# Everyone who is neither an undergraduate nor a graduate student.
non_student_set = full_set[~is_student]

professional_codes = [profession_codes["Professional_Developer"]]
professional_set = full_set[full_set["Worker.profession"].isin(professional_codes)]

hobbyist_codes = [profession_codes["Hobbyist"], profession_codes["Other"]]
hobbyist_set = full_set[full_set["Worker.profession"].isin(hobbyist_codes)]

# Undergraduate and graduate students together.
student_set = full_set[is_student]


In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score, matthews_corrcoef
import numpy as np


def _answer_is_correct(frame):
    """Return 1 for rows whose Answer.option equals GroundTruth, else 0."""
    return (frame["GroundTruth"] == frame["Answer.option"]).astype(int)


# Learning-curve experiment: for each training-set size n, train on n randomly
# sampled non-student rows and evaluate on every row of full_set that was not
# sampled. The model predicts whether an answer matches the ground truth.
RESULT_COLUMNS = ["Number of Non-Students", "Precision_Holdout", "Recall_Holdout",
                  "F1_Holdout", "MCC_Holdout"]

xgb_model = XGBClassifier(random_state=40, max_depth=1)

# One seeded generator drives all sampling so a re-run draws the same training sets.
sampler = np.random.RandomState(40)

# Resampling below can only terminate if both classes exist in the pool.
if len(non_student_set) > 2 and _answer_is_correct(non_student_set).nunique() < 2:
    raise ValueError("non_student_set contains only one class; cannot build a two-class training set")

records = []
for n in range(2, len(non_student_set)):
    # Redraw until both classes are present in the training set (a single retry
    # can still come back with only one class, especially for small n).
    while True:
        train_set = non_student_set.sample(n=n, random_state=sampler)
        Y = _answer_is_correct(train_set)
        if Y.nunique() == 2:
            break

    # holdout set contains all the other samples
    holdout_set = full_set[~full_set.index.isin(train_set.index)]

    # The same estimator object is refit on every iteration; after the loop
    # xgb_model holds the model of the last (largest) training set.
    model = xgb_model
    X = train_set.drop(columns=["GroundTruth"])
    model.fit(X, Y)

    # compute the recall, precision, f1 and matthews correlation coefficient
    y_true = _answer_is_correct(holdout_set)
    Y_pred = model.predict(holdout_set.drop(columns=["GroundTruth"]))

    # Collect plain records and build the frame once after the loop; growing a
    # DataFrame with concat on every iteration is quadratic.
    records.append({
        "Number of Non-Students": n,
        "Precision_Holdout": precision_score(y_true, Y_pred),
        "Recall_Holdout": recall_score(y_true, Y_pred),
        "F1_Holdout": f1_score(y_true, Y_pred),
        "MCC_Holdout": matthews_corrcoef(y_true, Y_pred),
    })

    if n % 20 == 0:
        print("Number of non_student_set for training: " + str(n))

# The size column is named "Number of Non-Students" throughout, so the frame
# has no unused all-NaN column next to it.
results = pd.DataFrame(records, columns=RESULT_COLUMNS)
results.to_csv("results1_2.csv")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart: share of correct answers (Answer.option == GroundTruth) in
# train_set, grouped into ten equal-width Flesch reading-ease bins.
fig, ax = plt.subplots(figsize=(8, 6))

# create bins: 11 edges -> 10 equal-width bins spanning the observed range
complexity_bins = np.linspace(train_set['Flesch_reading_ease'].min(),
                              train_set['Flesch_reading_ease'].max(),
                              11)

# include_lowest=True keeps the rows holding the minimum score; with the
# default left-open first interval they would be dropped from the chart.
complexity_binned = pd.cut(train_set['Flesch_reading_ease'],
                           bins=complexity_bins,
                           include_lowest=True)
answer_correct = train_set['GroundTruth'] == train_set["Answer.option"]
# observed=False keeps empty bins (as NaN) so bars stay aligned with the labels.
complexity_correct = answer_correct.groupby(complexity_binned, observed=False).mean() * 100

# create plot
bars = ax.bar(range(len(complexity_correct)), complexity_correct, color='lightgreen')
ax.set_title('Distribution of Correct Labels by Reading Complexity', pad=20)
ax.set_xlabel('Flesch Reading Ease Score', labelpad=10)
ax.set_ylabel('Percentage Correct (%)', labelpad=10)

# add labels, taken from the bin edges themselves
ax.set_xticks(range(len(complexity_correct)))
ax.set_xticklabels([f'{left_edge:.1f}-{right_edge:.1f}'
                    for left_edge, right_edge in zip(complexity_bins[:-1], complexity_bins[1:])],
                   rotation=45)

for bar in bars:
    height = bar.get_height()
    if np.isnan(height):
        # Empty bin: there is no percentage to annotate.
        continue
    ax.text(bar.get_x() + bar.get_width() / 2., height + 1,
            f'{height:.1f}%',
            ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart: share of correct answers (Answer.option == GroundTruth) in
# train_set, grouped into ten equal-width explanation-length bins.
fig, ax = plt.subplots(figsize=(8, 6))

# create bins: 11 edges -> 10 equal-width bins spanning the observed range
length_bins = np.linspace(train_set['Answer.explanation_length'].min(),
                          train_set['Answer.explanation_length'].max(),
                          11)

# include_lowest=True keeps the rows holding the minimum length; with the
# default left-open first interval they would be dropped from the chart.
length_binned = pd.cut(train_set['Answer.explanation_length'],
                       bins=length_bins,
                       include_lowest=True)
answer_correct = train_set['GroundTruth'] == train_set["Answer.option"]
# observed=False keeps empty bins (as NaN) so bars stay aligned with the labels.
length_correct = answer_correct.groupby(length_binned, observed=False).mean() * 100

# create plot
bars = ax.bar(range(len(length_correct)), length_correct, color='skyblue')
ax.set_title('Distribution of Correct Labels by Explanation Length', pad=20)
ax.set_xlabel('Explanation Length (characters)', labelpad=10)
ax.set_ylabel('Percentage Correct (%)', labelpad=10)

# add labels, taken from the bin edges themselves
ax.set_xticks(range(len(length_correct)))
ax.set_xticklabels([f'{int(left_edge)}-{int(right_edge)}'
                    for left_edge, right_edge in zip(length_bins[:-1], length_bins[1:])],
                   rotation=45)

for bar in bars:
    height = bar.get_height()
    if np.isnan(height):
        # Empty bin: there is no percentage to annotate.
        continue
    ax.text(bar.get_x() + bar.get_width() / 2., height + 1,
            f'{height:.1f}%',
            ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Plot feature importances of xgb_model, limited to at most ten features.
# xgb_model is the estimator that the training loop refits on every iteration,
# so this reflects the fit from the loop's final iteration.
xgboost.plot_importance(xgb_model, max_num_features=10)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Line chart of the holdout metrics against the number of non-student rows in
# the training set, saved to model_non_student_training.png.

# (metric name, line colour) for every plotted metric
metrics = [('Precision', 'blue'), ('Recall', 'red'), ('F1', 'green'), ('MCC', 'purple')]

# Reference values drawn as dotted horizontal lines (from 5-fold CV).
cv_targets = {'Precision': 0.8498, 'Recall': 0.8806, 'F1': 0.8647}

# Work on a numeric copy of the results.
# NOTE(review): the coercion guards against object-typed columns, which can
# arise when the frame was assembled by concatenation - confirm whether needed.
df = results.copy()
df['Number of Non-Students'] = pd.to_numeric(df['Number of Non-Students'])
for metric, _ in metrics:
    df[f'{metric}_Holdout'] = df[f'{metric}_Holdout'].astype(float)

# Create the figure
fig, ax1 = plt.subplots(figsize=(10, 6))

# Mean and standard deviation of every metric per training-set size; named
# aggregation yields flat "<Metric>_mean" / "<Metric>_std" columns directly.
agg_spec = {
    f'{metric}_{stat}': (f'{metric}_Holdout', stat)
    for metric, _ in metrics
    for stat in ('mean', 'std')
}
grouped_stats = (
    df.groupby('Number of Non-Students')
    .agg(**agg_spec)
    .reset_index()
    .rename(columns={'Number of Non-Students': 'Number_of_Non_Students'})
)

# Plot the means with a band of +/- one standard deviation
for metric, color in metrics:
    mean_col = f'{metric}_mean'
    std_col = f'{metric}_std'

    ax1.plot(grouped_stats['Number_of_Non_Students'],
             grouped_stats[mean_col],
             label=metric,
             color=color,
             linewidth=2)

    # The standard deviation is undefined (NaN) for a size with a single run;
    # treat it as zero spread so the band collapses onto the line there.
    spread = grouped_stats[std_col].fillna(0)
    ax1.fill_between(grouped_stats['Number_of_Non_Students'],
                     grouped_stats[mean_col] - spread,
                     grouped_stats[mean_col] + spread,
                     color=color,
                     alpha=0.1)

# Add target lines from 5-fold CV, coloured like the matching metric
for metric, color in metrics:
    if metric in cv_targets:
        target = cv_targets[metric]
        ax1.axhline(y=target, color=color, linestyle=':', alpha=0.5,
                    label=f'Target {metric} ({target})')

# Customize plots
ax1.set_title('Performance Metrics vs Number of Non-Students in Training Set', pad=20)
ax1.set_xlabel('Number of Non-Students in Training Set')
ax1.set_ylabel('Score')
ax1.grid(True, alpha=0.3)
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.set_ylim(0, 1)

# Adjust layout
fig.tight_layout()

# Save the plot, then close this figure; it is written to disk, not shown inline
fig.savefig('model_non_student_training.png', bbox_inches='tight', dpi=300)
plt.close(fig)