In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, classification_report

def calculate_explanation_complexity(answer):
  """Return the lexical diversity (unique words / total words) of an explanation.

  Args:
    answer: The explanation text. Any non-string value is treated as
      having no text.

  Returns:
    The ratio of distinct words to total words for text that contains at
    least one word, otherwise 0.
  """
  if not isinstance(answer, str):
    return 0
  # split() without an argument splits on any run of whitespace and drops
  # empty tokens, so repeated spaces, tabs and newlines are not counted as
  # words (split(' ') would count every empty token between spaces).
  words = answer.split()
  if not words:
    # Empty or whitespace-only text has no words to measure.
    return 0
  return len(set(words)) / len(words)

def calculate_explanation_size(answer):
  """Return the number of characters in `answer`, or 0 when it is not a string."""
  return len(answer) if isinstance(answer, str) else 0

# Model overview
#   Input:  participant score and profession, duration, explanation
#           size/complexity, confidence and difficulty
#   Output: label answer as correct or incorrect (the GroundTruth column)

# Feature columns, written once so train and holdout always use the same set.
feature_columns = [
    'Worker.score',
    'Worker.profession_encoded',
    'Answer.duration',
    'Answer.size',
    'Answer.complexity',
    'Answer.confidence',
    'Answer.difficulty',
]

data = pd.read_csv('data.csv')

# Derive the two text features from the explanation column.
explanations = data['Answer.explanation']
data['Answer.complexity'] = explanations.apply(calculate_explanation_complexity)
data['Answer.size'] = explanations.apply(calculate_explanation_size)

# Encode the profession strings as integer labels for the classifier.
encoder = LabelEncoder()
data['Worker.profession_encoded'] = encoder.fit_transform(data['Worker.profession'])

# Profession groups: the model is trained and first evaluated on students only.
student = ['Undergraduate_Student', 'Graduate_Student']
non_student = ['Professional_Developer', 'Hobbyist', 'Other']

is_student = data['Worker.profession'].isin(student)
student_set = data[is_student]

# Alternative split kept for reference: train on undergraduates, hold out graduates.
#train = student_set[student_set['Worker.profession'] == 'Undergraduate_Student']
#holdout = student_set[student_set['Worker.profession'] == 'Graduate_Student']

# Seeded 70/30 split of the student rows.
train, holdout = train_test_split(student_set, test_size=0.3, random_state=42)

X_train, y_train = train[feature_columns], train['GroundTruth']
X_holdout, y_holdout = holdout[feature_columns], holdout['GroundTruth']

clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

print("Accuracy score: ", clf.score(X_holdout, y_holdout))

# Baseline metrics on the student-only holdout; later steps compare against these.
y_predict = clf.predict(X_holdout)
initial_precision = precision_score(y_holdout, y_predict)
initial_recall = recall_score(y_holdout, y_predict)

print("Initial:")
print(f"Precision: {initial_precision}")
print(f"Recall: {initial_recall}")

# Robustness check: how many non-student answers must be added to the student
# holdout before the student-trained classifier's precision / recall fall by
# 5% and 10% relative to the student-only baseline?
non_student_set = data[data['Worker.profession'].isin(non_student)]

# Reuse exactly the columns the classifier was fitted on.
feature_columns = list(X_train.columns)

# Relative drops to look for, keyed by the suffix used in `thresholds`.
drop_levels = {'5': 0.05, '10': 0.10}

# Number of added non-student rows at which each drop was first observed
# (None while the drop has not been observed).
thresholds = {
    'precision_5': None,
    'precision_10': None,
    'recall_5': None,
    'recall_10': None
}


def _relative_drop(initial, current):
    """Return the decrease from `initial` to `current` as a fraction of `initial`.

    A zero baseline has no meaningful relative drop (and would divide by
    zero), so 0.0 is returned and no threshold is triggered.
    """
    if initial == 0:
        return 0.0
    return (initial - current) / initial


for i in range(1, len(non_student_set) + 1):
    # The fixed random_state makes the draw for each sample size reproducible.
    sampled_non_students = non_student_set.sample(i, random_state=42)
    new_holdout = pd.concat([holdout, sampled_non_students], ignore_index=True)

    X_holdout = new_holdout[feature_columns]
    y_holdout = new_holdout['GroundTruth']

    y_predict = clf.predict(X_holdout)

    current_precision = precision_score(y_holdout, y_predict)
    current_recall = recall_score(y_holdout, y_predict)

    drops = {
        'precision': _relative_drop(initial_precision, current_precision),
        'recall': _relative_drop(initial_recall, current_recall),
    }
    # Record only the first sample size at which each (metric, level) is reached.
    for metric, drop in drops.items():
        for suffix, level in drop_levels.items():
            key = f'{metric}_{suffix}'
            if thresholds[key] is None and drop >= level:
                thresholds[key] = i

    # Stop as soon as every threshold has been found.
    if all(value is not None for value in thresholds.values()):
        break

# Average the precision and recall thresholds; undefined unless both were observed.
if thresholds['precision_5'] is not None and thresholds['recall_5'] is not None:
    avg_drop_5 = (thresholds['precision_5'] + thresholds['recall_5']) / 2
else:
    avg_drop_5 = None

if thresholds['precision_10'] is not None and thresholds['recall_10'] is not None:
    avg_drop_10 = (thresholds['precision_10'] + thresholds['recall_10']) / 2
else:
    avg_drop_10 = None

print("\nAverage Number of Non-Students Added:")
if avg_drop_5 is not None:
    print(f"Average for 5% drop: {avg_drop_5}")
else:
    print("5% drop not observed for both metrics.")

if avg_drop_10 is not None:
    print(f"Average for 10% drop: {avg_drop_10}")
else:
    print("10% drop not observed for both metrics.")



Accuracy score:  0.8065843621399177
Initial:
Precision: 1.0
Recall: 0.04081632653061224

Average Number of Non-Students Added:
Average for 5% drop: 96.0
Average for 10% drop: 106.0


In [2]:
# Precision and recall of the previous model, used as the reference point.
old_model_precision = 0.9711286089238845
old_model_recall = 0.8809523809523809

# Largest relative deviation from the old model's metrics that still counts as a match.
tolerance = 0.2

# 0 doubles as the "no matching sample size found" value: the search starts at 1.
min_non_student_size = 0

# Reuse exactly the columns the classifier was fitted on.
feature_columns = list(X_train.columns)

for i in range(1, len(non_student_set) + 1):
    # The fixed random_state makes the draw for each sample size reproducible.
    sampled_non_students = non_student_set.sample(i, random_state=42)
    new_holdout = pd.concat([holdout, sampled_non_students], ignore_index=True)

    X_holdout = new_holdout[feature_columns]
    y_holdout = new_holdout['GroundTruth']

    y_predict = clf.predict(X_holdout)

    current_precision = precision_score(y_holdout, y_predict)
    current_recall = recall_score(y_holdout, y_predict)

    # Relative distance of each metric from the old model's value.
    precision_gap = abs(current_precision - old_model_precision) / old_model_precision
    recall_gap = abs(current_recall - old_model_recall) / old_model_recall

    # Stop at the first sample size where both metrics are within tolerance.
    if precision_gap <= tolerance and recall_gap <= tolerance:
        min_non_student_size = i
        break

print("\nMinimum Non-Student Size to have same outcome with old model:", min_non_student_size)
if min_non_student_size == 0:
    # Without this line a 0 reads as "no non-students are needed" rather than "no match".
    print(f"No sample size brought both precision and recall within {tolerance:.0%} of the old model.")



Minimum Non-Student Size to have same outcome with old model: 0
