In [32]:
from glob import glob
import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.tree import DecisionTreeClassifier # type: ignore
import numpy as np # type: ignore

In [25]:
ANSWERS_FILE = "./data/answerList_data.csv"
CODE_FILES = glob("./data/*.java")

In [73]:
df = pd.read_csv(ANSWERS_FILE)
df = df.drop(columns=['Answer.ID', 'Question.ID', 'Worker.ID'])
df

Unnamed: 0,FailingMethod,Answer.duration,Answer.confidence,Answer.difficulty,GroundTruth,TP,TN,FN,FP,Answer.option,...,Code.LOC,Code.complexity,Worker.score,Worker.profession,Worker.yearsOfExperience,Worker.age,Worker.gender,Worker.whereLearnedToCode,Worker.country,Worker.programmingLanguage
0,HIT01_8,90.984,4,2,0,0,1,0,0,NO,...,1,1,5,Undergraduate_Student,7.0,21,Male,High School,United States,Java; C++; C#
1,HIT01_8,133.711,5,1,0,0,1,0,0,NO,...,1,1,4,Undergraduate_Student,10.0,25,Female,High School;University;Web,United States,c#
2,HIT01_8,77.696,5,2,0,0,1,0,0,NO,...,1,1,5,Professional_Developer,7.0,24,Male,High School;University;Web,United States,C++;Java;PHP
3,HIT01_8,46.644,1,1,0,0,1,0,0,NO,...,1,1,5,Professional_Developer,20.0,38,Male,High School;University;Other On the job,USA,C#
4,HIT01_8,215.416,5,5,0,0,1,0,0,NO,...,1,1,3,Undergraduate_Student,5.0,19,Male,High School;University;Web,United States,C++; Java
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,HIT08_54,220.420,2,4,0,0,1,0,0,NO,...,1,1,3,Graduate_Student,2.0,28,Male,University,USA,C#
2576,HIT08_54,322.790,4,3,0,0,0,0,1,YES,...,1,1,5,Professional_Developer,17.0,39,Male,University;Web,USA,C#; VB.NET; Java
2577,HIT08_54,159.530,5,1,0,0,1,0,0,NO,...,1,1,4,Professional_Developer,10.0,31,Male,High School;University;Web;Other work,usa,C++
2578,HIT08_54,68.578,5,1,0,0,1,0,0,NO,...,1,1,4,Undergraduate_Student,4.0,19,Male,University;Web;Other FIRST Robotics,United States,C++


## Average Explanation Length

In [74]:
explanations = [str(v) for v in df['Answer.explanation'].tolist()]
explanation_median_length = np.mean([len(v) for v in explanations])
print(f'Median Length of Explanations: {explanation_median_length}')

Median Length of Explanations: 136.56046511627906


## Explanation Complexity

In [84]:
def number_of_words(expl):
    return len(expl.split(" "))

def calculate_complexity(explanations):
    method = number_of_words

    return [method(expl) for expl in explanations]

explanation_complexities = calculate_complexity(explanations)
df['Answer.explanationComplexity'] = explanation_complexities

y

In [87]:
jobs = df['Worker.profession'].unique().tolist()
genders = df['Worker.gender'].unique().tolist()
countries = df['Worker.country'].unique().tolist()
answer = df['Answer.option'].unique().tolist()

def method_to_int(method_name):
    method_num = method_name.split("_")[-1]

    try:
        return int(method_num)
    except ValueError:
        return 0

def score_for_where_learned_to_code(where_learned_to_code):
    score = 0
    if 'University' in where_learned_to_code:
        score += 1000
    if 'High School' in where_learned_to_code:
        score += 100
    if 'Web' in where_learned_to_code:
        score += 50
    if 'Books' in where_learned_to_code:
        score += 100
    return score

def score_for_programming_languages(pls):
    known_languages = ['java', 'python', 'javascript', 'c', 'c++', 'c#', 'ruby', 'php', 'sql', 'html', 'css', 'xml', 'json', 'bash', 'shell', 'perl', 'go', 'rust', 'kotlin', 'swift', 'objective-c', 'scala', 'haskell', 'erlang', 'elixir', 'clojure', 'groovy', 'lisp', 'prolog', 'r']

    score = 0
    for pl in known_languages:
        if pl in pls.lower():
            score += 1
    return score

prepared_data = df.copy()

prepared_data['Worker.profession'] = [jobs.index(job) for job in df['Worker.profession'].tolist()]
prepared_data['Worker.profession'] = [genders.index(gender) for gender in df['Worker.gender'].tolist()]
prepared_data['Worker.country'] = [countries.index(country) for country in df['Worker.country'].tolist()]
prepared_data['Worker.whereLearnedToCode'] = [score_for_where_learned_to_code(where_learned_to_code) for where_learned_to_code in df['Worker.whereLearnedToCode'].tolist()]
prepared_data['Worker.programmingLanguage'] = [score_for_programming_languages(str(pl)) for pl in df['Worker.programmingLanguage'].tolist()]
prepared_data['Answer.option'] = [answer.index(answer) for answer in df['Answer.option'].tolist()]
prepared_data['FailingMethod'] = [method_to_int(method) for method in df['FailingMethod'].tolist()]
prepared_data['Worker.gender'] = [genders.index(gender) for gender in df['Worker.gender'].tolist()]

prepared_data

Unnamed: 0,FailingMethod,Answer.duration,Answer.confidence,Answer.difficulty,GroundTruth,TP,TN,FN,FP,Answer.option,...,Code.complexity,Worker.score,Worker.profession,Worker.yearsOfExperience,Worker.age,Worker.gender,Worker.whereLearnedToCode,Worker.country,Worker.programmingLanguage,Answer.explanationComplexity
0,8,90.984,4,2,0,0,1,0,0,0,...,1,5,0,7.0,21,0,100,0,4,16
1,8,133.711,5,1,0,0,1,0,0,0,...,1,4,1,10.0,25,1,1150,0,2,13
2,8,77.696,5,2,0,0,1,0,0,0,...,1,5,0,7.0,24,0,1150,0,4,29
3,8,46.644,1,1,0,0,1,0,0,0,...,1,5,0,20.0,38,0,1100,1,2,11
4,8,215.416,5,5,0,0,1,0,0,0,...,1,3,0,5.0,19,0,1150,0,3,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,54,220.420,2,4,0,0,1,0,0,0,...,1,3,0,2.0,28,0,1000,1,2,30
2576,54,322.790,4,3,0,0,0,0,1,0,...,1,5,0,17.0,39,0,1050,1,3,28
2577,54,159.530,5,1,0,0,1,0,0,0,...,1,4,0,10.0,31,0,1150,11,2,10
2578,54,68.578,5,1,0,0,1,0,0,0,...,1,4,0,4.0,19,0,1050,0,2,10


In [108]:
test_set_size = 20
test_set_ratio = test_set_size / len(df)

X = prepared_data.drop(columns=['GroundTruth', 'TP', 'FP', 'TN', 'FN', 'Answer.explanation'])
y = prepared_data.iloc[:, 4:5]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69, test_size=test_set_ratio)

## Part 2: Train Model

In [113]:
dtc = DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.004)
dtc.fit(X_train, y_train)

In [114]:
y_pred = dtc.predict(X_test)

In [115]:
from sklearn.metrics import confusion_matrix # type: ignore
from sklearn.metrics import classification_report # type: ignore

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88        14
           1       1.00      0.33      0.50         6

    accuracy                           0.80        20
   macro avg       0.89      0.67      0.69        20
weighted avg       0.84      0.80      0.76        20



In [116]:
features = pd.DataFrame(dtc.feature_importances_, index=X.columns, columns=['Importance'])
features.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Code.LOC,1.0
FailingMethod,0.0
Worker.profession,0.0
Worker.programmingLanguage,0.0
Worker.country,0.0
Worker.whereLearnedToCode,0.0
Worker.gender,0.0
Worker.age,0.0
Worker.yearsOfExperience,0.0
Worker.score,0.0
