In [None]:
import pandas as pd
import numpy as np

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
from projectConstants import * 

## Load All Data Into `*_unedited` Data Frames

In [None]:
# Read every raw CSV into a pristine `*_unedited` frame that is never modified.
mid_scores_unedited = pd.read_csv("data/mid_scores.csv")
timeBetweenRuns_unedited = pd.read_csv('./data/timeBetweenRuns.csv')
coding_sessions_unedited = pd.read_csv("data/codingSessions.csv")
assignment_keystroke_info_unedited = pd.read_csv("data/assignmentKeystrokeInfo.csv")

# Working copies: the rest of the notebook operates on these, not on the raw frames.
mid_scores, timeBetweenRuns, coding_sessions, assignment_keystroke_info = (
    raw_frame.copy()
    for raw_frame in (
        mid_scores_unedited,
        timeBetweenRuns_unedited,
        coding_sessions_unedited,
        assignment_keystroke_info_unedited,
    )
)

In [None]:
# Join (inner, the pandas default) each score row with its run-timing rows
# for the same subject and assignment.
mid_scores_with_runtime_info = pd.merge(
    mid_scores,
    timeBetweenRuns,
    on=[SUBJECT_ID_KEY, ASSIGNMENT_ID_KEY],
)

In [None]:
# Bare last expression: the notebook renders the merged scores/run-timing frame.
mid_scores_with_runtime_info

#### Generate Merged Data Frames

In [None]:
# Join (inner, the pandas default) each score row with the keystroke summary
# for the same subject and assignment.
mid_scores_with_assignment_info = pd.merge(
    mid_scores,
    assignment_keystroke_info,
    on=[SUBJECT_ID_KEY, ASSIGNMENT_ID_KEY],
)

In [None]:
# Bare last expression: the notebook renders the merged scores/keystroke frame.
mid_scores_with_assignment_info

### Plot Mid Scores over Assignment

In [None]:
# Display a copy of the scores ordered by assignment; mid_scores itself is not modified.
mid_scores.sort_values(by='AssignmentID')

In [None]:
# MID_Score against AssignmentID, one colour per SubjectID.
sns.scatterplot(x='AssignmentID', y='MID_Score', hue='SubjectID', data=mid_scores)

In [None]:
# AssignmentScore against MID_Score, written to disk through the figure that owns the axes.
score_axes = sns.scatterplot(x='MID_Score', y='AssignmentScore', data=mid_scores)
score_axes.figure.savefig('./images/AssignmentScore_MidScore.png')

In [None]:
# List the column labels of mid_scores.
mid_scores.columns

In [None]:
# Scatter every pair of mid_scores columns against each other in a two-column grid,
# coloured by the 'Incremental' column.
# 'Unnamed: 0' is skipped (presumably a saved index column - TODO confirm).
variables = [column for column in mid_scores.columns if column != 'Unnamed: 0']
pairs = list(itertools.combinations(variables, 2))

if pairs:  # plt.subplots cannot build a grid with zero rows
    n_cols = 2
    # Round up so an odd number of pairs still gets a panel for the last pair.
    n_rows = (len(pairs) + n_cols - 1) // n_cols
    # squeeze=False keeps axs two-dimensional even when there is a single row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 48), squeeze=False)
    panels = axs.ravel()  # row-major order: left to right, then top to bottom

    for ax, (x_var, y_var) in zip(panels, pairs):
        sns.scatterplot(data=mid_scores, x=x_var, y=y_var, hue='Incremental', ax=ax)
        ax.set_title(f'{x_var} vs {y_var}')

    # Hide the spare panel left over when the pair count is odd.
    for unused_ax in panels[len(pairs):]:
        unused_ax.set_visible(False)

    fig.tight_layout()

In [None]:
# Distribution of MID_Score within each AssignmentID.
sns.boxplot(x='AssignmentID', y='MID_Score', data=mid_scores)

In [None]:
# Number of mid_scores rows recorded for each AssignmentID.
sns.countplot(x='AssignmentID', data=mid_scores)

#### Mid Score in Relation to Final Grade

In [None]:
# Per student: percentage of their rows whose INCREMENTAL_KEY value equals 1,
# plotted against the student's final score.
students = mid_scores[SUBJECT_ID_KEY].unique()

# Collect plain records and build the frame once instead of concatenating inside the loop.
score_records = []
# sort=False keeps students in order of first appearance, matching `students`.
for student, student_df in mid_scores.groupby(SUBJECT_ID_KEY, sort=False):
    # Only students with MORE than 3 rows in mid_scores are included.
    if len(student_df) > 3:
        incremental_flags = student_df[INCREMENTAL_KEY]
        score_records.append({
            SUBJECT_ID_KEY: student,
            # First value seen for this student; no check is made that it is the same on every row.
            FINAL_SCORE_KEY: student_df[FINAL_SCORE_KEY].iloc[0],
            # count() ignores missing flags, so they do not dilute the percentage.
            'IncrementalPercent': ((incremental_flags == 1).sum() / incremental_flags.count()) * 100,
        })

# Explicit columns keep the frame plottable even when no student qualifies.
score_df = pd.DataFrame(
    score_records,
    columns=[SUBJECT_ID_KEY, FINAL_SCORE_KEY, 'IncrementalPercent'],
).dropna()

ax = sns.scatterplot(x=FINAL_SCORE_KEY, y='IncrementalPercent', data=score_df)
ax.set(title='Incremental Development in Relation to Final Score')
ax.set_ylabel("Percentage of Incremental Development Over Course")
ax.set_xlabel("Final Score")

### Analysis of Time Between Runs

In [None]:
# For every (student, assignment) pair: the average time between runs in whole
# seconds, the number of run rows, and the MID score.
students = timeBetweenRuns[SUBJECT_ID_KEY].unique()
assignments = timeBetweenRuns[ASSIGNMENT_ID_KEY].unique()
timeBetweenRunsDf = timeBetweenRuns.copy()  # not used in this cell; kept in case another cell relies on it

pair_keys = [SUBJECT_ID_KEY, ASSIGNMENT_ID_KEY]

# Rows with any missing value are excluded from both the totals and the run count.
complete_runs = mid_scores_with_runtime_info.dropna()
runs_by_pair = complete_runs.groupby(pair_keys, sort=False)

# One pass over the data instead of re-filtering the frame for every student x assignment.
pair_totals = runs_by_pair.agg(**{
    MID_SCORE_KEY: (MID_SCORE_KEY, 'first'),
    'total_hours': (HOURS_DIFF_KEY, 'sum'),
    'total_minutes': (MINUTES_DIFF_KEY, 'sum'),
    'total_seconds': (SECONDS_DIFF_KEY, 'sum'),
})
pair_totals['NumRunsPerAssignment'] = runs_by_pair.size()

elapsed_seconds = (
    pair_totals['total_hours'] * 3600
    + pair_totals['total_minutes'] * 60
    + pair_totals['total_seconds']
)
# astype truncates toward zero, the same rounding int() applied to each average before.
pair_totals['AverageTimePerRun'] = (
    elapsed_seconds / pair_totals['NumRunsPerAssignment']
).astype('int64')

# Restore the row order of the original nested loop (students outer, assignments
# inner, each by first appearance in timeBetweenRuns) so that plots built from
# this frame keep the same category order.
appearance_rank = {
    SUBJECT_ID_KEY: {value: rank for rank, value in enumerate(students)},
    ASSIGNMENT_ID_KEY: {value: rank for rank, value in enumerate(assignments)},
}
output_columns = [
    SUBJECT_ID_KEY,
    ASSIGNMENT_ID_KEY,
    MID_SCORE_KEY,
    'AverageTimePerRun',
    'NumRunsPerAssignment',
]
averagePerRunDf = (
    pair_totals
    .reset_index()
    .sort_values(pair_keys, key=lambda column: column.map(appearance_rank[column.name]))
    .reset_index(drop=True)
    [output_columns]
)
display(averagePerRunDf)

In [None]:
# One scatter plot per assignment: x is the student, y is that student's
# average time between runs on the assignment.
for assignment in assignments:
    assignmentAvgPerRunDf = averagePerRunDf[averagePerRunDf[ASSIGNMENT_ID_KEY] == assignment]
    # Create the figure before drawing so each plot owns its canvas and no
    # blank figure is left behind after the last iteration.
    fig, ax = plt.subplots()
    sns.scatterplot(x=SUBJECT_ID_KEY, y='AverageTimePerRun', data=assignmentAvgPerRunDf, ax=ax)
    ax.set(title=f'Avg Time Between Runs Per Student {assignment}')
    ax.set_ylabel("Avg Time Between Runs")
    ax.set_xlabel("Student")

In [None]:
# One bar chart per student: x is the assignment, y is the student's average
# time between runs on it. Students with 3 or fewer distinct assignments are skipped.
for student in students:
    studentAvgPerRunDf = averagePerRunDf[averagePerRunDf[SUBJECT_ID_KEY] == student]
    if studentAvgPerRunDf[ASSIGNMENT_ID_KEY].unique().size > 3:
        # Create the figure before drawing so each chart owns its canvas and no
        # blank figure is left behind after the last iteration.
        fig, ax = plt.subplots()
        sns.barplot(x=ASSIGNMENT_ID_KEY, y='AverageTimePerRun', data=studentAvgPerRunDf, ax=ax)
        ax.set(title=f'Avg Time Between Runs Per Assignment {student}')
        ax.set_ylabel("Avg Time Between Runs")
        ax.set_xlabel("Assignment")

In [None]:
# Regression of NumRunsPerAssignment on the MID_SCORE_KEY column.
sns.regplot(x=MID_SCORE_KEY, y='NumRunsPerAssignment', data=averagePerRunDf)

In [None]:
# Horizontal box plot of the SESSION_COUNT_KEY column, one box per INCREMENTAL_KEY value.
sns.boxplot(x=SESSION_COUNT_KEY, y=INCREMENTAL_KEY, orient='h', data=mid_scores_with_assignment_info)

In [None]:
# Horizontal box plot of the AVG_KEYDIFF_TIME_KEY column, one box per INCREMENTAL_KEY value.
sns.boxplot(x=AVG_KEYDIFF_TIME_KEY, y=INCREMENTAL_KEY, orient='h', data=mid_scores_with_assignment_info)

In [None]:
# Horizontal box plot of the NUMBER_KEYSTROKES_KEY column, one box per INCREMENTAL_KEY value.
sns.boxplot(x=NUMBER_KEYSTROKES_KEY, y=INCREMENTAL_KEY, orient='h', data=mid_scores_with_assignment_info)

In [None]:
# Regression of the SESSION_COUNT_KEY column on the MID_SCORE_KEY column.
sns.regplot(x=MID_SCORE_KEY, y=SESSION_COUNT_KEY, data=mid_scores_with_assignment_info)

In [None]:
# Regression of the TOTAL_ASSIGNMENT_TIME_KEY column on the MID_SCORE_KEY column.
sns.regplot(x=MID_SCORE_KEY, y=TOTAL_ASSIGNMENT_TIME_KEY, data=mid_scores_with_assignment_info)

In [None]:
# Bare last expression: the notebook renders the merged scores/keystroke frame.
mid_scores_with_assignment_info