# Import All Necessary Modules And Setup Project

If you get any errors when importing these, make sure you have run the following command:
```bash
$ python -m pip install -r requirements.txt
```
to install all necessary modules for this project. This command must be run from inside of this project directory.

It is recommended to use a virtual environment for this project to ensure there are no conflicting package versions on your system.

Activate the virtual environment (if needed), run the pip install command, and then launch Jupyter Lab inside this project to get this project running.

In [None]:
# Uncomment the following line to execute the pip install
# %pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

from measure_incremental_development.compute import calculate_mid, classify_snapshots


## Get DF Representing Single Student Submission And File

In [None]:
from getSubmissionDataframes import *

`getSubmissionDataframes` contains the following functions:

*   `getFileInStudentSubmission`
*   `getStudentSubmission`
*   `filterDownToRunAndEdits`
*   `filterDownToRunAndEditsAndPastes`
*   `getStudentSubmissionRunsAndEdits`
*   `getFileInStudentSubmissionRunsAndEdits`

## Reconstruct Submissions

In [None]:
from reconstructSubmissions import *

`reconstructSubmissions` has the functions:

*   `reconstructSingleFileDebugger`
*   `reconstructFinalFile`
*   `reconstructFileAtRunEvents`
*   `reconstructProjectAtRunEvents`

#### View Reconstructed Project

In [None]:
from viewReconstructions import *

`viewReconstructions` has the following functions:

*   `viewFinalReconstructedProject`
*   `viewReconstructedProjectStates`

#### Get Student Project Info

In [None]:
from getStudentProjectInfo import *

`getStudentProjectInfo` has the following function:

*   `getStudentProjectList`

## Load Datasets

In [None]:
# Read the raw keystroke log and the student table from the local data/ folder.
# The *_unedited names are kept as pristine references; see the copies made below.
keystroke_df_unedited = pd.read_csv("data/keystrokes.csv")
student_df_unedited = pd.read_csv("data/students.csv")

#### Copy Datasets For Modification

This preserves the initial datasets, in case we ever need to bring an unedited column/row back into anything

In [None]:
# Work on copies so the frames read from disk are never modified.
keystroke_df = keystroke_df_unedited.copy()
student_df = student_df_unedited.copy()

#### Testing Some Reconstructions

In [None]:
# Select the events for Student10 / Assign10 / wordinator.py.
# NOTE(review): per the helper's name this keeps run and edit events only - confirm in getSubmissionDataframes.
student10df = getFileInStudentSubmissionRunsAndEdits(
    keystroke_df, 
    'Student10',
    'Assign10',
    'wordinator.py'
    )

In [None]:
# Reconstruct the file from student10df (one state per run event, going by the helper's name).
student10FileStates = reconstructFileAtRunEvents(student10df)

# Print every reconstructed state under a banner showing its position in the list.
for i,fileState in enumerate(student10FileStates): 
    print("="*40)
    print(f"File State : {i}")
    print("="*40)
    print(fileState)

In [None]:
# Select the events for Student36 / Assign10 across the whole submission (no file filter is passed).
student36df = getStudentSubmissionRunsAndEdits(
    keystroke_df, 
    'Student36',
    'Assign10',
    )

In [None]:
# Show the working copy of the student table in the notebook output.
display(student_df)

In [None]:
# Reconstruct Student36's project from student36df and show the result with the viewer helper.
student36FileStates = reconstructProjectAtRunEvents(student36df)
viewFinalReconstructedProject(student36FileStates)

In [None]:
# getStudentProjectList returns three values; final_data is iterated in later cells
# as (student, assignment, reconstructed file states) triples.
projects_df, run_events_df, final_data = getStudentProjectList(student_df, keystroke_df)

# Row counts of the two returned frames.
print(len(projects_df), len(run_events_df))

In [None]:
# Get all students with a submission for the assignment
for student, assign, df in final_data:
    if len(df) > 0:
        print(student, assign)

In [None]:
# Show the final reconstructed project for every non-empty entry in final_data,
# framed by separator lines and a "student assignment" header.
for student, assign, df in final_data:
    if len(df) > 0:
        # print(len(df))
        print(50*'=')
        print(student, assign)
        viewFinalReconstructedProject(df)
        print(50*'=')


## Add MID Library

- 0-2 Likely Incremental
- 2-2.5 Somewhat Incremental
- 2.5-3 Somewhat Non-Incremental
- 3+ Likely Non-Incremental

#### Calculate MID statistic for student and assignment

In [None]:
# Column names shared by the cells below.
SUBJECT_ID_KEY = 'SubjectID'  # student identifier column
ASSIGNMENT_ID_KEY = 'AssignmentID'  # assignment identifier column
MID_SCORE_KEY = 'MID_Score'  # value returned by calculate_mid
INCREMENTAL_KEY = 'Incremental' # 1/0 flag in df: 1 when MID <= 2.5 (Incremental), 0 when MID > 2.5 (Non-Incremental)
ASSIGNMENT_SCORE_KEY = 'AssignmentScore'  # score for the single assignment being looked at
FINAL_SCORE_KEY = 'FinalScore'  # also read as a column of the student table by get_scores
CLIENT_TIMESTAMP_KEY = 'ClientTimestamp'  # converted with unit='ms' by the timing code further down

In [None]:
def remove_empty_at_start(file_states):
    '''
    Drop the run of empty strings that leads a list of file states

    file_states: list of file states

    Returns the states from the first non-empty one onwards. When no
    state is non-empty (including an empty list) the input is handed
    back unchanged.
    '''
    first_non_empty = next(
        (position for position, state in enumerate(file_states) if state != ''),
        None,
    )
    if first_non_empty is None:
        return file_states
    return file_states[first_non_empty:]

In [None]:
# Test for fun
fake_file_states = ['', '', '', '\nfake', '\nfake\n', '\nfak\n', '\nfake\n', '\n', '\n']
clean_fake_file_states = remove_empty_at_start(fake_file_states)
print(fake_file_states)
print(clean_fake_file_states)

calculate_mid(fake_file_states)

In [None]:
def get_scores(student, assignment, student_df):
    '''
    Look up a student's score on one assignment together with their final score

    student: a string representing the SubjectID
    assignment: a string naming the assignment column of student_df
    student_df: a df containing the students grades

    Returns: a dictionary holding the assignment score and the final score.
    The dictionary is empty when the student does not appear in student_df
    or when student_df has no column named after the assignment.
    '''
    # TODO Maybe we want to add more scores, we can do that here
    known_students = student_df[SUBJECT_ID_KEY].unique()
    if student not in known_students:
        return dict()

    student_rows = student_df.loc[student_df[SUBJECT_ID_KEY] == student]
    if assignment not in student_rows:
        return dict()

    # Only the first matching row is consulted.
    return {
        ASSIGNMENT_SCORE_KEY: student_rows[assignment].values[0],
        FINAL_SCORE_KEY: student_rows[FINAL_SCORE_KEY].values[0],
    }

In [None]:
# Spot-check get_scores for one student/assignment pair.
print(get_scores('Student10', 'Assign10', student_df))

In [None]:
def get_mid_score_row(student, assignment, mid_score):
    '''
    Build the dictionary for one row of the MID score dataframe

    student - a string representing the SubjectID
    assignment - a string representing the assignment
    mid_score - the MID score computed for that student and assignment

    The grades are read from the notebook-level student_df via get_scores.

    Returns: A dictionary containing student, assignment, mid score,
    assignment score, final score and an incremental flag
    (0 when mid_score > 2.5, otherwise 1)
    Raises KeyError when get_scores finds no grades for the pair
    '''
    student_scores = get_scores(student, assignment, student_df)
    mid_row = {
        SUBJECT_ID_KEY: student,
        ASSIGNMENT_ID_KEY: assignment,
        MID_SCORE_KEY: mid_score,
        ASSIGNMENT_SCORE_KEY: student_scores[ASSIGNMENT_SCORE_KEY],
        FINAL_SCORE_KEY: student_scores[FINAL_SCORE_KEY],
    }
    # Anything that is not strictly above 2.5 is flagged as incremental.
    if mid_score > 2.5:
        mid_row[INCREMENTAL_KEY] = 0
    else:
        mid_row[INCREMENTAL_KEY] = 1
    return mid_row


In [None]:
def get_mid_score_all(final_data):
    '''
    Creates a dataframe for a student, assignment, mid score, final score, 
    assignment score and the incremental information

    final_data - multiple tuples containing a single student, for a single assignment, 
        and file states (a mapping whose values are lists of file states).

    Entries with no file states are skipped. For every other entry the
    states of all files are joined into one list (leading empty states of
    each file removed) and scored with calculate_mid. An entry whose score
    or row cannot be produced is reported on stdout and left out.

    Returns: A dataframe for all specified student containing the mid score
    and whether or not the student used incremental development
    '''
    # Rows are gathered as plain dicts and turned into a frame once at the
    # end; growing a DataFrame with pd.concat on every iteration copies all
    # earlier rows each time.
    mid_score_rows = []
    for student, assignment, file_states in final_data:
        if len(file_states) == 0:
            continue
        all_file_states = []
        for _, state in file_states.items():
            all_file_states += remove_empty_at_start(state)
        try:
            mid_score = calculate_mid(all_file_states)
            mid_score_rows.append(get_mid_score_row(student, assignment, mid_score))
        except Exception as e:
            # Best effort: one failing student must not abort the whole table.
            print("Failed for student. Error:", e)
            print(len(all_file_states))
    return pd.DataFrame(mid_score_rows)

In [None]:
# Score every entry of final_data; failures are printed by get_mid_score_all and skipped.
mid_df = get_mid_score_all(final_data)

In [None]:
# Save the MID table next to the input data.
mid_df.to_csv('./data/mid_scores.csv')

## Code to get the time between runs

In [None]:
# Column names used by the run-timing and coding-session code below.
DATE_TIME_KEY = 'DateTime'  # ClientTimestamp converted with pd.to_datetime(unit='ms')
NEXT_DATE_TIME_KEY = 'NextDateTime'  # DateTime of the following run (shift(-1))
DIFF_KEY = 'Diff'  # NextDateTime - DateTime
DAYS_DIFF_KEY = 'DaysDiff'  # .dt.days of Diff
HOURS_DIFF_KEY = 'HoursDiff'  # .dt.seconds of Diff / 3600
MINUTES_DIFF_KEY = 'MinutesDiff'  # .dt.seconds of Diff / 60
SECONDS_DIFF_KEY = 'SecondsDiff'  # .dt.seconds of Diff
SESSION_ID_KEY = 'SessionID'  # counter assigned by getCodingSessionsForAssignment
SESSION_TIME_KEY = 'SessionTime'  # last minus first ClientTimestamp of a session
TOTAL_ASSIGNMENT_TIME_KEY = 'TotalAssignmentTime'  # sum of session times for an assignment

In [None]:
def getTimestampRow(row):
    '''
    Build the timing record for a single event row

    row: one row of the keystroke dataframe

    Returns: a dictionary with the row's subject, assignment and raw client
    timestamp, plus that timestamp converted to a datetime (the raw value
    is interpreted as milliseconds)
    '''
    subject = row[SUBJECT_ID_KEY]
    assignment = row[ASSIGNMENT_ID_KEY]
    clientTimestamp = row[CLIENT_TIMESTAMP_KEY]
    timestampRow = {
        SUBJECT_ID_KEY: subject,
        ASSIGNMENT_ID_KEY: assignment,
        CLIENT_TIMESTAMP_KEY: clientTimestamp,
        DATE_TIME_KEY: pd.to_datetime(clientTimestamp, unit='ms'),
    }
    return timestampRow

In [None]:
def getFilteredRunEvents(df):
    '''
    Get run events with execution action and start events only

    df: keystroke dataframe with EventType, X-Metadata, SubjectID,
        AssignmentID and ClientTimestamp columns

    Returns: a dataframe with one row per event whose EventType is
    'Run.Program' and whose X-Metadata is 'Start'. It holds the subject,
    assignment and client timestamp columns plus a DateTime column (the
    client timestamp read as milliseconds) and a fresh 0..n-1 index.
    When nothing matches the result is empty but keeps those columns.
    '''
    # One boolean mask replaces the row-by-row iterrows/concat loop, which
    # re-copied the accumulated frame for every matching event.
    isRunStart = (df['EventType'] == 'Run.Program') & (df['X-Metadata'] == 'Start')
    keptColumns = [SUBJECT_ID_KEY, ASSIGNMENT_ID_KEY, CLIENT_TIMESTAMP_KEY]
    runDf = df.loc[isRunStart, keptColumns].reset_index(drop=True)
    runDf[DATE_TIME_KEY] = pd.to_datetime(runDf[CLIENT_TIMESTAMP_KEY], unit='ms')
    return runDf

In [None]:
# Run.Program 'Start' events of the whole keystroke log, with a DateTime column added.
runEvents = getFilteredRunEvents(keystroke_df)

In [None]:
def getTimeBetweenRuns(df, student, assignment):
    '''
    Get time between runs for a student and assignment

    df: run events with SubjectID, AssignmentID and DateTime columns
    student: a string representing the SubjectID to filter by
    assignment: a string representing the AssignmentID to filter by

    Returns: the matching rows sorted by DateTime, each extended with the
    DateTime of the next run, the difference to it, and that difference
    split into DaysDiff / HoursDiff / MinutesDiff / SecondsDiff. The last
    run has no successor, so its difference columns are missing values.
    df itself is not modified.
    '''
    isStudentAssignment = (df[SUBJECT_ID_KEY] == student) & (df[ASSIGNMENT_ID_KEY] == assignment)
    # sort_values (not in place) hands back a new frame, so the columns added
    # below are never written into a slice of the caller's dataframe.
    studentRunsDf = df[isStudentAssignment].sort_values(by=DATE_TIME_KEY)
    studentRunsDf[NEXT_DATE_TIME_KEY] = studentRunsDf[DATE_TIME_KEY].shift(-1)
    timeToNextRun = studentRunsDf[NEXT_DATE_TIME_KEY] - studentRunsDf[DATE_TIME_KEY]
    studentRunsDf[DIFF_KEY] = timeToNextRun
    studentRunsDf[DAYS_DIFF_KEY] = timeToNextRun.dt.days.round()
    # NOTE(review): .dt.seconds is the seconds component of the difference and
    # leaves out whole days (those are in DaysDiff), so the three columns below
    # describe the part of the gap beyond full days - confirm this is intended.
    secondsComponent = timeToNextRun.dt.seconds
    studentRunsDf[HOURS_DIFF_KEY] = (secondsComponent / 3600.0).round(2)
    studentRunsDf[MINUTES_DIFF_KEY] = (secondsComponent / 60.0).round(2)
    studentRunsDf[SECONDS_DIFF_KEY] = secondsComponent.round(2)
    return studentRunsDf

In [None]:
def getTimeBetweenRunsDf(keystroke_df, final_data):
    '''
    Get time between runs for each student

    keystroke_df: the keystroke dataframe the run events are taken from
    final_data: tuples whose first two items are a student and an assignment

    Returns: the per-student results of getTimeBetweenRuns stacked into one
    dataframe with a fresh index; an empty dataframe when final_data has no
    entries
    '''
    runEvents = getFilteredRunEvents(keystroke_df)
    # Collect the per-student frames first and concatenate once; concatenating
    # inside the loop copies everything gathered so far on every iteration.
    studentFrames = [
        getTimeBetweenRuns(runEvents, student, assignment)
        for student, assignment, _ in final_data
    ]
    if not studentFrames:
        return pd.DataFrame()
    return pd.concat(studentFrames, ignore_index=True)

In [None]:
# Time between consecutive runs for every (student, assignment) in final_data.
timeBetweenRunsDf = getTimeBetweenRunsDf(keystroke_df, final_data)

In [None]:
# Show the table in the notebook and save it next to the input data.
display(timeBetweenRunsDf)
timeBetweenRunsDf.to_csv('./data/timeBetweenRuns.csv')

## Get Coding Sessions
#### Defined as keypresses within 5 minutes of each other

In [None]:
# use complete keystroke data
# student, assignment, number of coding sessions

def getCodingSessionsForAssignment(df, student, assignment, max_gap_minutes=5):
    '''
    Assign a coding session number to every event of a student's assignment

    coding sessions end after more than max_gap_minutes (default 5) from
    the last event; the gap is measured from the previous event, not from
    the start of the session

    df: events with SubjectID, AssignmentID and ClientTimestamp columns
    student: a string representing the SubjectID to filter by
    assignment: a string representing the AssignmentID to filter by
    max_gap_minutes: longest pause, in minutes, that still continues a session

    Returns: the matching rows sorted by time, with a DateTime column (the
    client timestamp read as milliseconds) and a SessionID column counting
    up from 0. df itself is not modified.
    '''
    isStudentAssignment = (df[SUBJECT_ID_KEY] == student) & (df[ASSIGNMENT_ID_KEY] == assignment)
    # Explicit copy: the columns added below must not be written into a slice of df.
    studentAssignmentDf = df[isStudentAssignment].copy()
    # convert client timestamp to date time
    studentAssignmentDf[DATE_TIME_KEY] = pd.to_datetime(
        studentAssignmentDf[CLIENT_TIMESTAMP_KEY], unit='ms'
    )
    studentAssignmentDf = studentAssignmentDf.sort_values(by=DATE_TIME_KEY)
    # Time since the previous event; the first event has no predecessor, so its
    # gap is missing and it can never open a new session by itself.
    gapSincePreviousEvent = studentAssignmentDf[DATE_TIME_KEY].diff()
    startsNewSession = gapSincePreviousEvent > pd.Timedelta(minutes=max_gap_minutes)
    # Each over-long gap bumps the running count, which is the session number.
    studentAssignmentDf[SESSION_ID_KEY] = startsNewSession.cumsum().astype('int64')
    return studentAssignmentDf

In [None]:
def getTimeSpentPerSession(df):
    '''
    Get total time spent per session

    df: events with SessionID, DateTime and ClientTimestamp columns

    Returns: the rows of df grouped session by session (sessions in order of
    first appearance, rows sorted by DateTime inside a session) with a
    SessionTime column holding the session's last ClientTimestamp minus its
    first one. The index is renumbered. An empty df comes back empty with
    the same columns plus SessionTime. df itself is not modified.
    '''
    sessionFrames = []
    for sessionId in df[SESSION_ID_KEY].unique():
        # sort by dates just in case; sort_values returns a new frame, so the
        # column added below is not written into a slice of df
        sessionDf = df[df[SESSION_ID_KEY] == sessionId].sort_values(by=DATE_TIME_KEY)
        # subtract start session time and end session time to get total session time
        timestamps = sessionDf[CLIENT_TIMESTAMP_KEY]
        sessionDf[SESSION_TIME_KEY] = timestamps.iloc[-1] - timestamps.iloc[0]
        sessionFrames.append(sessionDf)
    if not sessionFrames:
        # No sessions means no rows. Keep the columns so callers that index by
        # column name still work on the empty result.
        emptyTimes = df[CLIENT_TIMESTAMP_KEY] - df[CLIENT_TIMESTAMP_KEY]
        return df.assign(**{SESSION_TIME_KEY: emptyTimes})
    return pd.concat(sessionFrames, ignore_index=True)

In [None]:
def getTimeSpentPerAssignment(df):
    '''
    get total time spent per assignment using coding session time

    df: events with AssignmentID and SessionTime columns, normally also
        SessionID (and SubjectID), where every row of a session carries
        that session's SessionTime

    Returns: the rows of df, assignment by assignment, with a
    TotalAssignmentTime column holding the sum of the assignment's session
    times (each session counted once). The index is renumbered. An empty
    dataframe is returned when df has no AssignmentID column or no rows.
    df itself is not modified.
    '''
    print("DATA FRAME")
    display(df)
    if ASSIGNMENT_ID_KEY not in df.columns:
        # A frame without columns carries no events; nothing to total.
        return pd.DataFrame()
    assignments = df[ASSIGNMENT_ID_KEY].unique()
    print("ASSIGNMENTS ", assignments)
    # Columns that identify one session. Counting per session (rather than per
    # distinct SessionTime value) keeps two sessions of equal length apart.
    sessionIdColumns = []
    if SESSION_ID_KEY in df.columns:
        sessionIdColumns = [
            column for column in (SUBJECT_ID_KEY, SESSION_ID_KEY) if column in df.columns
        ]
    assignmentFrames = []
    for assignment in assignments:
        assignmentDf = df[df[ASSIGNMENT_ID_KEY] == assignment]
        if sessionIdColumns:
            oneRowPerSession = assignmentDf.drop_duplicates(subset=sessionIdColumns)
            totalAssignmentTime = oneRowPerSession[SESSION_TIME_KEY].sum()
        else:
            # No session identifier available: fall back to distinct times.
            totalAssignmentTime = sum(assignmentDf[SESSION_TIME_KEY].unique())
        # assign() builds a new frame instead of writing into a slice of df
        assignmentFrames.append(
            assignmentDf.assign(**{TOTAL_ASSIGNMENT_TIME_KEY: totalAssignmentTime})
        )
    if not assignmentFrames:
        return pd.DataFrame()
    # Every assignment's rows are kept; concatenating the current frame with
    # itself would drop earlier assignments and duplicate the last one.
    return pd.concat(assignmentFrames, ignore_index=True)

In [None]:
def getCodingSessionsDf(keystroke_df, final_data):
    '''
    Get coding sessions for each student and assignment, where the events of
    a coding session are within 5 minutes of each other

    keystroke_df: the keystroke dataframe
    final_data: tuples whose first two items are a student and an assignment

    Returns: the per-session timing rows of every (student, assignment)
    stacked into one dataframe with a fresh index
    '''
    # NOTE(review): sessions are derived from the filtered run events, not from
    # the complete keystroke data - confirm this is the intended source.
    startRunEvents = getFilteredRunEvents(keystroke_df)
    allSessionsDf = pd.DataFrame()
    for student, assignment, _ in final_data:
        assignmentSessionsDf = getCodingSessionsForAssignment(startRunEvents, student, assignment)
        # time spent per session
        sessionTimesDf = getTimeSpentPerSession(assignmentSessionsDf)
        print("CODING SESSION TIME DF ")
        display(sessionTimesDf)
        # total time spent per assignment
        # NOTE(review): the returned totals are not used; only the per-session
        # frame is accumulated, so TotalAssignmentTime never reaches the result.
        getTimeSpentPerAssignment(sessionTimesDf)
        allSessionsDf = pd.concat([allSessionsDf, sessionTimesDf], ignore_index=True)
    return allSessionsDf

In [None]:
# Coding sessions (with per-session time) for every (student, assignment) in final_data.
codingSessions = getCodingSessionsDf(keystroke_df, final_data)

In [None]:
# Save the coding-session table next to the input data.
codingSessions.to_csv('./data/codingSessions.csv')