# Import All Necessary Modules And Setup Project

If you get any errors when importing these, make sure you have run the command:
```bash
$ python -m pip install -r requirements.txt
```
to install all necessary modules for this project. This command must be run from inside of this project directory.

It is recommended to use a virtual environment for this project to ensure there are no conflicting package versions on your system.

Activate the virtual environment (if needed), run the pip install command, and then launch Jupyter Lab inside this project to get this project running.

In [None]:
# Uncomment the following line to execute the pip install
# %pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

from measure_incremental_development.compute import calculate_mid

## Load Datasets

In [None]:
# Raw datasets, read from CSV files relative to the directory the notebook runs in.
# These two frames are kept as loaded; working copies are made from them separately.
keystroke_df_unedited = pd.read_csv("data/keystrokes.csv")
student_df_unedited = pd.read_csv("data/students.csv")

#### Copy Datasets For Modification

This preserves the initial datasets, in case we ever need to bring an unedited column/row back into anything

In [None]:
# Working copies: .copy() gives independent frames, so edits made to these
# never reach the *_unedited originals
keystroke_df = keystroke_df_unedited.copy()
student_df = student_df_unedited.copy()

## Get DF Representing Single Student Submission And File

In [None]:
def getFileInStudentSubmission(df, student, assignment, fileName):
    '''
    Select the keystroke events of one student, on one assignment, in one file

    df - a dataframe of keystroke data; the SubjectID, AssignmentID and
            CodeStateSection columns are compared against the arguments below
    student - the SubjectID value to keep
    assignment - the AssignmentID value to keep
    fileName - the CodeStateSection (file name) value to keep

    Returns a COPY of the matching rows, so the result can be modified
    without touching df
    '''
    isStudent = df.SubjectID == student
    isAssignment = df.AssignmentID == assignment
    isFile = df.CodeStateSection == fileName

    return df.loc[isStudent & isAssignment & isFile].copy()

In [None]:
def getStudentSubmission(df, student, assignment):
    '''
    Select the keystroke events of one student on one assignment, across all of its files

    df - a dataframe of keystroke data; the SubjectID and AssignmentID columns
            are compared against the arguments below
    student - the SubjectID value to keep
    assignment - the AssignmentID value to keep

    Returns a COPY of the matching rows, so the result can be modified
    without touching df
    '''
    isStudent = df.SubjectID == student
    isAssignment = df.AssignmentID == assignment

    return df.loc[isStudent & isAssignment].copy()

In [None]:
def filterDownToRunAndEdits(df):
    '''
    Keep only the File.Edit and Run.Program events of a keystroke dataframe

    df - a dataframe of keystroke data with an EventType column

    Returns the rows of df whose EventType is one of the two kept types
    '''
    keptEventTypes = ['File.Edit', 'Run.Program']
    return df[df.EventType.isin(keptEventTypes)]

In [None]:
def filterDownToRunAndEditsAndPastes(df):
    '''
    Keep only the File.Edit, Run.Program and X-Paste events of a keystroke dataframe

    df - a dataframe of keystroke data with an EventType column

    Returns the rows of df whose EventType is one of the three kept types
    '''
    keptEventTypes = ['File.Edit', 'Run.Program', 'X-Paste']
    return df[df.EventType.isin(keptEventTypes)]

In [None]:
def getStudentSubmissionRunsAndEdits(df, student, assignment):
    '''
    Select the run and edit events of one student on one assignment, across all of its files

    df - a dataframe of keystroke data
    student - the SubjectID value to keep
    assignment - the AssignmentID value to keep

    Returns the student's submission rows, reduced to File.Edit and Run.Program events
    '''
    submission = getStudentSubmission(df, student, assignment)
    return filterDownToRunAndEdits(submission)

In [None]:
def getFileInStudentSubmissionRunsAndEdits(df, student, assignment, fileName):
    '''
    Select the run and edit events of one student, on one assignment, in one file

    df - a dataframe of keystroke data
    student - the SubjectID value to keep
    assignment - the AssignmentID value to keep
    fileName - the CodeStateSection (file name) value to keep

    Returns the rows for that file, reduced to File.Edit and Run.Program events
    '''
    fileEvents = getFileInStudentSubmission(df, student, assignment, fileName)
    return filterDownToRunAndEdits(fileEvents)

## Reconstruct Submissions

In [None]:
def reconstructSingleFileDebugger(df):
    '''
    See a submission as it gets reconstructed, ignoring the Run events

    This is used primarily for debugging

    df - a dataframe of keystroke data; only its File.Edit rows are replayed, and
            the SourceLocation, InsertText, DeleteText and EventID columns are read
            from each of them

    Returns: A string holding the text after the last edit. Along the way, the
                edit details and the full text are printed after every edit
    '''

    # The text rebuilt so far
    s = ''

    # Counts the edits applied so far; printed as "StateNo"
    fileStateCount = 0
    for _,row in df[df.EventType=='File.Edit'].iterrows():
        i = int(row.SourceLocation)
        fileLengthRn = len(s)
        # An edit positioned past the end of the text is reported but still applied
        if i > fileLengthRn:
            print("DEBUG: Woah buddy, your cursor is out of bounds")


        # A missing insert/delete text is treated as an empty string
        insert = '' if pd.isna(row.InsertText) else row.InsertText
        delete = '' if pd.isna(row.DeleteText) else row.DeleteText
        # Keep the text before position i, splice in the inserted text and skip
        # len(delete) characters; only the length of the deleted text is used
        s = s[:i] + insert + s[i+len(delete):]

        # The {name=} fields print the variable name together with its repr
        print(f"""\
================
StateNo : {fileStateCount}
{i=}
{insert=}
{delete=}
{row.EventID=}
================
{s}
""")
        fileStateCount += 1

    return s

In [None]:
def reconstructFinalFile(df):
    '''
    Replay every edit event of a single file to rebuild its final contents

    df - A dataframe containing a single student, for a single assignment, 
            and a single file of that assignment. Only File.Edit rows are replayed

    Returns: A string holding the file as it stands after the last edit
    '''
    contents = ''
    editEvents = df[df.EventType == 'File.Edit']

    for _, event in editEvents.iterrows():
        offset = int(event.SourceLocation)

        # A missing insert/delete text counts as an empty string
        inserted = event.InsertText
        if pd.isna(inserted):
            inserted = ''

        removed = event.DeleteText
        if pd.isna(removed):
            removed = ''

        # Keep what precedes the edit position, splice in the inserted text and
        # skip as many characters as the deleted text is long
        before = contents[:offset]
        after = contents[offset + len(removed):]
        contents = before + inserted + after

    return contents

In [None]:
def reconstructFileAtRunEvents(df):
    '''
    Replay a single file and record its contents at every run event

    df - A dataframe containing a single student, for a single assignment, 
            and a single file of that assignment. Must have edit events and run events

    Returns: A list of strings - one snapshot of the file per Run.Program event, 
                followed by one last snapshot of the file after all events
    '''
    snapshots = []
    contents = ''

    for _, event in df.iterrows():
        eventType = event.EventType

        if eventType == 'Run.Program':
            snapshots.append(contents)
            continue

        # Anything that is neither a run nor an edit leaves the file untouched
        if eventType != 'File.Edit':
            continue

        offset = int(event.SourceLocation)

        # A missing insert/delete text counts as an empty string
        inserted = event.InsertText
        if pd.isna(inserted):
            inserted = ''

        removed = event.DeleteText
        if pd.isna(removed):
            removed = ''

        contents = contents[:offset] + inserted + contents[offset + len(removed):]

    # The final state is always recorded, even when no run happened
    snapshots.append(contents)

    return snapshots

In [None]:
def reconstructProjectAtRunEvents(df):
    '''
    Replay a whole project and record the contents of every file each time a run starts

    df - A dataframe containing a single student, for a single assignment, 
            and one+ files. Must have edit events and run events

    Returns: A dictionary keyed by file name. Each value is a list of strings: the
                contents of that file at every started run, followed by its final contents

    NOTE: Some files may always be empty in their states. This happens when students make meaningful 
            edits to files, but delete their contents before running their code
    '''

    # Discard edit events that neither insert nor delete anything, so that only
    # meaningful edits are replayed
    isEdit = df.EventType == 'File.Edit'
    isBlankEdit = isEdit & pd.isna(df.InsertText) & pd.isna(df.DeleteText)
    df = df[~isBlankEdit]

    fileNames = df.CodeStateSection.unique()

    # One snapshot list and one working buffer per file
    snapshotsByFile = {name: [] for name in fileNames}
    contentsByFile = {name: '' for name in fileNames}

    for _, event in df.iterrows():
        eventType = event.EventType

        if eventType == 'File.Edit':
            # The file this edit belongs to
            editedFile = event.CodeStateSection

            offset = int(event.SourceLocation)

            inserted = event.InsertText
            if pd.isna(inserted):
                inserted = ''

            removed = event.DeleteText
            if pd.isna(removed):
                removed = ''

            # Apply the edit to that file's working buffer
            previous = contentsByFile[editedFile]
            contentsByFile[editedFile] = (
                previous[:offset] + inserted + previous[offset + len(removed):]
            )

        # Only the start of a run triggers a snapshot of every file
        elif eventType == 'Run.Program' and event['X-Metadata'] == 'Start':
            for name in fileNames:
                snapshotsByFile[name].append(contentsByFile[name])

    # Finish with the final contents of every file
    for name in fileNames:
        snapshotsByFile[name].append(contentsByFile[name])

    return snapshotsByFile

#### View Reconstructed Project

In [None]:
def viewFinalReconstructedProject(reconstructedFiles):
    '''
    Print the final state of every reconstructed file, in file name order

    reconstructedFiles : A dictionary containing filenames as keys and a list
        of said file at various states; the last entry of each list is printed

    Returns nothing, prints to the console
    '''
    divider = "=" * 40

    for fileName in sorted(reconstructedFiles):
        # Header first, then the last recorded state of the file
        print(divider, f"file = {fileName}", divider, sep="\n")
        print(reconstructedFiles[fileName][-1])

In [None]:
def viewReconstructedProjectStates(reconstructedFiles):
    '''
    View the project at all the different run states that get reconstructed

    reconstructedFiles : A dictionary containing filenames as keys and a list
        of said file at various states. Every list is expected to hold the same
        number of states; the length of the first list decides how many are shown

    Returns nothing, prints to the console. Nothing is printed for an empty dictionary
    '''
    # dict.keys() is a view and cannot be indexed, so the first list of states is
    # taken through an iterator; an empty project simply has no states to show
    firstFileStates = next(iter(reconstructedFiles.values()), [])
    numberOfStates = len(firstFileStates)

    fileNames = sorted(reconstructedFiles)

    for stateNo in range(numberOfStates):
        # The last state is the file after all events, not a run snapshot
        stateLabel = 'FINAL' if stateNo == numberOfStates - 1 else stateNo

        print("v" * 40)
        print(f"Project State : {stateLabel}")
        print("^" * 40)
        for fileName in fileNames:
            print(40*"=")
            print(f"file = {fileName}")
            print(40*"=")
            print(reconstructedFiles[fileName][stateNo])

In [None]:
# Example input: the run and edit events of Student10 for the file
# wordinator.py of Assign10
student10df = getFileInStudentSubmissionRunsAndEdits(
    keystroke_df, 
    'Student10',
    'Assign10',
    'wordinator.py'
    )

In [None]:
# Rebuild the file once per run event (plus its final state) ...
student10FileStates = reconstructFileAtRunEvents(student10df)

# ... and print every recorded state under a numbered header
for i,fileState in enumerate(student10FileStates): 
    print("="*40)
    print(f"File State : {i}")
    print("="*40)
    print(fileState)

In [None]:
# Example input: the run and edit events of Student36 for every file of Assign10
student36df = getStudentSubmissionRunsAndEdits(
    keystroke_df, 
    'Student36',
    'Assign10',
    )

In [None]:
# Show the working copy of the student table
display(student_df)

In [None]:
# Rebuild every file of Student36's Assign10 submission and print the final state
# of each one. The viewer defined in this notebook is viewFinalReconstructedProject
student36FileStates = reconstructProjectAtRunEvents(student36df)
viewFinalReconstructedProject(student36FileStates)

## Add MID Library

- 0-2 Likely Incremental
- 2-2.5 Somewhat Incremental
- 2.5-3 Somewhat Non-Incremental
- 3+ Likely Non-Incremental

#### Calculate MID statistic for student and assignment

In [None]:
# Column names of the MID score dataframes built in this notebook.
# SUBJECT_ID_KEY and ASSIGNMENT_ID_KEY are also used to index the keystroke dataframe
SUBJECT_ID_KEY = 'SubjectID'
ASSIGNMENT_ID_KEY = 'AssignmentID'
ASSIGNMENT_FILE_KEY = 'File'
MID_SCORE_KEY = 'MID_Score'
INCREMENTAL_KEY = 'Incremental' # flag column in df: 1 when MID score <= 2.5 (Incremental), 0 when > 2.5 (Non-Incremental)

In [None]:
def remove_empty_at_start(file_states):
    '''
    Drop the run of empty strings that opens a list of file states

    file_states: list of file states

    returns a new list starting at the first non-empty state. When every state is
    empty (or the list itself is empty) the list that was passed in is returned as is
    '''
    first_filled = next(
        (position for position, state in enumerate(file_states) if state != ''),
        None,
    )

    if first_filled is None:
        return file_states
    return file_states[first_filled:]

In [None]:
# Test for fun
# Quick manual check: only the leading empty strings should be removed,
# the trailing ones stay. The input list is printed first, the cleaned list second
fake_file_states = ['', '', '', 'fake', 'fake1', 'fake2', '', '', '']
clean_fake_file_states = remove_empty_at_start(fake_file_states)
print(fake_file_states)
print(clean_fake_file_states)

In [None]:
def get_mid_score_combo(single_student_df):
    '''
    Get one MID score for all files of a student's assignment combined

    The states of every file are reconstructed, stripped of their leading empty
    states and joined - file after file - into a single list, which is then scored

    single_student_df - A dataframe containing a single student, for a single assignment, 
            and one+ files. Must have edit events and run events

    Returns: The result of calculate_mid for the combined list of states
    '''
    states_by_file = reconstructProjectAtRunEvents(single_student_df)

    combined_states = []
    for states in states_by_file.values():
        # Leading empty states are dropped before the file is added
        combined_states.extend(remove_empty_at_start(states))

    return calculate_mid(combined_states)

In [None]:
# Example: one combined MID score for all files of Student10's Assign10 submission
single_student_df = getStudentSubmissionRunsAndEdits(keystroke_df,
                                              'Student10',
                                              'Assign10')


mid_score_combo = get_mid_score_combo(single_student_df)

print(mid_score_combo)

In [None]:
def get_mid_score_dict(single_student_df):
    '''
    Get the MID score of every file in a student's assignment

    single_student_df - A dataframe containing a single student, for a single assignment, 
            and one+ files. Must have edit events and run events

    Returns: A dictionary containing keys for each file name, with the result of
    calculate_mid for that file as the value
    '''
    states_by_file = reconstructProjectAtRunEvents(single_student_df)

    # Each file is scored on its own, without its leading empty states
    return {
        file_name: calculate_mid(remove_empty_at_start(states))
        for file_name, states in states_by_file.items()
    }

In [None]:
# Example: one MID score per file of Student10's Assign10 submission
single_student_df = getStudentSubmissionRunsAndEdits(keystroke_df,
                                              'Student10',
                                              'Assign10')


mid_score_dict = get_mid_score_dict(single_student_df)

print(mid_score_dict)

In [None]:
# TODO add mid score for concatenated assignment tasks

In [None]:
def get_mid_score_row(student, assignment, file, mid_score):
    '''
    Bundle the values of one row of the MID score dataframe

    student - the SubjectID the score belongs to
    assignment - the AssignmentID the score belongs to
    file - the name of the file in assignment
    mid_score - the MID score of that file

    Returns: A dictionary containing student, assignment, file, mid score
    and an incremental flag, which is 0 for a score above 2.5 and 1 otherwise
    '''
    if mid_score > 2.5:
        incremental_flag = 0
    else:
        incremental_flag = 1

    row = dict()
    row[SUBJECT_ID_KEY] = student
    row[ASSIGNMENT_ID_KEY] = assignment
    row[ASSIGNMENT_FILE_KEY] = file
    row[MID_SCORE_KEY] = mid_score
    row[INCREMENTAL_KEY] = incremental_flag
    return row


In [None]:
def get_student_mid_score_df(single_student_df):
    '''
    Creates a dataframe with one row per file of a student's assignment, holding
    the MID score of that file and the incremental information

    single_student_df - A dataframe containing a single student, for a single assignment, 
        and one+ files of that assignment. Must have edit events and run events

    Returns: A dataframe with the columns SubjectID, AssignmentID, File, MID_Score
    and Incremental, one row per file, indexed 0..n-1. It has those columns but no
    rows when no file could be scored
    '''
    columns = [SUBJECT_ID_KEY, ASSIGNMENT_ID_KEY, ASSIGNMENT_FILE_KEY, MID_SCORE_KEY, INCREMENTAL_KEY]

    mid_score_dict = get_mid_score_dict(single_student_df)
    if not mid_score_dict:
        return pd.DataFrame(columns=columns)

    # The dataframe holds a single student and assignment, so the first row names both
    student = single_student_df[SUBJECT_ID_KEY].iloc[0]
    assignment = single_student_df[ASSIGNMENT_ID_KEY].iloc[0]

    # Collect all rows first and build the dataframe once: this gives every row its
    # own index label and avoids concatenating row by row onto an empty dataframe
    rows = [
        get_mid_score_row(student, assignment, file_name, mid_score)
        for file_name, mid_score in mid_score_dict.items()
    ]
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Example: per-file MID scores for the single_student_df selected above
student_mid_df = get_student_mid_score_df(single_student_df)
display(student_mid_df)

In [None]:
def get_keystroke_mid_score_df(keystroke_df, students, assignments):
    '''
    Creates a dataframe for specified students and assignments containing
    incremental information

    keystroke_df - a dataframe of keystroke data
    students - a list of strings representing the SubjectIDs to filter by
    assignments - a list of strings representing the AssignmentIDs to filter by

    Returns: A dataframe for the specified students and assignments containing 
    the mid score and whether or not the student used incremental development.
    Every student is combined with every assignment; the result is an empty
    dataframe when students or assignments is empty
    '''
    # Score every student/assignment pair first and concatenate once at the end,
    # instead of re-copying the accumulated dataframe after every pair
    frames = [
        get_student_mid_score_df(
            getStudentSubmissionRunsAndEdits(keystroke_df, student, assignment)
        )
        for student in students
        for assignment in assignments
    ]

    # pd.concat refuses an empty list
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [None]:
def get_all_keystroke_mid_score_df(keystroke_df):
    '''
    Creates the MID score dataframe for every student and every assignment
    that appears in the keystroke data

    keystroke_df - a dataframe of keystroke data

    Returns: A dataframe for the students and assignments containing 
    the mid score and whether or not the student used incremental development
    '''
    every_student = keystroke_df[SUBJECT_ID_KEY].unique()
    every_assignment = keystroke_df[ASSIGNMENT_ID_KEY].unique()

    return get_keystroke_mid_score_df(keystroke_df, every_student, every_assignment)

In [None]:
# FIXME: scoring every student/assignment pair does not work yet, so this call stays disabled
# display(get_all_keystroke_mid_score_df(keystroke_df))

In [None]:
# The distinct student and assignment IDs present in the keystroke data
all_students = keystroke_df[SUBJECT_ID_KEY].unique()
all_assignments = keystroke_df[ASSIGNMENT_ID_KEY].unique()
print(all_students)
print(all_assignments)

In [None]:
# Example on a small subset: every listed student is combined with every listed assignment
key_stroke_mid_score_df = get_keystroke_mid_score_df(keystroke_df, ['Student10', 'Student36'], ['Assign10', 'Assign12'])
display(key_stroke_mid_score_df)

In [None]:
# Keys of the dictionary returned by get_scores; FINAL_SCORE_KEY is also
# read as a column name of the student dataframe
FINAL_SCORE_KEY = 'FinalScore'
SCORE_KEY = 'Score'

In [None]:
def get_scores(student, assignment, student_df):
    '''
    Look up the scores of a student for the specified assignment

    student: a string representing the SubjectID
    assignment: a string representing the Assignment; it is looked up as a
    column name of student_df
    student_df: a df containing the students grades

    Returns: a dictionary of the student's score for the specified assignment, 
    and final score. The dictionary is empty when the student is not in
    student_df or student_df has no column for the assignment
    '''
    # TODO Maybe we want to add more scores, we can do that here
    student_rows = student_df.loc[student_df[SUBJECT_ID_KEY] == student]

    if student_rows.empty or assignment not in student_rows:
        return dict()

    # Only the first matching row is read
    return {
        SCORE_KEY: student_rows[assignment].values[0],
        FINAL_SCORE_KEY: student_rows[FINAL_SCORE_KEY].values[0],
    }

In [None]:
# Example: the Assign10 score and final score of Student10
print(get_scores('Student10', 'Assign10', student_df))

In [None]:
def get_avg_mid_score(student, assignment, mid_score_df):
    '''
    Get the average mid score for a students assignment
    This will be the average of the mid scores for each
    file of each assignment

    student: a string representing the SubjectID
    assignment: a string representing the Assignment
    mid_score_df: a df containing the mid score for each assignment
    file for each student

    NOTE: Not implemented yet - the body is only a placeholder, so calling
    this function currently returns None
    '''
    
    # TODO: compute the average described in the docstring
    pass