<a href="https://colab.research.google.com/github/kchenTTP/ors_test_grading_automation/blob/main/ORS_Grading_Assessments_Automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ORS Assessment Test Automatic Grader
Automate the grading of assessment tests and output a report in Excel format.

**Requirements**
1. Get the correct answers of the assessment
1. Filter by student (also filter by datetime for current cohort)
1. Find the pre-class assessment
1. Find the post-class assessment
1. Find reattempts of tests
1. Show only the answers to the questions students got wrong
1. Transpose from wide data to long data
  1. Column: Attempts (labels: pre-class test, post-class test, reattempts)
  1. Row: Scores, Questions 1 - 20
1. Save into `.xlsx` file

**Tasks**
- Provide dataset of the students (`.csv` or google sheets api)
- Provide student names

## Install Required Libraries & Import

In [None]:
!pip install weasyprint

In [None]:
import pandas as pd
import numpy as np
from weasyprint import HTML, CSS

## Create Folders & Upload Files
- Assessment test responses `.csv` file from this [link](https://drive.google.com/drive/folders/142C-KrYeCN2GnEUiyou6cvDYIOyOWKkt?usp=drive_link)

- Student information from this [link](https://nyplorg-my.sharepoint.com/:x:/g/personal/kangchen_nypl_org/EVcsHhpkqM9Fteg3pnvZBZoBRawkm43iJZF-3YE5hgRyCA?email=kangchen%40nypl.org&e=l5Vz5w)

In [None]:
!mkdir data output

## Set Variables

In [None]:
# test dates used to filter the responses, parsed into pandas Timestamps
test_dates = list(map(pd.to_datetime, ('2023-09-16', '2023-10-14')))
# program being graded; it becomes part of each report's file name
program = 'word'

## Process Names

In [None]:
# load the student information file from ./data and display it
df = pd.read_csv('./data/ors_info.csv')
df

### Preprocess names

In [None]:
# drop inactive students: keep every row whose Drop flag is not True
still_enrolled = df['Drop'].ne(True)
df = df.loc[still_enrolled]

In [None]:
# contact details of the remaining students: first name, last name and email
student_info = df.loc[:, ['FirstName', 'LastName', 'Email']]

In [None]:
# work on an independent copy of the two name columns: the following cells
# modify `names` in place, and writing into a slice of `df` triggers pandas'
# SettingWithCopyWarning
names = df[['FirstName', 'LastName']].copy()

In [None]:
# hard-coded fix-up: replace the first name "Eleanor" with "E"
# (presumably to match how the name was typed in the responses; confirm)
names.loc[names['FirstName'].eq('Eleanor'), 'FirstName'] = 'E'

In [None]:
# fix error values firstname = lastname:
# the student at row position 2 appears to have the full name in both name
# columns, so keep the first word as the first name and the second word as the
# last name.
# Each cell is read with a two-axis .iloc lookup: names.iloc[2][0] relies on
# integer keys being treated as positions on a label-indexed Series, which is
# deprecated in pandas 2.1+.
# NOTE(review): position 2 is hard-coded and depends on the roster order after
# dropped students are removed; confirm whenever the roster changes.
names.iloc[2, 0] = names.iloc[2, 0].split(' ')[0]
names.iloc[2, 1] = names.iloc[2, 1].split(' ')[1]

In [None]:
# normalize roster names for matching:
# first names are lower-cased with surrounding whitespace removed,
# last names are lower-cased with every space removed
names['FirstName'] = names['FirstName'].str.lower().str.strip()
names['LastName'] = names['LastName'].str.lower().str.replace(' ', '')

In [None]:
# single-column frame with "first last" for every student on the roster
combined_names = names.FirstName + ' ' + names.LastName
fullnames = combined_names.to_frame(name='fullname')

In [None]:
# display the combined "first last" names
fullnames

## Process Responses

In [None]:
# load the raw assessment responses and print a summary of the frame
assess_df = pd.read_csv('./data/ORS_Word_Assessment_Responses.csv')
assess_df.info()

In [None]:
# rename columns: replace the original headers with short names, by position
col_names_to_replace = assess_df.columns.tolist()
col_names = ['timestamp', 'email', 'score', 'firstname', 'lastname']
col_names += [f'Q{num}' for num in range(1, 21)]

# original header -> short name; the n-th header gets the n-th short name
col_mapper = {
    old_name: col_names[position]
    for position, old_name in enumerate(col_names_to_replace)
}

assess_df.rename(columns=col_mapper, inplace=True)

### Preprocess first and last name

In [None]:
# normalize respondent names the same way as the roster first names
# (lower-case, surrounding whitespace removed) so that the later first-name
# matching against `names` is not defeated by stray spaces typed into the form
assess_df['firstname'] = assess_df['firstname'].str.lower().str.strip()
assess_df['lastname'] = assess_df['lastname'].str.lower().str.strip()
assess_df.head(5)

In [None]:
# correct error values: where the last name is "wilson" and the first name was
# entered as "elsa divinagracia", shorten the first name to "elsa"
needs_fix = assess_df['firstname'].eq('elsa divinagracia') & assess_df['lastname'].eq('wilson')
assess_df.loc[needs_fix, 'firstname'] = 'elsa'

### Get Answers

In [None]:
# take the last response that scored 100 / 100 as the source of correct answers
is_perfect = assess_df['score'].eq('100 / 100')
answer_row = assess_df.loc[is_perfect].iloc[-1:].reset_index(drop=True)
answer_row.iloc[:, :5] = np.nan  # blank out the five respondent-info columns
answer_key = answer_row.iloc[:, 5:]  # question columns only
answer_dict = answer_key.to_dict(orient='records')[0]  # {column: correct answer}

In [None]:
# display the answer key as a {question column: correct answer} mapping
answer_dict

### Get Data Based on Test Date and Student Name

In [None]:
# convert timestamp to datetime dtype; values that do not match the
# month/day/year hour:minute:second format become NaT instead of raising
assess_df['timestamp'] = pd.to_datetime(
    assess_df['timestamp'],
    format='%m/%d/%Y %H:%M:%S',
    errors='coerce',
)

In [None]:
# print the frame summary again after the timestamp conversion
assess_df.info()

In [None]:
# get all tests from test dates: keep the responses whose submission day is
# one of the calendar days listed in test_dates
test_days = [stamp.date() for stamp in test_dates]
submitted_on = assess_df['timestamp'].dt.date
all_word_assessment = assess_df.loc[submitted_on.isin(test_days)]

In [None]:
# check student names and test count: number of submissions per roster first
# name, sorted by name
on_roster = all_word_assessment['firstname'].isin(names['FirstName'])
all_word_assessment.loc[on_roster, 'firstname'].value_counts().sort_index()

In [None]:
# all section 4 word test data: responses whose first name appears on the
# roster, with the row index renumbered from 0
roster_first_names = names['FirstName']
sect4_word_test = (
    all_word_assessment
    .loc[all_word_assessment['firstname'].isin(roster_first_names)]
    .reset_index(drop=True)
)

## Generate Excel Report

### Filter Incorrect Answers

In [None]:
info_df = sect4_word_test.iloc[:, :5]  # student info
responses_df = sect4_word_test.iloc[:, 5:]  # student response

# Compare every response row against the single answer-key row, aligned on the
# question columns. Broadcasting the key this way works for any number of
# responses, instead of replicating it a hard-coded 31 times (which only
# matched a dataset with exactly 31 rows).
wrong_answer_filter = responses_df.ne(answer_key.iloc[0], axis='columns')
wrong_answer_df = responses_df[wrong_answer_filter]  # retain answer values that are incorrect

In [None]:
# final dataframe: all student information placed side by side with the
# answers to the questions they got wrong (correct answers are left empty)
final_results = pd.concat((info_df, wrong_answer_df), axis='columns')

In [None]:
# display the first two rows of the combined results
final_results.head(2)

### Save All Results to Dictionary

In [None]:
# Build grades_dict: {student name: {test timestamp: {'score', 'res', 'ans'}}}
#   score - the part of the score string before ' / '
#   res   - the student's answers in the non-null question columns of the row
#   ans   - the answer-key values for those same columns
# cols_to_show_list collects, per row, the names of all non-null columns.
cols_to_show_list = []
grades_dict = {}
cols_to_drop = ['timestamp', 'email', 'score', 'firstname', 'lastname']

# iterate through each row
for _, row in final_results.iterrows():
  cols = list(final_results.columns[row.notna()]) # columns that don't contain null values
  cols_to_show_list.append(cols)

  # student answers
  stu_name = row.firstname.strip() + " " + row.lastname.strip()
  test_time = str(row.timestamp)

  # create the student's entry on first sight, then look for this attempt
  attempts = grades_dict.setdefault(stu_name, {})
  if test_time in attempts:
    # same student and timestamp seen before: keep the first entry and report it
    print(row.timestamp)
    continue

  # errors='ignore': an info column that is null in this row (e.g. a missing
  # email) is not part of `cols`, and dropping it must not raise KeyError
  attempts[test_time] = {
      'score': row.score.split(' / ')[0],
      'res': pd.DataFrame(row.loc[cols]).T.drop(columns=cols_to_drop, errors='ignore'),
      'ans': answer_row[cols].drop(columns=cols_to_drop, errors='ignore')
  }

In [None]:
# display the student names that have an entry in grades_dict
grades_dict.keys()

In [None]:
# spot check: score stored for the first attempt of the first student
list(list(grades_dict.values())[0].values())[0]['score']

In [None]:
# spot check: answers stored for the first attempt of the first student
list(list(grades_dict.values())[0].values())[0]['res']

In [None]:
# spot check: answer-key values stored for the first attempt of the first student
list(list(grades_dict.values())[0].values())[0]['ans']

### Save Results -> DataFrame -> Excel

In [None]:
# check length of data: True when the number of distinct first names in the
# results equals the number of students on the roster
names.shape[0] == final_results.firstname.value_counts().count()

In [None]:
# same check for last names: True when the number of distinct last names in
# the results equals the number of students on the roster
names.shape[0] == final_results.lastname.value_counts().count()

In [None]:
# one-row frame holding the original column headers under the short column names
questions = pd.DataFrame([list(col_names_to_replace)], columns=col_names)
questions.iloc[0, 0] = 'Questions'  # row label, stored in the first (timestamp) column
questions.iloc[:, 1:5] = np.nan     # blank out the remaining student-info columns
questions

In [None]:
# label the answer-key row; the label is stored in the first (timestamp) column
answer_row.iloc[0,0] = 'Answers'
answer_row

In [None]:
# concat all dataframes and save as excel file:
# for every student on the roster, stack the question row, the answer-key row
# and the student's own result rows, keep only the columns in which the student
# has at least one value, and write the result to ./output as one workbook.
# NOTE(review): result rows are matched on first name only, so two students
# sharing a first name would end up in each other's report; confirm first
# names are unique in the cohort.
for _, row in names.iterrows():
  fname = row.FirstName
  lname = row.LastName

  # select the student's rows once and reuse them below
  student_rows = final_results[final_results.firstname == fname]
  if student_rows.empty:
    # nothing to report; without this guard the column handling below raises KeyError
    print(f'No responses found for {fname} {lname}; skipping report')
    continue

  cols = final_results.columns[student_rows.notna().any()]
  report = pd.concat([questions, answer_row, student_rows], axis=0).reset_index(drop=True)[cols]
  report = report.fillna('-')
  # errors='ignore': a column that is entirely null for this student (e.g. no
  # email given) is already absent from `cols`
  report = report.drop(columns=['email', 'firstname', 'lastname'], errors='ignore')
  report = report.rename(columns={'timestamp': 'Index'}).set_index('Index')

  # sheet_name is passed by keyword (positional use is deprecated in pandas 2.1)
  # and truncated to Excel's 31-character limit for sheet names
  report.to_excel(f'./output/{fname}_{lname}_{program}_report.xlsx', sheet_name=f'{fname}_{lname}'[:31])

In [None]:
# save files: bundle the output folder into one zip archive and download it
# NOTE(review): the reports are written to the relative ./output folder while
# the absolute /content/output path is zipped here; this presumably relies on
# the working directory being /content, so confirm
!zip -r /content/grades.zip /content/output/

# hand the archive to the Colab files helper for download
from google.colab import files
files.download("/content/grades.zip")