# Grading

In [26]:
import pandas as pd

In [None]:
# Load the class roster. "sec" is forced to str because later cells compare it
# against string section codes such as "003".
dfRos = pd.read_excel("./data/roster.xlsx", dtype={"sec": str})
dfRos.head()

In [None]:
# Inspect the roster's columns, dtypes and non-null counts.
dfRos.info()

In [None]:
# Check for duplicates: fully repeated rows, and repeated student_id values.
# student_id is the merge key used later, so a repeated id would silently
# duplicate that student in every merged table even if the rows differ elsewhere.
pd.Series(
    {
        "duplicate rows": dfRos.duplicated().sum(),
        "duplicate student_id": dfRos["student_id"].duplicated().sum(),
    }
)

In [None]:
# Load the student-to-group assignments (later cells use its student_id and
# group_name columns).
dfGroup = pd.read_excel("./data/student_groups.xlsx")
dfGroup.head()

In [None]:
# Null values are expected here because not every student forms a group.
dfGroup.info()

In [None]:
# Check for duplicates: fully repeated rows, and repeated student_id values.
# The roster is left-merged with this table on student_id, so a student_id
# listed twice here (even with different group names) would make that student
# appear twice in the merged result.
pd.Series(
    {
        "duplicate rows": dfGroup.duplicated().sum(),
        "duplicate student_id": dfGroup["student_id"].duplicated().sum(),
    }
)

In [33]:
# The group name comes from user input which can have multiple spaces and leading and trailing spaces.
# I want to clean it.
import re


def formatGroupName(text):
    """Normalize a user-entered group name.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space.

    Args:
        text: The raw group name. Missing values (None/NaN) and non-string
            scalars (e.g. a numeric cell read from Excel) are accepted as well.

    Returns:
        str: The cleaned name, or "" when the value is missing.
    """
    # Spreadsheet cells can come back as NaN (blank) or as numbers (a group
    # literally named 123); neither has .strip(), so normalize them first.
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ""
    out = str(text).strip()
    out = re.sub(r"\s+", " ", out)
    return out


# Blank cells become "" first so that every value handed to the formatter is a string.
dfGroup["group_name"] = dfGroup["group_name"].fillna("").apply(formatGroupName)

In [None]:
# Also, derive a group key (used for merging) from the group name, so that the merge is robust against accidental differences in spacing or capitalization.
# The same logic is applied to the other data sets as well.
def makeGroupKey(text):
    """Return the merge key for a group name: all whitespace removed, lower-cased."""
    # str.split() with no argument splits on any whitespace run, so re-joining
    # the pieces with "" drops every whitespace character.
    return "".join(text.split()).lower()


# Derive the merge key for every row, then list how many rows share each key.
dfGroup["group_key"] = dfGroup["group_name"].map(makeGroupKey)
dfGroup["group_key"].value_counts()

In [None]:
# Attach each student's group to the roster; how="left" keeps every roster row.
dfrs = dfRos.merge(
    dfGroup[["student_id", "group_name", "group_key"]],
    on="student_id",
    how="left",
)
dfrs.head()

In [None]:
# List students in sections 003, 006, 803, 806 that got no group from the merge.
# Turns out there are some.
# (Despite its name, filtNotNull marks the rows whose group_name IS null.)
filtSec = dfrs["sec"].isin(["003", "006", "803", "806"])
dfrmFiltSec = dfrs.loc[filtSec]
filtNotNull = dfrmFiltSec["group_name"].isna()
display(dfrmFiltSec.loc[filtNotNull])

In [None]:
# Keep only the rows where a group name was actually entered.
filtNotEmpty = dfGroup["group_name"].ne("")
dfGroupNotEmpty = dfGroup.loc[filtNotEmpty]

# Among those, show the student_id values that do not appear in the roster.
filtMismatchId = ~dfGroupNotEmpty["student_id"].isin(dfRos["student_id"])
dfGroupNotEmpty.loc[filtMismatchId]

In [None]:
# Locate the dfGroup row(s) holding the mismatched id. Let's assume that the
# problematic id is 943301355.
# In reality, I double checked with the student first.
filtStudent = dfGroup["student_id"].eq(943301355)
idx = dfGroup.loc[filtStudent].index
print(idx)

In [39]:
# Replace the mismatched student_id with the one that exists in the roster.
dfGroup.loc[idx, "student_id"] = 228248149

In [None]:
# Rerun the merge so it picks up the corrected student_id.
dfrs = dfRos.merge(
    dfGroup[["student_id", "group_name", "group_key"]],
    on="student_id",
    how="left",
)

# Recheck for students in sections 003, 006, 803, 806 without a group.
# Now there is no problem.
# (Despite its name, filtNotNull marks the rows whose group_name IS null.)
filtSec = dfrs["sec"].isin(["003", "006", "803", "806"])
dfrmFiltSec = dfrs.loc[filtSec]
filtNotNull = dfrmFiltSec["group_name"].isna()
display(dfrmFiltSec.loc[filtNotNull])

In [None]:
# Look for students in those sections whose group_name is an empty string.
# Turns out there are some.
# (Despite its name, filtNotEmpty marks the rows whose group_name IS empty.)
filtNotEmpty = dfrmFiltSec["group_name"].eq("")
dfrmFiltSec.loc[filtNotEmpty]

In [42]:
# Ask the student about the group name.
filtStu = dfGroup["student_id"] == 543046351
# A bare expression is only rendered when it is the last line of a cell,
# so show the current row explicitly before changing it.
display(dfGroup[filtStu])

# Update group_name
# Again, I need to ask the student first.
dfGroup.loc[filtStu, "group_name"] = "Sec3: no 123"
# Derive the key with the shared helper instead of typing it by hand, so the
# key always follows the same rule as every other row.
dfGroup.loc[filtStu, "group_key"] = makeGroupKey("Sec3: no 123")

In [None]:
# Rerun the merge so it picks up the corrected group name and key.
dfrs = dfRos.merge(
    dfGroup[["student_id", "group_name", "group_key"]],
    on="student_id",
    how="left",
)

# Recheck for students in sections 003, 006, 803, 806 without a group.
# Now there is no problem
# (Despite its name, filtNotNull marks the rows whose group_name IS null.)
filtSec = dfrs["sec"].isin(["003", "006", "803", "806"])
dfrmFiltSec = dfrs.loc[filtSec]
filtNotNull = dfrmFiltSec["group_name"].isna()
display(dfrmFiltSec.loc[filtNotNull])

# Recheck for empty group_name as well (filtNotEmpty marks the EMPTY rows).
filtNotEmpty = dfrmFiltSec["group_name"].eq("")
display(dfrmFiltSec.loc[filtNotEmpty])

In [None]:
# Preview the roster with group_name and group_key attached.
dfrs.head()

In [None]:
# Load the per-group grades (later cells use its group_name column).
dfGroupGrade = pd.read_excel("./data/group_grade.xlsx")
dfGroupGrade.head(3)

In [None]:
# Clean the grade sheet's group names the same way as the group list:
# blanks become "", then whitespace is trimmed and collapsed.
dfGroupGrade["group_name"] = (
    dfGroupGrade["group_name"].fillna("").apply(formatGroupName)
)
dfGroupGrade.head(3)

In [None]:
# Build the same merge key on the grade sheet and list how many rows share each key.
dfGroupGrade["group_key"] = dfGroupGrade["group_name"].map(makeGroupKey)
dfGroupGrade["group_key"].value_counts()

In [None]:
# Give every student their group's grade by joining on group_key.
# suffixes=("", "_y") leaves the roster-side column names untouched and tags the
# overlapping grade-sheet columns with "_y", which are then dropped.
dfStuGrade = (
    dfrs.merge(dfGroupGrade, on="group_key", how="left", suffixes=("", "_y"))
    .drop(columns=["sec_y", "group_name_y"])
)
dfStuGrade.head(3)

In [None]:
# Verify that every student in sections 003, 006, 803, 806 received a score.
filtSec = dfStuGrade["sec"].isin(["003", "006", "803", "806"])
dfStuGradeFiltSec = dfStuGrade.loc[filtSec]

# Rows with a null "total" would show up here; we see that there are none.
filtNull = dfStuGradeFiltSec["total"].isna()
dfStuGradeFiltSec.loc[filtNull]

In [50]:
# Export the per-student grades; index=False leaves the DataFrame index out of the file.
dfStuGrade.to_excel("out_stu_grade.xlsx", index=False)