# Zoom grader

Loads zoom reports from a pre-defined folder, and calculates attendance.

### Recently identified complications

Unlike in Neuro class, some people in biosem didn't put their emails on zoom, or only did it late.

Plan:

1. Load data from BIP
1. Process Zoom logs, ignore repetitions, summarize
1. Filter only one class
1. Read the actual list of students (separate for every class)
1. Use a substitution table "name --> email" (separate file, one for all classes) to either substitute entries, or disable entries (special "command" for making it ignore an entry)
1. Using email as id, connect two databases. Output summary, as well as a list of failed entries (present on Zoom, but absent on BIP).

Piazza grader:

1. Load a list of Piazza people
2. Go through saved Piazza contributions, tabulate participation
3. Read attendance summary from part 1 (above)
4. Output a full table (screen & csv)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

In [None]:
# Folder that holds every input file read below (Zoom csv logs, class rosters,
# alts.csv, Piazza.list and saved Piazza pages). File paths are built with plain
# string concatenation (folder_name + fname), so the trailing slash is required.
folder_name = '../../data/attendance/'

In [None]:
# Read attendance data
#
# Every csv in the folder whose first row has a 'Topic' column is treated as a
# Zoom report. Row 0 of the file describes the meeting (topic, start time); the
# participant table starts at header row 2. A meeting is identified by
# topic + date, and only the first file seen for a meeting is loaded.

file_list = os.listdir(folder_name)
print("Total files: ",len(file_list))

processed_set = set()   # topic+date keys of meetings already loaded
processed_dict = {}     # topic+date key -> file that supplied that meeting
frames = []             # one participant table per meeting, concatenated once at the end
for fname in file_list:
    if fname[-3:]!='csv':
        continue # Ignore everything that is not a csv
    fullname = folder_name + fname
    header = pd.read_csv(fullname, header=0, nrows=1) # Attempt to read the header
    if 'Topic' not in header.columns: # Not a zoom log, skip this one
        continue
    name = header['Topic'][0]
    datestring = header['Start Time'][0][:10]  # Keep only the first 10 characters (the date part)
    meeting_key = name + datestring
    if meeting_key in processed_set:
        print(f"Duplicated meeting: {meeting_key} ({fname}) already saved as {processed_dict[meeting_key]}")
        continue  # This meeting was already loaded from another file
    processed_set.add(meeting_key)
    processed_dict[meeting_key] = fname

    data = pd.read_csv(fullname, header=2)
    data['Date'] = datestring
    data['Meeting'] = name
    data['User Email'] = data['User Email'].fillna('none') # NaNs are ignored by aggregation below
    frames.append(data)

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on every
# iteration); a single concat does the same job. concat rejects an empty list,
# so fall back to an empty frame when no Zoom log was found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print('Meeting files:', len(processed_set))

In [None]:
# Give the Zoom columns short lowercase names, then fold meeting-title synonyms
# into one label per class.

column_names = {
    'User Email': 'email',
    'Total Duration (Minutes)': 'minutes',
    'Name (Original Name)': 'name',
    'Meeting': 'meeting',
    'Date': 'date',
}
df = df.rename(columns=column_names)

meeting_dict = {
    'Biosem Zoom': 'Biosem',
    'Biosem_spring_2021': 'Biosem',
    'Neuroscience': 'Neuro',
    'Computational Neuro': 'Comput',
}
df['meeting'] = df['meeting'].replace(meeting_dict)

# Title-case the participant names so the same person is spelled consistently
df['name'] = df['name'].str.title()

In [None]:
# Inexplicably, some total participation counts are stored as strings, not numbers
# (Zoom even emitted the literal 'Yes' in this column). So correct that.
#
# to_numeric(errors='coerce') turns anything unparseable ('Yes', blanks) into NaN,
# which then becomes 0 and is dropped by the >0 filter below. Only the minutes
# column is touched; the rest of the row is left alone.

minutes = pd.to_numeric(df['minutes'], errors='coerce')
df['minutes'] = minutes.fillna(0).astype(int)  # astype(int) truncates like int()
df = df[df['minutes'] > 0].copy()  # Remove weird empty entries; copy so the column below is added to a real frame

# df['check'] = 1*(df.minutes>40) # Simple attendance
# 0.3 credit for more than 30 minutes, a further 0.7 (1.0 in total) for more than 50
df['check'] = 0.3*(df.minutes>30) + 0.7*(df.minutes>50) # Full attendance and late classes

In [None]:
# Which distinct meeting labels are present after the renaming?
set(df['meeting'])

In [None]:
# Histogram of per-record participation length, one panel per class
plt.figure(figsize=(10, 3))
for panel, label in ((121, 'Biosem'), (122, 'Neuro')):
    plt.subplot(panel)
    plt.hist(df.loc[df.meeting == label].minutes.values, bins=50)
    plt.title(label)

In [None]:
# Everything below works on a single class: keep only its rows

target_meeting = 'Neuro' # Options: {'Biosem', 'Comput', 'Neuro'}
df = df.loc[df['meeting'] == target_meeting]

In [None]:
# Check individual records if needed: one row per meeting date for every
# participant whose name matches `token` (regex anchored at the start of the
# name). Dates without a matching record get check = 0.
token = 'Ell'
all_meetings = df.groupby('date').agg({'name': 'count'}).reset_index()
partial = df[df['name'].str.match(token)]
out = pd.merge(all_meetings[['date']], partial, on='date', how='left')
out = out[['date', 'name', 'email', 'minutes', 'check']]
out['check'] = out['check'].fillna(0)
print(sum(out['check']))
out

In [None]:
# Load the official roster for the selected class. The roster file must be
# named after the meeting (e.g. Neuro.csv) and live in the data folder.

people = pd.read_csv(f"{folder_name}{target_meeting}.csv", header=0, sep=',')
# Collapse runs of whitespace inside the column headers, just in case
people.columns = [' '.join(col.split()) for col in people.columns]
# print(people.columns)
people['name'] = people['FIRST NAME'] + ' ' + people['LAST NAME']

# drop_duplicates is a left-over from the "All meetings at once" pipeline, kept for now
people = people.drop_duplicates().reset_index()
people = people.rename(columns={'ID#': 'id', 'Email': 'email'})
people = people[['id', 'email', 'name']]
print(people.columns)
# people.head()

In [None]:
# Troubleshooting: A full list of meetings with their attendance.
# Check if the plot makes sense.
#
# Step 1 adds up the minutes of each person per meeting (people who reconnect
# have several rows), step 2 counts the people with more than 10 minutes.
# The string 'sum' is used instead of the builtin: passing builtins to agg is
# deprecated in recent pandas, and 'sum' keeps the current behavior.

dfm = df.groupby(['date','meeting','name']).agg({'minutes': 'sum'})
dfm['minutes'] = 1*(dfm['minutes']>10)  # 1 if this person counts as present, else 0
dfm = dfm.groupby(['date', 'meeting']).agg({'minutes': 'sum'}).reset_index()
dfm = dfm.rename(columns={'minutes':'people'})
dfm['date'] = pd.to_datetime(dfm['date'])
print("Meetings so far:", dfm.shape[0])

plt.figure(figsize=(9,2))
plt.plot(dfm.date.dt.dayofyear, dfm.people, '.');  # x axis is the day of the year
plt.xlabel('Meeting');
plt.ylabel('People');

# print(dfm)

In [None]:
# Check if there are any entries without a usable email (shorter than 10
# characters, which includes the 'none' placeholder filled in at load time).
# A boolean Series is used as the mask: a plain Python list is interpreted as a
# column selection when it is empty. astype(str) keeps non-string values from
# raising and makes them show up here as short entries.
df[df['email'].astype(str).str.len() < 10]

In [None]:
# Apply the manually maintained table of alternative emails (alts.csv)

df_alts = pd.read_csv(folder_name + 'alts.csv', header=0, sep=',')

# Attach the official email where the table has one. Rows whose real_email is
# the literal 'none' are users dropped by hand (e.g. the instructor).
df_fixed = df.merge(df_alts, on='email', how='left', suffixes=['', '_r'])
df_fixed = df_fixed.drop(columns=['name_r'])
df_fixed = df_fixed[df_fixed['real_email'] != 'none']

# Where an official email is known, it replaces the one seen on Zoom
has_official = df_fixed['real_email'].notna()
df_fixed.loc[has_official, 'email'] = df_fixed.loc[has_official, 'real_email']
df_fixed = df_fixed.drop(columns='real_email')

In [None]:
# Find people with emails that aren't on the official list.
# Output them, to manually create a list of alt-emails (rows to add to alts.csv).
#
# 'sum' is passed as a string: handing the builtin sum to agg is deprecated in
# recent pandas, and the string keeps the current behavior.

df_lost = (df_fixed
           .groupby(['name','email'])
           .agg({'minutes': 'sum'})
           .reset_index()
           .merge(people, on='email', how='left', suffixes=['', '_r']) # Ignore official name (_r)
           .drop(columns='name_r')
          )
df_lost = df_lost[df_lost['id'].isna()]  # Only keep unrecognized users (no roster id)
print('Rows found:', len(df_lost))
#print('\n'.join([a for a in df_lost['name'] + ',' + df_lost['email']])) # Output csv

#df_lost # Output nice human-readable form

In [None]:
# Replace Zoom display names with the official roster names (matched by email).
# The Zoom name (name_l), the roster id and Zoom's 'Guest' column are discarded.
# The filter that would drop people who are not on bip is currently switched off,
# so unmatched rows stay in the table with a missing name.
df_fixed = df_fixed.merge(people, on='email', how='left', suffixes=['_l', ''])
df_fixed = df_fixed.drop(columns=['name_l', 'id', 'Guest'])
# df_fixed = df_fixed.query('name.notna()', engine='python') # Drop people who are not on bip

df_fixed

In [None]:
# Build a summary: total attendance credit ('check') per student.
# 'sum' is passed as a string because handing the builtin sum to agg is
# deprecated in recent pandas; the string keeps the current behavior.

dfs = (df_fixed
       .groupby(['name','email'])
       .agg({'check': 'sum'})
       .reset_index()
      )

# dfs.sort_values(by='check')
# dfs

In [None]:
# **Troubleshooter**: line up one student's records against the list of meeting dates.
# `token` is compared with the first characters of the official name. The final
# filter keeps only dates that have a record for the student.

token = 'yyy'
df_meetings = pd.DataFrame({'date': list(set(df_fixed['date'].values))})
student_rows = df_fixed.loc[df_fixed['name'].str.startswith(token, na=False)]
out = df_meetings.merge(student_rows, on='date', how='left').sort_values(by='date')
out = out[out['minutes'].notna()]
out

# Part 2: Piazza grader

In [None]:
# Read all students enrolled on Piazza, with their Piazza names
# Here make sure the list has only 1 email, and this email is correct. Some people like to include
# more than one email for some reason...
#
# Each line is "<name words...> <email>": the last space-separated token is the
# email, everything before it is the name. Lines with an empty name are skipped.

people_rows = []
with open(folder_name + 'Piazza.list', 'r', encoding='utf-8') as f:  # closed automatically
    for line in f:
        *name_parts, email = line.strip().split(' ')
        name = ' '.join(name_parts)
        if name != '': # Ignore unregistered users
            people_rows.append({'name': name, 'email': email})

# Build the frame once; DataFrame.append was removed in pandas 2.0
people_piazza = pd.DataFrame(people_rows, columns=['name', 'email'])

# people_piazza

In [None]:
# Read all piazza pages. All of them should be saved as txt files.
# NOTE: There's no safety check on the content of TXT files, so no other TXT files are allowed in this folder!
# In each page, identify responses, and count special marks.
#
# The first line of a page is used as the assignment title. The page is split on
# `splitter`; chunk 0 is the assignment itself, every later chunk is a response.
# A response is credited to every Piazza name that occurs anywhere in its text.

cool = '🔥💡💎'       # any of these marks a response as 'good'
rejected = '🛑'        # marks a response as 'bad'; takes priority over 'good'
splitter = 'Resolved Unresolved'

piazza_rows = []
file_list = os.listdir(folder_name)
for fname in file_list:
    if fname[-3:]!='txt':
        continue # Ignore everything that is not a txt file
    with open(folder_name + fname, 'r', encoding='utf-8') as f:
        s = f.read()
    title = s.partition('\n')[0]  # Whole first line, even if the file has no newline
    print(fname, ':', title)

    messages = s.split(splitter)[1:]  # Skip the homework itself (number 0)
    for message in messages:
        # The stored text starts 9 characters after the start of 'days ago';
        # if that marker is absent, keep the chunk from its beginning.
        stamp = message.find('days ago')
        first = stamp + 9 if stamp >= 0 else 0
        # ...and ends before the '\nhelpful! ' footer, or at the end of the
        # chunk when there is no footer.
        last = message.find('\nhelpful! ')
        if last < 0:
            last = len(message)

        if any(emoji in message for emoji in rejected):
            grade = 'bad'
        elif any(emoji in message for emoji in cool):
            grade = 'good'
        else:
            grade = ''

        for name in people_piazza.name:
            if name in message:
                piazza_rows.append({'name': name, 'work': title, 'grade': grade,
                                    'message': message[first:last]})

# Build the frame once; DataFrame.append was removed in pandas 2.0
df_piazza = pd.DataFrame(piazza_rows, columns=['name', 'work', 'grade', 'message'])

In [None]:
# Inspect all recorded Piazza responses of one student
# (regex matched against the start of the Piazza name)

q = df_piazza.loc[df_piazza['name'].str.match('John')]
q

#for i in range(q.shape[0]):
#    print(q.message.iloc[i], end='\n---------\n')

In [None]:
# Per-student Piazza summary: number of responses and how many were marked
# good / bad, left-joined onto the full Piazza roster so that students without
# any response appear with zeros.

df_piazza['good'] = df_piazza['grade'].str.match('good')
df_piazza['bad'] = df_piazza['grade'].str.match('bad')

per_student = df_piazza.groupby(['name']).agg({'work': 'count', 'good': 'sum', 'bad': 'sum'})
per_student = per_student.reset_index().rename(columns={'work': 'total'})
dfsp = people_piazza.merge(per_student, on='name', how='left')

# Students with no responses have no row in the summary: zero their counters
no_posts = dfsp['total'].isna()
dfsp.loc[no_posts, ['total', 'good', 'bad']] = 0
dfsp = dfsp.astype({'total': int, 'good': int, 'bad': int})

# dfsp

In [None]:
# Piazza summary row(s) of one student (regex matched at the start of the name)
dfsp.loc[dfsp['name'].str.match('John')]

In [None]:
# Full table: Piazza summary joined with the Zoom attendance summary on email.
# Name columns get the suffixes _piazza / _zoom.
# NOTE(review): merge defaults to an inner join, so a student with no row in the
# attendance summary does not appear in df_full - confirm this is intended.

df_full = pd.merge(dfsp, dfs, on='email', suffixes=('_piazza', '_zoom'))
with pd.option_context('display.max_rows', 14000, 'display.width', 1000):
    # print(df_full)
    pass

In [None]:
# How many classes / homeworks can be missed before it becomes a problem?
# For Neuro this semester we seem to have 34 classes and 21 homework

total_classes, total_homeworks = 34, 21

print('Can miss classes, and still pass:', 0.2*total_classes)
print('Can miss classes, and still get a B:', 0.1*total_classes)
print('Can miss homeworks, and still pass:', 0.2*total_homeworks)
print('Can miss homeworks, and still get a B:', 0.1*total_homeworks)

In [None]:
# At-risk students: fewer accepted homeworks (total minus rejected) than
# works_so_far-1, or less attendance credit than classes_so_far-3

classes_so_far = 17
works_so_far = 10
too_few_works = (df_full['total'] - df_full['bad']) < works_so_far - 1
too_few_classes = df_full['check'] < classes_so_far - 3
df_at_risk = df_full[too_few_works | too_few_classes]

# print(','.join([e for e in df_at_risk.email]))
#df_at_risk

In [None]:
# Grades
#
# Everybody starts at B. Attendance and homework are judged separately and the
# worse of the two decides:
#   C - attendance credit below classes_so_far-3, or fewer than works_so_far-2 homeworks
#   F - attendance credit below classes_so_far-6, or fewer than works_so_far-4 homeworks
# The F rule is applied after the C rule, so a C-level result in one category can
# never overwrite an F earned in the other.
# Students still at B are promoted by the number of 'good' homeworks:
# exactly 2 -> B+, more than 2 -> A.

c_level = (df_full.check < classes_so_far-3) | (df_full.total < works_so_far-2)
f_level = (df_full.check < classes_so_far-6) | (df_full.total < works_so_far-4)

df_full['grade'] = 'B'
df_full['grade'] = df_full['grade'].mask(c_level, 'C')
df_full['grade'] = df_full['grade'].mask(f_level, 'F')

still_b = df_full['grade'] == 'B'
df_full['grade'] = df_full['grade'].mask(still_b & (df_full.good > 1), 'B+')
df_full['grade'] = df_full['grade'].mask(still_b & (df_full.good > 2), 'A')
#df_full.head(200)

In [None]:
def response(df_full, df_piazza, i, n_classes=None, n_works=None):
    """Print a progress report for the student in row ``i`` of ``df_full``.

    Args:
        df_full: summary table; the row must provide ``name_piazza``, ``check``,
            ``total``, ``good`` and ``grade``.
        df_piazza: one row per recorded Piazza response, with at least the
            columns ``name`` and ``work``.
        i: positional index of the student's row in ``df_full``.
        n_classes: number of classes held so far; defaults to the notebook
            global ``classes_so_far``.
        n_works: number of homeworks assigned so far; defaults to the notebook
            global ``works_so_far``.

    Returns:
        None. The report is written to stdout.
    """
    if n_classes is None:
        n_classes = classes_so_far
    if n_works is None:
        n_works = works_so_far

    sec = df_full.iloc[i]
    name = sec.name_piazza

    # Works nobody has a response for cannot be listed; among the known ones,
    # report (in sorted order) those this student has no response to. A plain set
    # difference also covers students without any response at all.
    all_works = sorted(df_piazza['work'].dropna().unique())
    submitted = set(df_piazza.loc[df_piazza['name'] == name, 'work'])
    missing_work = [work for work in all_works if work not in submitted]

    def remove_double_spaces(s):
        # Turn the indented multi-line literal below into one single-spaced line
        s = s.replace('\n', ' ')
        return ' '.join(part for part in s.split(' ') if part)

    print(f"Dear {name.split()[0]},")
    print(f"you have attended {int(np.round(sec.check))} classes ", end='')
    print(f"(out of {n_classes} that we had so far), ")
    print(f"and submitted {sec.total} homeworks (out of {n_works} that we had)")
    if sec.good>0:
        print(f"{sec.good} of these homeworks were really good!! (and others were also fine :)")
    if sec.total < n_works:
        print(f"(Works that are missing: {'; '.join(missing_work)})")
    print(f"With this in mind, your current technically projected grade is {sec.grade}")
    if sec.total < n_works:
        message = """If you want to improve, you can submit missing homeworks. 
              For missed lab work, just do the lab, and submit your work, as described in the assignment. 
              For missed questions, at this point the assignment is a bit different.
              You will need to answer the following question: 
              'What is the most interesting thing that you learned that week, and what makes it interesting to you?''. 
              I would expect about a paragraph of text, with explicit references to what you learned from 
              videos and readings, and to what we discussed in class. Additional sources are also welcome!"""
        print(remove_double_spaces(message))

In [None]:
# Report for the first student whose Piazza name matches 'Ell' (regex at the start of the name)
matches = df_full['name_piazza'].str.match('Ell')
i = np.where(matches)[0][0]
response(df_full, df_piazza, i)

In [None]:
# Print the report of every student in the table, separated by blank lines
for i in range(len(df_full)):
    response(df_full, df_piazza, i)
    print("\n")

In [None]:
# Output the summary csv
#recognized.to_csv(folder_name + '../attendance_summary.csv', index=False)