# Zoom grader

Loads Zoom reports from a pre-defined folder and calculates attendance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

In [None]:
# Privacy switch: cells that print student names check this flag first
# and print nothing when it is truthy.
# Set this global var to 1 before pushing to git (preserves anonymity)
hide_names = 0

In [None]:
# Folder holding the Zoom CSV reports and the helper lists read below
# (people.txt, alt-people.txt, exclude.txt). Keep the trailing slash:
# file paths are built from it by plain string concatenation.
folder_name = '../../data/attendance/'

In [None]:
# Read all attendance data.
# Every *.csv file in the folder is treated as one Zoom report: a one-row
# meeting summary (header line + values) followed by the participant table,
# which has its own header line.

file_list = sorted(os.listdir(folder_name))   # Sorted, so the row order is reproducible
print("Total files: ",len(file_list))

frames = []                                   # One participant table per report
for fname in file_list:
    if not fname.endswith('.csv'):
        continue # Ignore everything that is not a zoom log
    fullname = os.path.join(folder_name, fname)

    # Meeting-level summary: topic and start time
    header = pd.read_csv(fullname, header=0, nrows=1)
    name = header['Topic'][0]
    datestring = header['Start Time'][0][:10]  # First 10 chars only (presumably MM/DD/YYYY)

    # Participant table, tagged with the meeting it belongs to
    data = pd.read_csv(fullname, header=2)
    data['Date'] = datestring
    data['Meeting'] = name
    data['User Email'] = data['User Email'].fillna('none') # NaNs are ignored by aggregation below

    frames.append(data)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
# and growing a frame inside a loop copies all accumulated rows each time.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Short lowercase column names for everything that is used further down
df = df.rename(columns={
    'User Email': 'email',
    'Total Duration (Minutes)': 'minutes',
    'Name (Original Name)': 'name',
    'Meeting': 'meeting',
    'Date': 'date',
})

# Different Zoom topics that stand for the same class are collapsed
# into two labels: 'Neuro' and 'Biosem'
meeting_dict = {
    'Neuro Lab - Sep 10': 'Neuro',
    "Arseny Khakhalin's Zoom Meeting": 'Neuro',
    "Intro Neuro": 'Neuro',
    'Biosem 00': 'Biosem',
    'Bard Biosem': 'Biosem',
}
df['meeting'] = df['meeting'].replace(meeting_dict)

df['name'] = df['name'].str.title()      # Capitalize (for consistency)
df['check'] = (df['minutes'] > 30) * 1   # 1 for rows longer than 30 minutes, else 0

In [None]:
# Check individual records if needed
#df.loc[df.name.str[:3]=='Han']

In [None]:
# Read the list of students (tab-separated file with a header row)
people = pd.read_csv(folder_name + 'people.txt', sep='\t', header=0)
people = (people
          .assign(name=people['first'] + ' ' + people['last'])  # Full name: "first last"
          .drop_duplicates()
          .reset_index())   # Old index is kept as an 'index' column; later cells drop it
people.columns

In [None]:
# All names
#for i in range(people.shape[0]):
#    print(f"{people.loc[i, 'first']} {people.loc[i,'last']}", end=' | ')

In [None]:
# A list of meetings, with the number of participants in each

# Total minutes per name within each meeting (one name may have several rows).
# The aggregation is named as a string: passing the builtin `sum` to
# groupby .agg is deprecated in recent pandas.
dfm = df.groupby(['date','meeting','name']).agg({'minutes': 'sum'})
dfm['minutes'] = 1*(dfm['minutes']>10)          # 1 if present for more than 10 minutes, else 0
dfm = dfm.groupby(['date', 'meeting']).agg({'minutes': 'sum'}).reset_index()
dfm = dfm.rename(columns={'minutes':'people'})  # The summed 0/1 flags are a head count
dfm['date'] = pd.to_datetime(dfm['date'])

# One dot per meeting; the x-axis is the day of the year of the meeting
plt.figure(figsize=(9,2))
plt.plot(dfm['date'].dt.dayofyear, dfm['people'], '.');
plt.xlabel('Meeting');
plt.ylabel('People');

In [None]:
# Looking at attendance of selected students:
# all rows whose name starts with `key`, ordered by date
key = 'Dan'
starts_with_key = df['name'].str.startswith(key, na=False)
res = df[starts_with_key].sort_values(by='date')
print(f'{res.shape[0]} rows')
res

In [None]:
# All possible dates
# sorted(df.date.unique().tolist())

In [None]:
# Looking at all participants on a selected date
# df.loc[df.date=='10/26/2020'].sort_values(by='name')

In [None]:
# Iteratively build summaries

# Remove leading and ending spaces BEFORE grouping, so that rows of the same
# person that differ only in whitespace are summed together
# (otherwise each variant is tested against the threshold separately).
dfs = df.assign(name=df['name'].str.strip())

# Total minutes per person per meeting. The aggregation is named as a string:
# passing the builtin `sum` to groupby .agg is deprecated in recent pandas.
dfs = dfs.groupby(['date', 'meeting', 'name', 'email']).agg({'minutes' : 'sum'})
dfs = dfs.reset_index()                                 # Get rid of hierarchical indices
dfs['check'] = 1*(dfs['minutes']>30)                    # Participation threshold

# Number of attended meetings per person, for each meeting type
dfs = dfs.groupby(['name', 'meeting', 'email']).agg({'check': 'sum'}).reset_index()

# Normalize names based on the official record where email is available
dfs = dfs.merge(people, on='email', how='left', suffixes=('','_r'))
ind = (dfs['email'] != 'none') & (dfs['name_r'].notna())
dfs.loc[ind,'name'] = dfs.loc[ind,'name_r']
dfs = dfs.drop(columns=['name_r','index','first','last'])  # Helper columns that came from `people`
# print(dfs.loc[dfs.name.str[:3]=='Dan'])

In [None]:
# Optional preview of the per-person summary (printing is currently switched off).
# Nothing is shown at all when names have to stay hidden.
if not hide_names:
    with pd.option_context('display.max_rows', 1400, 'display.width', 1000):
        # print(dfs)
        pass

In [None]:
# Use a manually created list of synonyms to unify spelling
# Then look for official emails

# alt-people.txt: columns used here are `name` (spelling seen in Zoom) and `translation`
alts = pd.read_csv(folder_name + 'alt-people.txt', header=0, sep=',') # A dict of alt-names
# exclude.txt: names (column `name`) that should not be reported as unrecognized
excluded = pd.read_csv(folder_name + 'exclude.txt', header=0, sep=',')

# merge() returns a new frame, so dfs itself is left untouched
recognized = dfs.merge(alts, on='name', how='left')

# Replace alt-names with real names, where alt-names were found
ind = recognized['translation'].notnull()
recognized.loc[ind, 'name'] = recognized.loc[ind,'translation']

# Combine different copies of the same person, then re-link emails.
# The aggregation is named as a string: passing the builtin `sum` to
# groupby .agg is deprecated in recent pandas.
recognized = (recognized
              .drop(columns='translation')
              .groupby(['name','meeting']).agg({'check': 'sum'}).reset_index()
              .merge(people, on='name', how='left', suffixes=('', '_r'))
              .drop(columns=['first', 'last', 'index'])
              )

# Sanity checks row
# print(alts.loc[alts.name.str[:3]=='Equ'])

# Names that still have no official email and are not on the exclusion list
lost = recognized[recognized['email'].isnull()]
lost = (lost
        .merge(excluded, on='name', how='left', indicator=True)
        .query('_merge=="left_only"')           # Keep only names absent from `excluded`
        .drop(columns=['_merge','email']))

with pd.option_context('display.max_rows', 1400, 'display.width', 1000):
    if not hide_names:
        print(lost)

In [None]:
# Full summary, ordered by meeting. It contains names and emails, so it is
# printed only when hide_names is off.
if not hide_names:
    with pd.option_context('display.max_rows', 14000, 'display.width', 1000):
        print(recognized.sort_values('meeting'))

In [None]:
# Output the summary csv (written one level above the attendance folder)
summary_path = folder_name + '../attendance_summary.csv'
recognized.to_csv(summary_path, index=False)

In [None]:
# print('\n'.join(df2.loc[df2.meeting=="Neuro"].sort_values(by='name').name.tolist()))

In [None]:
# df2.loc[df2.meeting=='Neuro'].sort_values(by='name').drop(columns=['meeting', 'email'])