In [None]:
##########################################

# Start here once given anonymized csvs.

###########################################
%matplotlib inline 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import LLB_custom_scripts_mac
from LLB_custom_scripts_mac import make_module_summary
import csv
from collections import Counter

In [None]:
# Load the anonymized CSVs into dataframes indexed by 'SPR Code'.
#   module_files              file stem -> module dataframe
#   progression_files         file stem -> progression dataframe
#   n_assessments_module_dict file stem -> number of assessments, taken from
#                             the name of the module dataframe's last column

# Paths for locating anonymized csvs:
anon_module_csv_path = ("/Users/Kate/Desktop/Vicky project/"
                        "Data/anonymized module csvs/")

anon_progression_csv_path = ("/Users/Kate/Desktop/Vicky project/"
                            "Data/anonymized progression csvs/")


def _load_csv_folder(folder):
    """Read every CSV file in ``folder`` into a dataframe indexed by 'SPR Code'.

    Hidden files (e.g. '.DS_Store') and anything without a '.csv' extension
    are skipped, so stray files in the folder cannot break ``pd.read_csv``.
    Files are visited in sorted order so the resulting dictionary order is
    reproducible between runs and machines.

    Returns a dict mapping the file name without its extension to the
    dataframe read from that file.
    """
    frames = {}
    for filename in sorted(os.listdir(folder)):
        stem, extension = os.path.splitext(filename)
        if filename.startswith('.') or extension.lower() != '.csv':
            continue
        frame = pd.read_csv(os.path.join(folder, filename))
        frames[stem] = frame.set_index('SPR Code')
    return frames


module_files = _load_csv_folder(anon_module_csv_path)
progression_files = _load_csv_folder(anon_progression_csv_path)

# The second space-separated token of the last column name is kept (as a
# string) as the number of assessments.
# NOTE(review): assumes the last column is named like 'Assessment <n> ...' - confirm.
n_assessments_module_dict = {
    module_name: frame.columns[-1].split(' ')[1]
    for module_name, frame in module_files.items()
}

In [None]:
# De-duplicate every module dataframe on its index: when an index label
# occurs more than once, only the first row carrying it is retained.
for dfname in module_files:
    frame = module_files[dfname]
    is_repeat = frame.index.duplicated(keep='first')
    module_files[dfname] = frame[~is_repeat]

In [None]:
# Keep only the rows whose 'Result' is filled in (students with a blank
# result are removed from every module dataframe).
for dfname in module_files:
    frame = module_files[dfname]
    has_result = frame['Result'].notna()
    module_files[dfname] = frame.loc[has_result]

In [None]:
# Make a pass/fail flag column for each individual assessment.
#   'P'  <- grade is 'P' or 'LP'
#   'F'  <- grade is 'F', 'W', 'FR', 'DR' or 'LF'
#   NaN  <- any other grade (including missing); each such cell is reported
#           with a printed message so it can be dealt with.
# The flags are computed column-wise with `isin` instead of cell by cell, and
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.

PASS_GRADES = ['P', 'LP']
FAIL_GRADES = ['F', 'W', 'FR', 'DR', 'LF']

for dfname, df in module_files.items():
    year, module, tmp = dfname.split('_')
    num_assessments = int(n_assessments_module_dict[dfname])
    assessment_numbers = range(1, num_assessments + 1)
    grade_columns = [f'Assessment {a} Grade' for a in assessment_numbers]
    flag_columns = [f'Assessment {a} P or F' for a in assessment_numbers]

    grades = df[grade_columns]
    is_pass = grades.isin(PASS_GRADES).to_numpy()
    is_fail = grades.isin(FAIL_GRADES).to_numpy()

    # Object array so 'P' / 'F' strings and NaN can share a column.
    flags = np.full(grades.shape, np.nan, dtype=object)
    flags[is_pass] = 'P'
    flags[is_fail] = 'F'
    P_F_status = pd.DataFrame(flags, index=df.index, columns=flag_columns)

    # Report unrecognised grades, one line per (student, assessment) cell,
    # in row order (argwhere walks the mask row by row).
    for row_position, _ in np.argwhere(~(is_pass | is_fail)):
        print(module, year, df.index[row_position], 'has nan grade.')

    # Dropping flag columns left over from an earlier run keeps this cell
    # re-runnable; otherwise the join would fail on overlapping column names.
    module_files[dfname] = (
        df.drop(columns=flag_columns, errors='ignore')
          .join(P_F_status, how='left')
    )

In [None]:
# For each module, get the name of next year's module and then determine
# students' repeat/pass status the following year.
#
# Three columns are (re)created on every module dataframe:
#   'Retake next year'             'Y, Attempt no. diff = <n>', 'N', or NaN
#   'Retake and pass'              True, False, or NaN
#   'Retake and fail submit stat'  'No submit', 'Failed submit', or NaN
# NaN is left wherever a value does not apply: the final year, a year with no
# following-year data, a student not present the following year, a retake that
# was passed, or the LSM module.

FINAL_YEAR = '201718'  # nothing to look ahead to for this year
RETAKE_COLUMNS = ['Retake next year', 'Retake and pass',
                  'Retake and fail submit stat']


def _attempt_number(raw_attempt):
    """Return an 'Attempt' cell as an int.

    A value that cannot be converted as a whole falls back to its first
    character. Only conversion errors are caught, so unrelated problems
    (e.g. KeyboardInterrupt) are no longer swallowed.
    """
    try:
        return int(raw_attempt)
    except (TypeError, ValueError):
        return int(raw_attempt[0])


def _following_year_key(dfname):
    """Return the ``module_files`` key of the same module one year later.

    ``dfname`` has the form '<year>_<module>_<suffix>' with a six-character
    year code such as '201415'. Both two-digit halves are incremented and
    zero-padded, giving e.g. '201415' -> '201516'.
    """
    parts = dfname.split('_')
    year_code = parts[0]
    following = (f'{year_code[:2]}'
                 f'{int(year_code[2:4]) + 1:02d}'
                 f'{int(year_code[4:]) + 1:02d}')
    return '_'.join([following, parts[1], parts[2]])


def _record_retakes(df, next_df, module):
    """Fill the retake columns of ``df`` in place from next year's ``next_df``."""
    for index in df.index:
        current_attempt_num = _attempt_number(df.at[index, 'Attempt'])
        if index not in next_df.index:
            # Student does not appear in next year's module.
            df.at[index, 'Retake next year'] = 'N'
            continue

        retake_attempt_num = _attempt_number(next_df.at[index, 'Attempt'])
        diff_in_attempts = retake_attempt_num - current_attempt_num
        df.at[index, 'Retake next year'] = f'Y, Attempt no. diff = {diff_in_attempts}'

        if next_df.at[index, 'Result'] == 'P':
            df.at[index, 'Retake and pass'] = True
            continue

        df.at[index, 'Retake and pass'] = False
        # LSM gets no submit-status breakdown.
        # NOTE(review): presumably LSM has no usable numeric 'Mark' - confirm.
        if module != 'LSM':
            # A mark of 0 on the failed retake is read as "nothing submitted".
            if next_df.at[index, 'Mark'].astype('int') == 0:
                df.at[index, 'Retake and fail submit stat'] = 'No submit'
            else:
                df.at[index, 'Retake and fail submit stat'] = 'Failed submit'


for dfname, df in module_files.items():
    current_year = dfname.split('_')[0]
    module = dfname.split('_')[1]
    # Start every retake column as all-NaN with object dtype (so it is not
    # float and can hold strings / booleans).
    for column in RETAKE_COLUMNS:
        df[column] = pd.Series(np.nan, index=df.index, dtype='object')

    if current_year == FINAL_YEAR:
        next_year = None
    else:
        next_year = _following_year_key(dfname)
        if next_year not in module_files:
            # Without next year's file there is nothing to compare against;
            # say so and leave the columns as NaN instead of raising KeyError.
            print(f'{dfname}: no data for {next_year}; retake columns left as NaN.')
            next_year = None

    if next_year is not None:
        _record_retakes(df, module_files[next_year], module)
    module_files[dfname] = df

In [None]:
# Get a visual sense of the retakes in tabular form: one column per module
# file (final year 201718 excluded), one row per value of
# 'Retake and fail submit stat' (NaN included), cells = share of students.
#
# The per-module shares are combined with a single column-wise concat so the
# row index is the union of all categories. Assigning them one at a time into
# an initially empty frame fixes the index from the first module, which
# silently drops any category that module happens not to contain.

retake_stat_shares = {
    dfname: df['Retake and fail submit stat'].value_counts(normalize=True,
                                                          dropna=False)
    for dfname, df in module_files.items()
    if dfname.split('_')[0] != '201718'
}

# pd.concat refuses an empty collection, so fall back to an empty frame.
X = pd.concat(retake_stat_shares, axis=1) if retake_stat_shares else pd.DataFrame()
X = X.reindex(sorted(X.columns), axis=1)

# X

In [None]:
## Build the per-module summary tables
#
# Note: students may be reassessed on whichever assignment they failed, in
# order to raise their final grade.
#
# Each module gets one table: rows are the metrics below, columns are the
# academic years. A column is filled with the Series returned by
# make_module_summary for the matching module file.

metric_names = [
    'N (total)',
    '% Pass (total)',
    '% Att. 1',
    '% Pass (Att. 1)',
    '% Att. 2',
    '% Pass (Att. 2)',
    '% Att. 3',
    '% Pass (Att. 3)',
    '% Retake Next Year',
    '% of Retake that Pass (total)',
    '% of Retake and Fail that Do Not Submit',
    '% of Retake and Fail that Fail Submission',
    '% Reassess (Either Asst FR or DR)',
    '% Reassess that Pass (F Otherwise)',
    '% Reassess that Pass (P Otherwise)',
    '% Reassess and final grade DR',
    '% Reassess & No Sub at Reassess',
    '% Reassess & Failed Sub at Reassess',
    'Asst. 1: % Pass',
    'Asst. 1: % F (Not DR or FR)',
    'Asst. 1: % W',
    'Asst. 1: % FR',
    'Asst. 1: % DR',
    'Asst. 1: % LF',
    'Asst. 1: % No Sub',
    'Asst. 1: % Failed Sub',
    'Asst. 1 F: % No Sub',
    'Asst. 1 F: % Failed Sub',
    'Asst. 1: % FR that Pass (F Otherwise)',
    'Asst. 1: % DR that Pass (F Otherwise)',
    'Asst. 1: % Submit that Pass',
    'Asst. 2: % Pass',
    'Asst. 2: % F (Not DR or FR)',
    'Asst. 2: % W',
    'Asst. 2: % FR',
    'Asst. 2: % DR',
    'Asst. 2: % LF',
    'Asst. 2: % No Sub',
    'Asst. 2: % Failed Sub',
    'Asst. 2 F: % No Sub',
    'Asst. 2 F: % Failed Sub',
    'Asst. 2: % FR that Pass (F Otherwise)',
    'Asst. 2: % DR that Pass (F Otherwise)',
    'Asst. 2: % Submit that Pass',
]
module_names = ['LSM', 'CONAD', 'CONTRACT', 'TORT', 'CRIMINAL',
                'LAND', 'EQUITY', 'EU', 'LT1', 'LT2']
year_names = ['201415', '201516', '201617', '201718']

# One empty metrics-by-year table per module.
module_summary = {
    module: pd.DataFrame(index=metric_names, columns=year_names)
    for module in module_names
}

# Fill in the column for every (year, module) file that was loaded.
for dfname, df in module_files.items():
    num_assessments = int(n_assessments_module_dict[dfname])
    year, module, tmp = dfname.split('_')
    ser = make_module_summary(df, dfname, num_assessments)
    summary_table = module_summary[module]
    summary_table[year] = ser

In [None]:
# Export the module summaries to a single Excel workbook, one sheet per
# module, with the sheet named after the module.
outputpath = "/Users/Kate/Desktop/Vicky project/Data/"
workbook_path = f'{outputpath}/Module Summaries.xlsx'
with pd.ExcelWriter(workbook_path) as writer:
    for sheet_label, summary_table in module_summary.items():
        summary_table.to_excel(writer, sheet_name=f'{sheet_label}')

In [None]:
# Make a graph that shows how pass rates vary by module and year.

PASS_RATE_METRIC = '% Pass (total)'

# Table of pass rates indexed by (module, year). It is built with a single
# concat over all modules rather than by growing a frame inside a loop, which
# is quadratic and starts from an empty DataFrame (deprecated concat behaviour
# in recent pandas).
graph1_df = pd.concat(
    {module: summary.loc[PASS_RATE_METRIC, :]
     for module, summary in module_summary.items()}
).to_frame(name=PASS_RATE_METRIC)

# Make graph: one bar per (module, year), coloured by module.
colors_dict = {'LSM': 'red', 'CONAD': 'blue', 'CONTRACT': 'green', 'TORT': 'black', 'CRIMINAL': 'yellow',
               'LAND': 'cyan', 'EQUITY': 'magenta', 'EU': 'red', 'LT1': 'blue', 'LT2': 'green'}

graph_colors = [colors_dict[module] for module in graph1_df.index.get_level_values(0)]

values = graph1_df[PASS_RATE_METRIC].values.tolist()
names_tuples = graph1_df.index.values.tolist()
names = [f'{module} {year}' for (module, year) in names_tuples]

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(names, values, color=graph_colors)
plt.xticks(fontsize=12, rotation=90)
ax.set_ylabel('Pass rate (%)', fontsize=20)
ax.set_title('Pass rate by module and year')
plt.show()

In [None]:
# test for effect of module and year

In [None]:
# TODO: make the graph above collapsed across years; also collapsed across modules for first year

In [None]:
# submission v non-submission collapsed across all first year assignments (before reassessment)

# Make a new df with all assignments first year

In [None]:
# Which set of final classes do students most commonly take before they
# disappear from the dataset early?
# Assumption: students are admitted at a relatively steady rate each year.
#
# Two student-by-module tables are assembled here (one column per
# '<year> <module>'): attempt numbers and results. They are then merged
# cell-wise into '<attempt> <result>' strings.

student_attempts = pd.DataFrame()
student_results = pd.DataFrame()
for dfname, df in module_files.items():
    year, module, tmp = dfname.split('_')
    column_label = f'{year} {module}'
    student_attempts = (
        student_attempts
        .join(df[['Attempt']], how='outer')
        .rename(columns={'Attempt': column_label})
    )
    student_results = (
        student_results
        .join(df[['Result']], how='outer')
        .rename(columns={'Result': column_label})
    )

# Alphabetical column order, i.e. sorted by year first.
student_attempts = student_attempts.reindex(columns=sorted(student_attempts.columns))
student_results = student_results.reindex(columns=sorted(student_results.columns))

student_zipped = student_attempts.astype('str') + ' ' + student_results

In [None]:
# Spot-check one student: show the non-missing '<attempt> <result>' entries
# for the row at position 14 (presumably an arbitrary example row).
student_zipped.iloc[14].dropna()

In [None]:
# For every student, keep the full history as a list of
# (column label, '<attempt> <result>') pairs, skipping modules with no entry.
student_records = {}
for student_id, history in student_zipped.iterrows():
    taken = history.dropna()
    student_records[student_id] = list(
        zip(taken.index.tolist(), taken.values.tolist()))
student_records

In [None]:
#for index, lst in student_records.items():
    

In [None]:
# Find students whose last recorded module is a first- or second-year course
# taken before 201718, i.e. students who appear to have left early.
#   years_dict  student -> {last module column: result in that module}
#   years_list  (last module column, result) for each such student
#   num_leave   number of such students
# "Last" is the right-most non-missing column of student_attempts.
# NOTE(review): this relies on those columns being sorted by year - confirm.
years_dict = {}
years_list = []
num_leave = 0
first_and_second_yr_courses = ['LSM', 'CONAD', 'CONTRACT', 'TORT',
                               'LAND', 'EQUITY', 'CRIMINAL']
for index, row in student_attempts.iterrows():
    recorded = row.dropna()
    if recorded.empty:
        # No attempt recorded in any module: there is no "last entry".
        continue
    last_entry = recorded.index[-1]
    ends_on_early_course = any(course in last_entry
                               for course in first_and_second_yr_courses)
    if ends_on_early_course and '201718' not in last_entry:
        last_result = student_results.at[index, last_entry]
        # The original `(last_entry: ...)` was a syntax error; a one-item
        # dict is the assumed intent of the `key: value` form.
        years_dict[index] = {last_entry: last_result}
        years_list.append((last_entry, last_result))
        num_leave += 1
len(years_dict)

In [None]:
# Tally how many students share each (last module column, result) pair.
Counter(years_list)

In [None]:
# Split the early leavers by whether their last module was a first-year
# course; everyone else is counted as leaving after a second-year course.
first_year_courses = ['LSM', 'CONTRACT', 'CONAD', 'TORT']
exit_counts = Counter(years_list)
num_quit_after_first_year_course = sum(
    n_students
    for exit_point, n_students in exit_counts.items()
    if any(course in exit_point[0] for course in first_year_courses)
)
num_quit_after_second_year_course = (
    sum(exit_counts.values()) - num_quit_after_first_year_course
)

In [None]:
# Number of early leavers whose last recorded module was a first-year course.
num_quit_after_first_year_course

In [None]:
# Number of early leavers whose last recorded module was not a first-year course.
num_quit_after_second_year_course