# Conditional Class Performance

Basic Modules:

In [1]:
import pandas
import numpy
import time
import copy
from itertools import combinations

#### Import Data File:
Each class column is an indicator: 1 means the student failed the class, 0 means the student passed it.

In [2]:
# Load the raw class-outcome table from the CSV export.
df = pandas.read_csv('JCMA.csv')

# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)

# Preview the first rows; missing entries show up as NaN at this stage.
df.head()

(2156, 22)


Unnamed: 0,EMPLID,CL1,CL2,CL3,CL4,CL5,CL6,CL7,CL8,CL9,...,CL12,CL13,CL14,CL15,CL16,CL17,CL18,CL19,CL20,Unnamed: 21
0,1000002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0
1,1000103,,1.0,,,,,,,,...,0.0,1.0,1.0,,,,,,,3
2,1000138,0.0,,,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,0
3,1000141,0.0,,,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,0
4,1000249,0.0,,,,,1.0,0.0,,,...,0.0,,,0.0,,,,,,1


In [3]:
# Remove the unnamed trailing column, treat missing entries as 0 and store
# every column as an integer.
# The steps are chained and `df` is rebound (no inplace mutation), and
# errors='ignore' lets the cell be re-run after the column is already gone
# instead of raising KeyError.
df = (
    df.drop(columns=['Unnamed: 21'], errors='ignore')
      .fillna(0)
      .astype('int64')
)

df.head()

Unnamed: 0,EMPLID,CL1,CL2,CL3,CL4,CL5,CL6,CL7,CL8,CL9,...,CL11,CL12,CL13,CL14,CL15,CL16,CL17,CL18,CL19,CL20
0,1000002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000103,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
2,1000138,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000141,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000249,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Create all possible combinations of n classes

In [5]:
# Number of classes per combination (size of each class tuple analysed below).
n = 2

In [6]:
# Every column after the first one (EMPLID) is treated as a class column.
classes = df.columns[1:]

# Lazy iterator over all n-sized class tuples; it is consumed by list() below.
comb = combinations(classes, n) 

# Materialise the tuples so they can be counted and iterated more than once.
comb_list = list(comb)

print('there are {} possible {}-class combinations'.format(len(comb_list), n))

there are 190 possible 2-class combinations


### Prepare data for processing

###### Fix the EMPLID column name, which contains a trailing space for no obvious reason.

In [7]:
# Inspect the column labels before renaming.
df.columns

Index(['EMPLID', 'CL1', 'CL2', 'CL3', 'CL4', 'CL5', 'CL6', 'CL7', 'CL8', 'CL9',
       'CL10', 'CL11', 'CL12', 'CL13', 'CL14', 'CL15', 'CL16', 'CL17', 'CL18',
       'CL19', 'CL20'],
      dtype='object')

In [8]:
# Strip the trailing space from the 'EMPLID ' label; labels not present in the
# frame are ignored by rename, so re-running this cell is harmless.
df.rename(columns={'EMPLID ':'EMPLID'},inplace=True)

## Engine

In [9]:
def combination_counter(df,l1,l2,combos=None):
    """
    Count, for every class combination, the students flagged in all of its classes.

    For each combination a sub-frame ``['EMPLID'] + list(combo)`` is built, and a
    row is kept when every column in positions ``l1..l2`` (inclusive) of that
    sub-frame is non-zero. Combinations with no such row are skipped.

    df: a dataframe with an 'EMPLID' column and one indicator column per class.
    l1: position, within the sub-frame, of the first class column to test
        (position 0 is EMPLID, so the first class column is position 1).
    l2: position, within the sub-frame, of the last class column to test (inclusive).
    combos: optional iterable of class-name tuples to evaluate. When omitted, the
        notebook-level ``comb_list`` is used, which keeps existing calls working.

    Returns a tuple ``(combo_dict, students, data)``:
        combo_dict: {combo: [class names..., count]}
        students:   {combo: sub-frame (EMPLID + combo columns) of the matching rows}
        data:       list of counts, in the order the combinations were visited
    """
    if combos is None:
        # fall back to the combinations built earlier in the notebook
        combos = comb_list

    # per-combination counts, matching rows and summary records
    data = []
    students = {}
    combo_dict = {}

    # iterate through all class combinations
    for combo in combos:
        # pull EMPLID plus the indicator columns of this combination
        df_tuple = df[['EMPLID'] + list(combo)]

        # rows where every tested class column is non-zero; computed once and reused
        in_all = df_tuple.iloc[:,l1:l2+1].all(axis=1)

        # count the matching rows directly rather than summing one indicator column,
        # so the count does not depend on which column happens to come first
        common_stdts_cnt = int(in_all.sum())

        # ignore combinations without students in common
        if common_stdts_cnt == 0:
            continue

        # store the count, the matching rows (with EMPLID) and the summary record
        data.append(common_stdts_cnt)
        students[combo] = df_tuple[in_all]
        combo_dict[combo] = list(combo) + [common_stdts_cnt]

    return (combo_dict,students,data)

###### Implement Engine and obtain results

In [10]:
# Time the engine with perf_counter: it is monotonic and high-resolution, whereas
# time.time() follows the wall clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
# positions 1..n of each sub-frame are the n class columns (position 0 is EMPLID)
combo_dict, students, data = combination_counter(df,1,n)
end_time = time.perf_counter()

print('run took {:0.3f} seconds'.format((end_time-start_time)))

run took 0.251 seconds


In [11]:
# number of class combinations with at least one student failing every class in them
print('there are {} class-tuples that have at least one student failing all n classes'.format(len(data)))

# size of the largest such group; the counts are failures, not enrolments, and
# default=0 keeps the cell from raising when no combination qualified
print('the maximum number of students failing all classes in a class set is {}'.format(max(data, default=0)))

there are 102 class-tuples that have at least one student failing all n classes
the maximum number of students enrolled in a class set is 61


In [12]:
# One row per class combination: the class names followed by the failure count.
# The column labels are passed to from_dict rather than assigned afterwards, so the
# cell still works (yielding an empty, correctly labelled frame) when combo_dict
# is empty.
combo_data = pandas.DataFrame.from_dict(
    combo_dict,
    orient='index',
    columns=['class'+str(i+1) for i in range(n)] + ['failed'],
)

# rank combinations by number of failures; rebinding avoids inplace mutation
combo_data = combo_data.sort_values(by='failed',ascending=False)

combo_data.head()

Unnamed: 0,class1,class2,failed
"(CL10, CL11)",CL10,CL11,61
"(CL12, CL19)",CL12,CL19,49
"(CL2, CL13)",CL2,CL13,47
"(CL13, CL16)",CL13,CL16,26
"(CL2, CL16)",CL2,CL16,25


###### Validation:
Look at original dataset and manually count events

In [13]:
# val_col = [combo_data.index[0][i] for i in range(len(combo_data.index[0]))]

# df[val_col].sort_values(by=val_col,ascending=False).head(combo_data['failed'][0] + 1)

###### Students by EMPLID and Failed Group of Classes

In [14]:
def make_students(students_by_combo=None):
    """
    Build a long table of (EMPLID, failed class combination) pairs.

    students_by_combo: optional mapping {combo: dataframe with an 'EMPLID' column}.
        When omitted, the notebook-level ``students`` dict is used, which keeps the
        original zero-argument call working.

    Returns a dataframe with columns ['EMPLID', 'failed'], where 'failed' holds the
    string form of the combination key. Row labels of the input frames are kept.
    An empty mapping yields an empty frame with those two columns.
    """
    if students_by_combo is None:
        # fall back to the per-combination frames produced earlier in the notebook
        students_by_combo = students

    # one small frame per combination; assign() returns a new frame, so the
    # caller's frames are never modified
    frames = [
        matched[['EMPLID']].assign(failed=str(combo))
        for combo, matched in students_by_combo.items()
    ]

    if not frames:
        return pandas.DataFrame(columns=['EMPLID','failed'])

    # concatenate once instead of growing the result inside the loop (which is
    # quadratic and degrades the EMPLID dtype through the empty seed frame)
    return pandas.concat(frames)

In [15]:
# Long table of students and the class combination each of them failed.
fs = make_students()

# Display the table (pandas truncates the middle rows of long frames).
fs

Unnamed: 0,EMPLID,failed
748,1028608,"('CL1', 'CL2')"
1814,984701,"('CL1', 'CL2')"
2109,998315,"('CL1', 'CL2')"
129,1005783,"('CL1', 'CL3')"
216,1007641,"('CL1', 'CL3')"
...,...,...
1015,753544,"('CL18', 'CL19')"
1425,947918,"('CL18', 'CL19')"
1710,978340,"('CL18', 'CL19')"
2086,997241,"('CL18', 'CL19')"
