# Conditional Class Performance

Basic Modules:

In [48]:
import pandas
import numpy
import time
import copy
from itertools import combinations

#### Import Data File:
Each class column is a binary indicator: 1 means the student failed the class, 0 means the student passed it.

In [49]:
# Load the raw pass/fail matrix. Header labels are stripped of surrounding
# whitespace immediately, so the 'EMPLID' lookup below works on a fresh kernel
# even if the file's header carries a stray trailing space.
df = pandas.read_csv('JCMA.csv').rename(columns=str.strip)

print(df.shape)

# Spot-check one student: column-wise totals over that student's row(s).
df[df['EMPLID']==987356].sum()

(2156, 22)


EMPLID         987356.0
CL1                 0.0
CL2                 0.0
CL3                 0.0
CL4                 0.0
CL5                 0.0
CL6                 0.0
CL7                 0.0
CL8                 0.0
CL9                 0.0
CL10                1.0
CL11                0.0
CL12                0.0
CL13                0.0
CL14                0.0
CL15                0.0
CL16                0.0
CL17                0.0
CL18                0.0
CL19                0.0
CL20                0.0
Unnamed: 21         1.0
dtype: float64

In [50]:
# Build the cleaned frame in one expression instead of mutating in place:
#   - errors='ignore' keeps the cell re-runnable once 'Unnamed: 21' is gone
#     (the in-place drop raised KeyError on a second run);
#   - missing values become 0;
#   - every column is cast to int64.
df = (
    df
    .drop(columns=['Unnamed: 21'], errors='ignore')
    .replace(numpy.nan, 0)
    .astype('int64')
)

# Spot-check the same student after cleaning.
df[df['EMPLID']==987356]

Unnamed: 0,EMPLID,CL1,CL2,CL3,CL4,CL5,CL6,CL7,CL8,CL9,...,CL11,CL12,CL13,CL14,CL15,CL16,CL17,CL18,CL19,CL20
1857,987356,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Create all possible combinations of n classes

In [51]:
# Number of classes in each combination. It is passed to combinations() when
# the class tuples are generated, handed to combination_counter as the last
# column position to test, and used to name the class columns of the results.
n = 1

In [52]:
# Everything after the first column (EMPLID) is treated as a class column.
classes = df.columns[1:]

# Lazily enumerate every n-sized group of class columns ...
comb = combinations(classes, n)

# ... and materialise it, since the engine iterates over it and its length is reported.
comb_list = [*comb]

print(
    'there are {} possible {}-class combinations'.format(len(comb_list), n)
)

there are 20 possible 1-class combinations


### Prepare data for processing

###### Fix the EMPLID column name, which contains a trailing space for no obvious reason.

In [53]:
# Display the column labels to check how the EMPLID column is spelled.
df.columns

Index(['EMPLID', 'CL1', 'CL2', 'CL3', 'CL4', 'CL5', 'CL6', 'CL7', 'CL8', 'CL9',
       'CL10', 'CL11', 'CL12', 'CL13', 'CL14', 'CL15', 'CL16', 'CL17', 'CL18',
       'CL19', 'CL20'],
      dtype='object')

In [54]:
# Map the trailing-space label 'EMPLID ' to 'EMPLID'. Labels are left
# untouched when that variant is not present.
df = df.rename(columns={'EMPLID ': 'EMPLID'})

## Engine

In [55]:
def combination_counter(df,l1,l2,combos=None):
    """
    Count, for every class combination, the students flagged in all of its classes.

    For each combination a sub-frame ``['EMPLID'] + list(combo)`` is built, and a
    row (student) is kept when every column at integer positions ``l1..l2``
    (inclusive) of that sub-frame is non-zero.

    df: a dataframe containing an 'EMPLID' column and one indicator column per class.
    l1: integer position, within the sub-frame, of the first class column to test
        (position 0 is EMPLID, so the notebook passes 1).
    l2: integer position, within the sub-frame, of the last class column to test
        (the notebook passes the combination size n).
    combos: optional iterable of tuples of class column names. When omitted, the
        notebook-level ``comb_list`` is used, which keeps existing calls working.

    Returns a tuple ``(combo_dict, students, data)``:
        combo_dict: {combo: [class name, ..., count]}
        students:   {combo: sub-frame (EMPLID + class columns) of the matching rows}
        data:       list of counts, in iteration order
    Combinations without any matching student are left out of all three.
    """
    # Fall back to the notebook-level list only when the caller gave none.
    if combos is None:
        combos = comb_list

    data = []
    students = {}
    combo_dict = {}

    # iterate through all requested class-combos
    for combo in combos:
        # pull the indicator columns for this class-combo, keeping EMPLID alongside
        df_tuple = df[['EMPLID'] + list(combo)]

        # True for each student flagged in every tested class (computed once)
        in_common = df_tuple.iloc[:,l1:l2+1].all(axis=1)

        # Count matching rows directly. Summing a class column, as before, only
        # equals the row count when every indicator is exactly 1.
        common_stdts_cnt = int(in_common.sum())

        # ignore class-combos without students in common
        if common_stdts_cnt == 0:
            continue

        # store students-in-common count
        data.append(common_stdts_cnt)

        # store students in common, with their EMPLID
        students[combo] = df_tuple[in_common]

        # store the class names followed by the count
        combo_dict[combo] = list(combo) + [common_stdts_cnt]

    return (combo_dict,students,data)

###### Implement Engine and obtain results

In [56]:
# Run the engine once over the full frame and time it. Position 1 is the first
# class column of each sub-frame and position n is the last.
start_time = time.time()
combo_dict, students, data = combination_counter(df, l1=1, l2=n)
end_time = time.time()

print('run took {:0.3f} seconds'.format(end_time - start_time))

run took 0.050 seconds


In [57]:
# `data` holds one count per class-tuple that has at least one student in common.
print('there are {} class-tuples that have at least one student failing all n classes'.format(len(data)))

# default=0 keeps this cell from raising ValueError when no class-tuple has a
# student in common (empty `data`).
print('the maximum number of students enrolled in a class set is {}'.format(max(data, default=0)))

there are 20 class-tuples that have at least one student failing all n classes
the maximum number of students enrolled in a class set is 237


In [58]:
# One row per class-combo: the class names followed by the student count.
combo_data = pandas.DataFrame.from_dict(combo_dict, orient='index')

# Label the columns class1..classN plus the count column.
combo_data.columns = ['class' + str(pos) for pos in range(1, n + 1)] + ['failed']

# Rank the combos from most to fewest students.
combo_data = combo_data.sort_values(by='failed', ascending=False)

combo_data.head()

Unnamed: 0,class1,failed
"(CL13,)",CL13,237
"(CL2,)",CL2,116
"(CL10,)",CL10,102
"(CL11,)",CL11,99
"(CL12,)",CL12,98


###### Validation:
Look at original dataset and manually count events

In [59]:
# Class column(s) of the top-ranked combo: the index label of the first row is
# the combo tuple itself.
val_col = list(combo_data.index[0])

# Show the flagged rows of the original data plus one extra row, so the
# boundary after the last flagged student is visible. `.iloc[0]` is explicitly
# positional; plain `[0]` on a non-integer index relies on a deprecated fallback.
df[val_col].sort_values(by=val_col,ascending=False).head(combo_data['failed'].iloc[0] + 1)

Unnamed: 0,CL13
837,1
1865,1
717,1
171,1
1676,1
...,...
2037,1
1,1
87,1
2074,1


###### Students by EMPLID and Failed Group of Classes

In [60]:
def make_students(students_by_combo=None):
    """
    Stack the per-combo student frames into one long (EMPLID, failed) table.

    students_by_combo: optional mapping {combo: dataframe with an 'EMPLID' column}.
        When omitted, the notebook-level ``students`` dict is used, which keeps
        the existing zero-argument call working.

    Returns a dataframe with columns ['EMPLID', 'failed'], where 'failed' holds
    ``str(combo)`` for the combo each row came from. Rows keep the index labels
    of their source frames. An empty mapping yields an empty frame with the
    same two columns.
    """
    # Fall back to the notebook-level dict only when the caller gave none.
    if students_by_combo is None:
        students_by_combo = students

    frames = []
    for combo, matched in students_by_combo.items():
        # Copy so that adding the label never touches the stored frame.
        labelled = matched[['EMPLID']].copy()
        labelled['failed'] = str(combo)
        frames.append(labelled)

    # concat() rejects an empty list, so build the empty result explicitly.
    if not frames:
        return pandas.DataFrame(columns=['EMPLID','failed'])

    # Concatenate once instead of growing a frame inside the loop.
    return pandas.concat(frames)

In [61]:
# Build the long (EMPLID, failed class group) table from the engine's
# per-combo student frames, then display it.
fs = make_students()

fs

Unnamed: 0,EMPLID,failed
37,1002408,"('CL1',)"
49,1002750,"('CL1',)"
56,1003129,"('CL1',)"
82,1003812,"('CL1',)"
127,1005739,"('CL1',)"
...,...,...
1310,919676,"('CL20',)"
1383,939920,"('CL20',)"
1384,939990,"('CL20',)"
1491,959973,"('CL20',)"


In [62]:
# List every failed class group recorded for the spot-check student.
fs.loc[fs['EMPLID'] == 987356]

Unnamed: 0,EMPLID,failed
1857,987356,"('CL10',)"
