# MC Frequency Analysis
This notebook contains code to perform the "expected distribution" calculations as part of the Chi-Square stats test procedure.  This is Part 2 (See MC_FREQ_SQL.ipynb for Part 1).  

In this part, we read the data from csv files and then use it to simulate race results for $n=10,000$ seasons.  We are interested in counting how many times each school does the following:<p>
1  Finishes first in state meet <p>
2  Finishes 2nd in state meet <p>
3  ... <p>
20  Finishes 20th in state meet <p>
21  Makes it to Regionals, but not State (array index 20 in the simulation code) <p>
22  Competes at Districts, but does not advance to Regionals (array index 21 in the simulation code) <p>

We store the results of the simulation in 20 separate CSV files which can be used to do the actual calculation of expected results.  


In [1]:
import pandas as pd
import numpy as np
import math
import random

### Global Parameters
The cell below sets the important global parameters for div, year and gender.  The user can vary these over the following values:

- year: 2017-2021
- div: 1,2,3
- gender: 'B', 'G'

In [2]:
# Global parameters selecting which (division, gender, year) data set is simulated.
year = 2021
gender = 'G'
div = 3

# The input file name is built from the parameters above: D<div>_<gender>_<year>.csv
filename = 'D{0}_{1}_{2}.csv'.format(div,gender,year)
# index_col=0: the first CSV column is used as the row index rather than as data.
df = pd.read_csv(filename,index_col=0)
print(df)

     BLDG_IRN                          NAME  YEAR  DIVISION GENDER      TYPE  \
0       97923        East Dayton Christian   2021         3      G  District   
1      126144  Granville Christian Academy   2021         3      G  District   
2      134619   Emmanuel Christian Academy   2021         3      G  District   
3       13870     Fayette Jr/Sr High School  2021         3      G  District   
4       15172   Hardin Northern High School  2021         3      G  District   
..        ...                           ...   ...       ...    ...       ...   
219     52969              Elyria Catholic   2021         3      G  District   
220     53934                     Ursuline   2021         3      G  District   
221     52993              Gilmour Academy   2021         3      G  District   
222     52993              Gilmour Academy   2021         3      G  Regional   
223     52993              Gilmour Academy   2021         3      G     State   

     PLACE  POPULATION  
0       17    

In [3]:
# Create scoring sheet: one list of 22 zeroed counters per building ID.
# BLDG_IRN values that repeat in df simply map onto the same dictionary key.
# A comprehension is used so that no loop variable named `id` is left behind
# at notebook scope, where it would shadow the builtin id().
IDs = df.BLDG_IRN.to_list()
scores = {bldg_id: [0] * 22 for bldg_id in IDs}
    

In [4]:

def createDistricts (df):
    '''
    Randomly assign every school in df to a region and a district.

    df: data frame with a BLDG_IRN column identifying the building.  A building
        may appear on several rows; all of its rows receive the same assignment.

    Returns a copy of df (the input is not modified) with two added columns:
        REGION   - integer 0..3
        DISTRICT - integer 0..2, the district *within* that region
    Together they identify one of 12 (region, district) groups.  Buildings are
    dealt to the 12 groups round-robin in a random order, so group sizes differ
    by at most one.

    Uses numpy's global random state (np.random.permutation).
    '''
    # unique building ids, in order of first appearance
    bldgs = list(dict.fromkeys(df['BLDG_IRN'].tolist()))
    n = len(bldgs)

    # shuffle the buildings, then deal them round-robin into slots 0..11
    perms = np.random.permutation(n)
    slots = {bldgs[perms[i]]: i % 12 for i in range(n)}

    # decode slot -> (region, district within region) for every row at once
    df2 = df.copy()
    slot = df2['BLDG_IRN'].map(slots)
    df2['DISTRICT'] = slot % 3
    df2['REGION'] = slot // 3
    return df2

def createDict (df):
    '''
    Build a lookup of building id -> population from a dataframe.

    df: needs a BLDG_IRN column (the building id) and a POPULATION column.
    Returns a dictionary whose keys are the BLDG_IRN values and whose values
    are the matching POPULATION values.  When a BLDG_IRN occurs on more than
    one row, the last row's population is the one kept.
    '''
    return {
        bldg: population
        for bldg, population in zip(df['BLDG_IRN'], df['POPULATION'])
    }

In [5]:
# Draw one random region/district assignment for the schools; df2 is the frame
# handed to the simulation loop further down.
df2 = createDistricts(df)
print(df2)

142
     BLDG_IRN                          NAME  YEAR  DIVISION GENDER      TYPE  \
0       97923        East Dayton Christian   2021         3      G  District   
1      126144  Granville Christian Academy   2021         3      G  District   
2      134619   Emmanuel Christian Academy   2021         3      G  District   
3       13870     Fayette Jr/Sr High School  2021         3      G  District   
4       15172   Hardin Northern High School  2021         3      G  District   
..        ...                           ...   ...       ...    ...       ...   
219     52969              Elyria Catholic   2021         3      G  District   
220     53934                     Ursuline   2021         3      G  District   
221     52993              Gilmour Academy   2021         3      G  District   
222     52993              Gilmour Academy   2021         3      G  Regional   
223     52993              Gilmour Academy   2021         3      G     State   

     PLACE  POPULATION  DISTRICT  R

In [6]:
def permSelect (dct):
    '''
    Draw a random, weighted permutation of the keys of dct.

    dct: dictionary
        keys   = unique IDs
        values = non-negative integer weights for those IDs

    Returns a list containing every key exactly once.  The list is built by
    repeated weighted sampling without replacement: at each step one of the
    remaining keys is picked with probability weight / (sum of remaining
    weights), appended to the result and removed from the pool.  Heavier keys
    therefore tend to appear earlier, but the order is random.

    Edge cases:
        - an empty dictionary returns an empty list
        - keys with weight 0 can never win a weighted draw; once only such
          keys remain they are appended in uniformly random order

    Uses the global state of the `random` module, one randint call per
    weighted pick.
    '''
    # Plain lists instead of a DataFrame: this function is called many times
    # per simulated season and only needs positional access and removal.
    keys = list(dct.keys())
    weights = list(dct.values())

    p = []  # the permutation being built
    while keys:
        total = sum(weights)
        if total <= 0:
            # nothing left with positive weight: finish in uniform random order
            random.shuffle(keys)
            p.extend(keys)
            break

        # draw a ticket in 1..total and walk the cumulative weights to find
        # the key that owns it
        draw = random.randint(1, total)
        k = 0
        while draw > weights[k]:
            draw -= weights[k]
            k += 1

        p.append(keys.pop(k))
        weights.pop(k)

    return p

In [7]:
# This code will simulate one season of competition for the schools in df
def oneSeason (df,scores):
    '''
    Simulate one season (districts -> regionals -> state) and tally the outcome.

    df: dataframe with one or more rows per school and the columns
        BLDG_IRN   - the unique building id
        POPULATION - the number of students, used as the school's weight
        REGION     - region number 0..3
        DISTRICT   - district number 0..2 within the region

    scores: dictionary with BLDG_IRN as the key and a list of 22 integers as
        the value.  It is updated in place; every school in df gets exactly one
        counter incremented per call:
            spots 0 to 19 - finishing place 1 to 20 in the state meet
            spot 20       - reached regionals but did not advance to state
            spot 21       - eliminated at districts

    Each race is a weighted random permutation produced by permSelect; the
    first five finishers of a district race advance to regionals and the first
    five of a regional race advance to state.  Returns None.
    '''
    st_dct = {}
    for region in range(4):
        rg_dct = {}
        for district in range(3):
            # select teams from correct region and district
            # (read from the df argument, not from a notebook-level variable)
            dftemp = df.loc[(df['REGION']==region)&(df['DISTRICT']==district)]

            # simulate district race
            dct = createDict(dftemp)
            res = permSelect(dct)

            # top five go through to regionals; slicing also copes with a
            # district that has fewer than five teams
            for bldg in res[:5]:
                rg_dct[bldg] = dct[bldg]

            # the rest stop at districts
            for bldg in res[5:]:
                scores[bldg][21] += 1

        # now have regional race
        res = permSelect(rg_dct)

        # top five go to states
        for bldg in res[:5]:
            st_dct[bldg] = rg_dct[bldg]

        # the rest stop at regionals
        for bldg in res[5:]:
            scores[bldg][20] += 1

    # now simulate state meet: list position = finishing place - 1
    res = permSelect(st_dct)
    for place, bldg in enumerate(res):
        scores[bldg][place] += 1
        
            
            

In [8]:
# Main driver: simulate the requested number of seasons, accumulating every
# outcome into the shared `scores` sheet.

n = 10000   # number of simulated seasons (n=10,000)
for season in range(n):
    # progress message every 100 seasons
    if season % 100 == 0:
        print('iteration:',season)
    oneSeason(df2,scores)

iteration: 0


In [10]:
# Record the results in the corresponding .txt file of the same name.
# One line per building: ID, population, then the 22 outcome counters.

filename = 'D{0}_{1}_{2}.txt'.format(div,gender,year)
# `with` closes the file even if a lookup or write below raises
with open(filename,'w') as fp:
    for ID in scores:
        fp.write('{0:6d} '.format(ID))
        print('ID: ',ID)
        # the school's population is read from its 'District' row
        df3 = df[((df.BLDG_IRN == ID) & (df.TYPE == 'District'))]
        print(df3)
        # take the first matching row explicitly: int() on a whole Series is
        # deprecated in recent pandas and fails if there are several rows
        pop = int(df3.POPULATION.iloc[0])
        fp.write('{0:8d} '.format(pop))
        for i in range(22):
            fp.write('{0:7d} '.format(scores[ID][i]))
        fp.write('\n')


ID:  97923
   BLDG_IRN                    NAME  YEAR  DIVISION GENDER      TYPE  PLACE  \
0     97923  East Dayton Christian   2021         3      G  District     17   

   POPULATION  
0          79  
ID:  126144
   BLDG_IRN                          NAME  YEAR  DIVISION GENDER      TYPE  \
1    126144  Granville Christian Academy   2021         3      G  District   

   PLACE  POPULATION  
1      9          86  
ID:  134619
   BLDG_IRN                         NAME  YEAR  DIVISION GENDER      TYPE  \
2    134619  Emmanuel Christian Academy   2021         3      G  District   

   PLACE  POPULATION  
2     14         104  
ID:  13870
   BLDG_IRN                       NAME  YEAR  DIVISION GENDER      TYPE  \
3     13870  Fayette Jr/Sr High School  2021         3      G  District   

   PLACE  POPULATION  
3      5         109  
ID:  15172
   BLDG_IRN                         NAME  YEAR  DIVISION GENDER      TYPE  \
4     15172  Hardin Northern High School  2021         3      G  District 