# Illinois High School State Association Debate Public Forum Gender Bias (2019 & 2021)
*by Samarth Chitgopekar, github: http-samc*

## Start off by importing our dependencies: plotly (for graphs), json (to read our compiled data), and nltk (analyze gender)

In [1]:
import json
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import random
import nltk
from nltk.corpus import names

## Jupyter Setup

In [2]:
init_notebook_mode(connected=True)

## Train NLTK

In [3]:
MALE: str = "MALE"
FEMALE: str = "FEMALE"

def gender_features(word):
    return {'last_letter':word[-1]}

labeled_names = ([(name, MALE) for name in names.words('male.txt')]+
             [(name, FEMALE) for name in names.words('female.txt')])

random.shuffle(labeled_names)

featuresets = [(gender_features(n), gender)
               for (n, gender)in labeled_names]

train_set, test_set = featuresets[500:], featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(train_set)

f"Classification Accuracy with known dataset of {round(nltk.classify.accuracy(classifier, train_set)*100, 3)}%"

'Classification Accuracy with known dataset of 76.424%'

## Define a function to classify individuals as male or female

In [4]:
def classifyGender(name: str) -> str:
    """Uses trained dataset ( > 70% acc. ) to classify an input name
    as either male or female.

    Args:
        name (str): the first name to test

    Returns:
        str: either the constants MALE or FEMALE
    """

    return classifier.classify(gender_features(name))

## Read IHSA State Data (2019 & 2021)

In [5]:
with open("../data/IHSA 2019.json", "r") as f:
    data_2019 = json.loads(f.read())

with open("../data/IHSA 2021.json", "r") as f:
    data_2021 = json.loads(f.read())

## Defining constants for team makeup

In [6]:
MALE = "mm"
FEMALE = "ww"
MIXED = "mw"

MALES = []
FEMALES = []
MIXEDS = []

## Define function that uses our gender classifer and constants to determine the gender makeup of a team

In [7]:
def generateGenders(data: dict) -> dict:
    for team in data:
        name1 = data[team]["names"][0].split()[-1]
        name2 = data[team]["names"][1].split()[-1]

        # 0 -> female, 1 -> male
        gender1 = 1 if classifyGender(name1) == "MALE" else 0
        gender2 = 1 if classifyGender(name2) == "MALE" else 0

        if gender1 == gender2 and gender1 == 1:
            genders = MALE
        elif gender1 == gender2 and gender2 == 0:
            genders = FEMALE
        elif gender1 != gender2:
            genders = MIXED

        data[team]["genders"] = genders

    return data

## Calculates win pct for each team that competed and adds to a specific total based on the gender makeup of the team.

In [8]:
def analyzeTournaments(data: dict) -> None:
    for team in data:
        genders = data[team]["genders"]

        results = []

        for round_ in data[team]["rounds"]:

            opp = round_["opp"]

            res = round_["win"]
            if res == True:
                res = 1
            elif res == False:
                res = 0
            else:
                res = 0.5

            results.append(res)

        wpm = sum(results)/len(results)

        if genders == MALE:
            MALES.append(wpm)
        elif genders == FEMALE: FEMALES.append(wpm)
        else: MIXEDS.append(wpm)


## Calculate means and WPM at various percentiles

In [9]:
analyzeTournaments(generateGenders(data_2019))
analyzeTournaments(generateGenders(data_2021))

MALE_WPM = round(100*sum(MALES)/len(MALES), 4)
FEMALE_WPM = round(100*sum(FEMALES)/len(FEMALES), 3)
MIXED_WPM = round(100*sum(MIXEDS)/len(MIXEDS), 4)

scoresMALE = np.array(MALES)
scoresFEMALE = np.array(FEMALES)
scoresMIXED = np.array(MIXEDS)

m_5 = 100*round(np.percentile(scoresMALE, 5), 3)
f_5 = 100*round(np.percentile(scoresFEMALE, 5), 3)
mi_5 = 100*round(np.percentile(scoresMIXED, 5), 3)

m_25 = 100*round(np.percentile(scoresMALE, 25), 3)
f_25 = 100*round(np.percentile(scoresFEMALE, 25), 3)
mi_25 = 100*round(np.percentile(scoresMIXED, 25), 3)

m_75 = 100*round(np.percentile(scoresMALE, 75), 3)
f_75 = 100*round(np.percentile(scoresFEMALE, 75), 3)
mi_75 = 100*round(np.percentile(scoresMIXED, 75), 3)

m_95 = 100*round(np.percentile(scoresMALE, 95), 3)
f_95 = 100*round(np.percentile(scoresFEMALE, 95), 3)
mi_95 = 100*round(np.percentile(scoresMIXED, 95), 3)

## Creating statistics string

In [10]:
stats = f"""
Dataset Statistics:<br>
---<br>
Total Number of Teams: {len(list(data_2019.keys())) + len(list(data_2021.keys()))}<br>
Total Number of Rounds: {len(MALES) + len(FEMALES) + len(MIXEDS)}<br>
Margin of Error: +/- 2%<br>
<br>
Win Percentages by Gender Group:<br>
---<br>
All-Male WPM: {MALE_WPM}%<br>
All-Female WPM: {FEMALE_WPM}%<br>
Mixed WPM: {MIXED_WPM}%<br>
<br>
Pct. WPM by Gender (5, 25, 75, 95):<br>
---<br>
Male: {m_5}%, {m_25}%, {m_75}%, {m_95}%<br>
Female: {f_5}%, {f_25}%, {f_75}%, {f_95}%<br>
Mixed: {mi_5}%, {mi_25}%, {mi_75}%, {mi_95}%<br>
"""

## Plotting graph with Plotly and printing stats

In [11]:
layout = go.Layout(
    title = "Gender vs. Mean Win Percentage: IHSA State (2019 & 2021)",
    xaxis = {"title": "Gender"},
    yaxis = {"title": "Mean Win Percentage"},
    annotations=[
            go.layout.Annotation(
                text=stats,
                align='left',
                showarrow=False,
                xref='paper',
                yref='paper',
                x=1.259,
                y=-.2725,
                bordercolor='black',
                borderwidth=1
            )
        ]
)

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(
    x = ["Male-Male Team"] * len(MALES),
    y = MALES,
    mode = 'markers',
    name = "Competing All-Male Team"
));

fig.add_trace(go.Scatter(
    x = ["Female-Female Team"] * len(FEMALES),
    y = FEMALES,
    mode = 'markers',
    name = "Competing All-Female Team"
));

fig.add_trace(go.Scatter(
    x = ["Male-Female Team"] * len(MIXEDS),
    y = MIXEDS,
    mode = 'markers',
    name = "Competing Male-Female Team"
));


iplot(fig)