# National Circuit Public Forum Gender Bias (2020-2021 Season)
*by Samarth Chitgopekar, github: http-samc*

## Start off by importing our dependencies: plotly (for graphs), json (to read our compiled data), and nltk (analyze gender)

In [10]:
import json
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import random
import nltk
from nltk.corpus import names

## Jupyter Setup

In [11]:
init_notebook_mode(connected=True)

## Read our data from `../data/2020-21 MASTER.json` into an in-memory dictionary

In [12]:
with open("../data/2020-21 MASTER.json", 'r') as f:
    data = json.loads(f.read())

## Train NLTK

In [13]:
MALE: str = "MALE"
FEMALE: str = "FEMALE"

def gender_features(word):
    return {'last_letter':word[-1]}

labeled_names = ([(name, MALE) for name in names.words('male.txt')]+
             [(name, FEMALE) for name in names.words('female.txt')])

random.shuffle(labeled_names)

featuresets = [(gender_features(n), gender)
               for (n, gender)in labeled_names]

train_set, test_set = featuresets[500:], featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(train_set)

f"Classification Accuracy with known dataset of {round(nltk.classify.accuracy(classifier, train_set)*100, 3)}%"

'Classification Accuracy with known dataset of 76.169%'

## Define a function to classify individuals as male or female

In [14]:
def classifyGender(name: str) -> str:
    """Uses trained dataset ( > 70% acc. ) to classify an input name
    as either male or female.

    Args:
        name (str): the first name to test

    Returns:
        str: either the constants MALE or FEMALE
    """

    return classifier.classify(gender_features(name))

## Create our independent data (gender) and dependent data (adj. avg speaker points)

In [15]:
independentDataList_M: list = []
dependentDataList_M: list = []

independentDataList_F: list = []
dependentDataList_F: list = []

for team in data.values():

    otrScore = team["otrScore"]

    speaker1 = None
    speaker2 = None

    speaker1Scores = []
    speaker2Scores = []

    for tournament in team["tournaments"]:
        for speaker in tournament["speaks"]:

            adjAvg = speaker["adjAVG"]
            name = speaker["name"]

            if not adjAvg or not name: continue

            name = name.split(' ')[0]

            if not speaker1:
                speaker1 = name

            if not speaker2:
                speaker2 = name

            if name == speaker1:
                speaker1Scores.append(adjAvg)
            else:
                speaker2Scores.append(adjAvg)

    if not len(speaker1Scores) or not len(speaker2Scores): continue

    speaker1Avg = sum(speaker1Scores)/len(speaker1Scores)
    speaker2Avg = sum(speaker2Scores)/len(speaker2Scores)

    for speaker, score in [[speaker1, speaker1Avg], [speaker2, speaker2Avg]]:
        if classifyGender(speaker) == MALE:
            independentDataList_M.append('Male')
            dependentDataList_M.append(score)

        else:
            independentDataList_F.append('Female')
            dependentDataList_F.append(score)

## Find our averages

In [16]:
scoresM = np.array(dependentDataList_M)
scoresF = np.array(dependentDataList_F)

m_1 = np.percentile(scoresM, 1)
f_1 = np.percentile(scoresF, 1)

m_25 = np.percentile(scoresM, 25)
f_25 = np.percentile(scoresF, 25)

m_50 = np.percentile(scoresM, 50)
f_50 = np.percentile(scoresF, 50)

m_75 = np.percentile(scoresM, 75)
f_75 = np.percentile(scoresF, 75)

m_99 = np.percentile(scoresM, 99)
f_99 = np.percentile(scoresF, 99)

delta = {}
for i in range(1, 100):
    delta[i] = np.percentile(scoresM, i) - np.percentile(scoresF, i)

delta = dict(sorted(delta.items(), key=lambda item: item[1]))

## Create our scatter plot with Plotly

In [17]:
layout = go.Layout(
    title = "Gender vs. Average Adjusted Speaker Points: 2020-21 VPF National Circuit",
    xaxis = {"title": "OTR Score"},
    yaxis = {"title": "Average Adjusted Speaker Points"},
)

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(
    x = independentDataList_M,
    y = dependentDataList_M,
    mode = 'markers',
    name = "Competing Male Team"
));

fig.add_trace(go.Scatter(
    x = independentDataList_F,
    y = dependentDataList_F,
    mode = 'markers',
    name = "Competing Female Team"
));

## Add in average percentile points

In [18]:
fig.add_trace(go.Scatter(
    x = ['Male'],
    y = [m_1],
    mode = 'markers',
    name = f"Male 1 pct. Adj. Speaker Points ({round(m_1, 3)})",
    marker=dict(
        color='LightSkyBlue',
        size=10,
        line=dict(
            color='MediumPurple',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Female'],
    y = [f_1],
    mode = 'markers',
    name = f"Female 1 pct. Adj. Speaker Points ({round(f_1, 3)})",
    marker=dict(
        color='Cyan',
        size=10,
        line=dict(
            color='Green',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Male'],
    y = [m_25],
    mode = 'markers',
    name = f"Male 25 pct. Adj. Speaker Points ({round(m_25, 3)})",
    marker=dict(
        color='LightSkyBlue',
        size=10,
        line=dict(
            color='MediumPurple',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Female'],
    y = [f_25],
    mode = 'markers',
    name = f"Female 25 pct. Adj. Speaker Points ({round(f_25, 3)})",
    marker=dict(
        color='Cyan',
        size=10,
        line=dict(
            color='Green',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Male'],
    y = [m_50],
    mode = 'markers',
    name = f"Male 50 pct. Adj. Speaker Points ({round(m_50, 3)})",
    marker=dict(
        color='LightSkyBlue',
        size=10,
        line=dict(
            color='MediumPurple',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Female'],
    y = [f_50],
    mode = 'markers',
    name = f"Female 50 pct. Adj. Speaker Points ({round(f_50, 3)})",
    marker=dict(
        color='Cyan',
        size=10,
        line=dict(
            color='Green',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Male'],
    y = [m_75],
    mode = 'markers',
    name = f"Male 75 pct. Adj. Speaker Points ({round(m_75, 3)})",
    marker=dict(
        color='LightSkyBlue',
        size=10,
        line=dict(
            color='MediumPurple',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Female'],
    y = [f_75],
    mode = 'markers',
    name = f"Female 75 pct. Adj. Speaker Points ({round(f_75, 3)})",
    marker=dict(
        color='Cyan',
        size=10,
        line=dict(
            color='Green',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Male'],
    y = [m_99],
    mode = 'markers',
    name = f"Male 95 pct. Adj. Speaker Points ({round(m_99, 3)})",
    marker=dict(
        color='LightSkyBlue',
        size=10,
        line=dict(
            color='MediumPurple',
            width=2
        )
    )
))

fig.add_trace(go.Scatter(
    x = ['Female'],
    y = [f_99],
    mode = 'markers',
    name = f"Female 99 pct. Adj. Speaker Points ({round(f_99, 3)})",
    marker=dict(
        color='Cyan',
        size=10,
        line=dict(
            color='Green',
            width=2
        )
    )
))