## Let's Win Jeopardy!

In [130]:
import pandas as pd
import numpy as np
jeopardy = pd.read_csv("jeopardy.csv")

In [131]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [132]:
print("Columns:\n", jeopardy.columns)

Columns:
 Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


**Fixing column names**

In [133]:
jeopardy.columns = jeopardy.columns.str.strip()

**Normalizing Question/Answer columns**

In [134]:
import string
import re
def normalize(strings):
    strings = strings.lower()
    strings = re.sub("[^A-Za-z0-9\s]", "", strings)
    return strings

In [135]:
#normalize Question column
jeopardy["clean_question"] = jeopardy.Question.apply(normalize)

In [136]:
#normalize Answer column
jeopardy["clean_answer"] = jeopardy.Answer.apply(normalize)

**Normalizing Value column**

In [137]:
def dollar(strings):
    strings = re.sub("[^A-Za-z0-9\s]", "", strings)
    try:
        strings = int(strings)
    except Exception:
        strings = 0
    return strings

In [138]:
jeopardy["clean_value"] = jeopardy.Value.apply(dollar)

**Normalizing Air Date column**

In [139]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

**How often new questions are repeats of older questions?**

In [140]:
def function(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)        

In [141]:
jeopardy["answer_in_question"] = jeopardy.apply(function, axis=1)
jeopardy["answer_in_question"].mean()

0.06049325706933587

The answer appears in the questions only about 6% of the time

**How often new questions are repeats of older ones?**

In [142]:
question_overlap = []
terms_used = set()
sorted_jeo = jeopardy.sort_values("Air Date")
for idx, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    #Filtering out words like "the" or "than" commonly used
    split_question = [x for x in split_question if len(x)>5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

In [143]:
jeopardy['question_overlap'] = question_overlap
jeopardy.question_overlap.mean()

0.6908737315671962

There has been a 69% of question overlaps by just looking at the questions from the shortened dataset.

**High value / Low value questions**

In [144]:
def value_high_low(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

In [145]:
jeopardy["high_value"] = jeopardy.apply(value_high_low, axis=1)

In [146]:
def count_high_low(word):
    high_count = 0
    low_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

**Using only 5 terms**

In [147]:
observed_expected = []
comparison_terms = list(terms_used)
comparison_terms = comparison_terms[:5]

In [148]:
for term in comparison_terms:
    observed_expected.append(count_high_low(term))

In [149]:
observed_expected

[(0, 1), (0, 1), (0, 1), (0, 1), (0, 2)]

**Chi-square**

In [150]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

In [151]:
from scipy.stats import chisquare
chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total/jeopardy.shape[0]
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count
    observed = np.array([obs[0],obs[1]])
    expected = np.array([exp_high, exp_low])
    chi_squared.append(chisquare(observed, expected))
chi_squared

  terms = (f_obs - f_exp)**2 / f_exp


[Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan)]