# Jeopardy - Statistical Significance

We are looking for any edge that could help us win a game of Jeopardy. We will use a dataset containing 20,000 rows of Jeopardy questions to look for patterns in the questions that could help us win.

In [24]:
import pandas as pd

# Load the Jeopardy questions dataset from "jeopardy.csv" in the working directory.
jeopardy = pd.read_csv("jeopardy.csv")

In [25]:
# Preview the first five rows to see what each column holds.
print(jeopardy.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [26]:
# List the column names to check for formatting problems such as stray whitespace.
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [27]:
# Strip the stray leading/trailing whitespace from every column name. Deriving
# the clean names from the existing ones (instead of hard-coding the list)
# keeps this correct even if the CSV's column order or count changes.
jeopardy.columns = jeopardy.columns.str.strip()

In [28]:
# Normalize the text: drop every character that is not a letter, digit, or
# whitespace, then lowercase. regex=True is passed explicitly because pandas
# 2.0+ treats the pattern as a literal string by default (which would strip
# nothing), and the raw string avoids Python's invalid-escape warning for "\d".
jeopardy["clean_question"] = jeopardy["Question"].str.replace(r"[^A-Za-z\d\s]", "", regex=True).str.lower()
jeopardy["clean_answer"] = jeopardy["Answer"].str.replace(r"[^A-Za-z\d\s]", "", regex=True).str.lower()

In [29]:
# Remove the dollar sign and any other punctuation so the value parses as a
# number; entries that are still non-numeric are coerced to NaN. regex=True is
# explicit because pandas 2.0+ would otherwise treat the pattern as a literal
# string, and the raw string avoids the invalid "\d" escape warning.
jeopardy["clean_value"] = pd.to_numeric(jeopardy["Value"].str.replace(r"[^A-Za-z\d\s]", "", regex=True), errors="coerce")

In [30]:
# Parse the air dates into datetimes so rows can later be sorted chronologically.
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [31]:
def answer_in_question(row):
    """Return the fraction of the answer's words that also occur in the question.

    Args:
        row: A mapping-like row (for example a DataFrame row) holding string
            values under the "clean_answer" and "clean_question" keys.

    Returns:
        0 if the answer has no words left after dropping one "the";
        otherwise the number of answer words found in the question divided
        by the number of answer words (a float between 0 and 1).
    """
    # A bare split() ignores empty strings and collapses consecutive whitespace.
    split_answer = row["clean_answer"].split()
    # Only presence in the question matters, so a set gives O(1) lookups.
    question_words = set(row["clean_question"].split())

    # "the" is so common that it would inflate the score; list.remove drops
    # only its first occurrence, matching the original behavior.
    if "the" in split_answer:
        split_answer.remove("the")

    # Guard against division by zero for empty (or "the"-only) answers.
    if not split_answer:
        return 0

    match_count = sum(word in question_words for word in split_answer)
    return match_count / len(split_answer)

In [32]:
# Score every row, then report the average fraction of answer words that
# also appear in the corresponding question.
jeopardy["answer_in_question"] = jeopardy.apply(answer_in_question, axis=1)
print(jeopardy["answer_in_question"].mean())

0.05900196524977763


This isn't a reliable strategy: on average, only about 6% of the words in an answer also appear in its question, so we can't count on hearing the answer in the question.

In [33]:
# Visit questions oldest-first so that "already used" means "aired earlier".
jeopardy.sort_values(by=["Air Date"], inplace=True)

terms_used = set()
question_overlap = []

for question in jeopardy["clean_question"]:
    # Only words of six or more characters are considered meaningful terms.
    long_words = [token for token in question.split(" ") if len(token) > 5]
    # Count against terms from earlier questions only, then record this
    # question's own terms for the rows that follow.
    seen_before = sum(1 for token in long_words if token in terms_used)
    terms_used.update(long_words)
    # Questions without any long word keep an overlap of 0.
    question_overlap.append(seen_before / len(long_words) if long_words else seen_before)

jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())

0.6876260592169802


Based on this dataset, on average about 69% of the longer words (six or more characters) in each question had already appeared in an earlier Jeopardy question. However, sharing words does not mean the questions themselves are the same. This information may help narrow down the "topics" that come up frequently in Jeopardy, but not the specific questions.

In [48]:
def highorlow(row):
    """Flag a question as high value.

    Returns 1 when the row's "clean_value" is strictly greater than 800 and
    0 otherwise (including when the comparison is false because the value
    is NaN).
    """
    return 1 if row["clean_value"] > 800 else 0

# Label every row: 1 for questions worth more than 800, 0 for the rest.
jeopardy["high_value"] = jeopardy.apply(highorlow, axis=1)

In [56]:
def highlowcount(word):
    """Count the high- and low-value questions that contain *word*.

    Scans the module-level ``jeopardy`` DataFrame, splitting each
    "clean_question" on single spaces and checking for an exact token match.

    Returns:
        A ``(high_count, low_count)`` tuple: matching rows whose
        "high_value" is 1, and matching rows with any other "high_value".
    """
    high_count = low_count = 0
    for question, is_high in zip(jeopardy["clean_question"], jeopardy["high_value"]):
        # Skip questions that do not use the word at all.
        if word not in question.split(" "):
            continue
        if is_high == 1:
            high_count += 1
        else:
            low_count += 1
    return high_count, low_count

# Freeze the set of seen terms into a list and take its first 20 entries as
# the sample; set iteration order is arbitrary, so the chosen terms are too.
terms_used = list(terms_used)
comparison_terms = terms_used[:20]

# One (high_count, low_count) tuple per sampled term, in the same order.
observed_expected = [highlowcount(term) for term in comparison_terms]

for term, counts in zip(comparison_terms, observed_expected):
    print(term)
    print(counts)

smiles
(1, 0)
verizons
(1, 0)
renaming
(0, 1)
melodies
(0, 1)
dekalb
(0, 1)
gleason
(0, 1)
elaine
(0, 3)
carrots
(0, 6)
callaways
(0, 1)
astrologists
(0, 1)
budgerigar
(0, 1)
triatomic
(0, 1)
mesilla
(0, 1)
verdon
(1, 0)
swissstyle
(1, 0)
propelling
(0, 1)
prestige
(0, 1)
improper
(0, 1)
scrapped
(1, 0)
grounds
(3, 3)


In [75]:
from scipy.stats import chisquare

# Dataset-wide totals of high-value and low-value questions.
high_value_count = int((jeopardy["high_value"] == 1).sum())
low_value_count = int((jeopardy["high_value"] == 0).sum())

chi_squared = []

for high_obs, low_obs in observed_expected:
    # Share of all questions in the dataset that use the term.
    prop_total = (high_obs + low_obs) / len(jeopardy)

    # If the term were independent of question value, its uses would split
    # between high and low in the same proportion as the whole dataset.
    expected = [prop_total * high_value_count, prop_total * low_value_count]

    # Chi-squared test of the observed split against the expected split.
    chi_squared.append(chisquare([high_obs, low_obs], expected))

chi_squared
    

[Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=1.205888538380652, pvalue=0.27214791766902047),
 Power_divergenceResult(statistic=2.411777076761304, pvalue=0.120425590069509),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(st

None of the terms showed a statistically significant difference in usage between high-value and low-value rows (every p-value shown is above 0.05). Additionally, the expected frequencies were all lower than 5, so the chi-squared test isn't very reliable here. It would be better to run this test only on terms that have higher frequencies.