# Winning Jeopardy

The dataset is named jeopardy.csv, and contains 20000 rows from the beginning of a full dataset of Jeopardy questions, which you can download [here](https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file).

Columns:
* **Show Number** -- the Jeopardy episode number of the show this question was in.
* **Air Date** -- the date the episode aired.
* **Round** -- the round of Jeopardy that the question was asked in. Jeopardy has several rounds as each episode progresses.
* **Category** -- the category of the question.
* **Value** -- the number of dollars answering the question correctly is worth.
* **Question** -- the text of the question.
* **Answer** -- the text of the answer.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
# remove whitespace before column names

print (jeopardy.columns.values)

import re
regex = r'^\s+'

jeopardy.columns = [re.sub(regex,'',each) for each in jeopardy.columns.values]

print(jeopardy.columns.values)

['Show Number' ' Air Date' ' Round' ' Category' ' Value' ' Question'
 ' Answer']
['Show Number' 'Air Date' 'Round' 'Category' 'Value' 'Question' 'Answer']


In [3]:
def normalize_string(string):
    """converts string to lowercase and strips all punctuation"""
    string = string.lower()
    return re.sub(r'[^a-z0-9\s]','',string)

test_string = "Hello, my name is John.\nDon't get it wrong."
print("Original: {}\n\nNormalized: {}".format(test_string,normalize_string(test_string)))

Original: Hello, my name is John.
Don't get it wrong.

Normalized: hello my name is john
dont get it wrong


In [4]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_string)
jeopardy['clean_question'].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [5]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_string)
jeopardy['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [6]:
def normalize_value(string):
    """
    Removes all punctuation and converts string to integer
    Returns 0 if error occurred
    """
    string = string.lower()
    string = re.sub('[^a-z0-9\s]','',string)
    try:
        return int(string)
    except Exception:
        return 0
    
print(normalize_value('"8.'))
print(normalize_value('"8x.'))

8
0


In [7]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)
jeopardy['clean_value'].head(10)

0    200
1    200
2    200
3    200
4    200
5    200
6    400
7    400
8    400
9    400
Name: clean_value, dtype: int64

In [8]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy["Air Date"])
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400


In [9]:
def count_matches(row):
    """
    Count how many times an Returns the """
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    match_count = sum([1 if each in split_question else 0 for each in split_answer])
    return match_count / len(split_answer)

In [10]:
jeopardy['answer_in_question'] = jeopardy.apply(count_matches,axis=1)
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200,0.0
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,0.0
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200,0.0
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200,0.0
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200,0.0
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200,0.0
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400,0.0
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400,0.0
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400,0.0
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400,0.333333


In [11]:
print(jeopardy['answer_in_question'].mean())

0.06049325706933587


# Answer terms in the question
Only 6% of the time is the answer contained in the question, therefore we need to study as we can't rely on just figuring it out from the question

In [12]:
question_overlap = []
terms_used = set()

jeopardy.sort_values(by = 'Air Date',ascending='True',inplace=True)

for idx, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [each for each in split_question if len(each) >= 6]
    match_count = 0
    for each in split_question:
        if each in terms_used:
            match_count += 1
        terms_used.add(each)
    if len(split_question) > 1:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())

0.6894031359073245


# Question overlap

About 70% of terms in questions have been used before.  It does not mean that entire question phrases have been used before, but is worth investigating

In [13]:
def high_low_value(row):
    """returns 1 if row['clean_value'] > 800, otherwise returns 0"""
    if row['clean_value'] > 800:
        return 1
    else:
        return 0

jeopardy['high_value'] = jeopardy.apply(high_low_value,axis=1)

In [14]:
def high_low_counts(word):
    """loops through each row and returns the number of times the word appears in a clean_question of a high_value row and low_value row (returns tuple)"""
    high_count = 0
    low_count = 0
    for idx, row in jeopardy.iterrows():
        question_split = row['clean_question'].split(' ' )
        if word in question_split:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [15]:
observed_expected = []
comparison_terms = list(terms_used)[:5]

for each in comparison_terms:
    observed_expected.append(high_low_counts(each))
    
print(observed_expected)

[(1, 0), (0, 1), (13, 14), (2, 4), (0, 1)]


In [24]:
high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]

chi_squared = []

from scipy.stats import chisquare

for each_list in observed_expected:
    total = sum(each_list)
    total_prop = total/jeopardy.shape[0]
    exp_high_count = total_prop * high_value_count
    exp_low_count = total_prop * low_value_count
    obs = np.array([each_list[0],each_list[1]])
    exp = np.array([exp_high_count,exp_low_count])
    chi_sq, p_value = chisquare(obs,exp)
    chi_squared.append([chi_sq,p_value])

comparison_terms    


['angling', 'rectifier', 'becomes', 'lightning', 'seriousand']

In [25]:
chi_squared

[[2.487792117195675, 0.11473257634454047],
 [0.401962846126884, 0.5260772985705469],
 [5.0082068758125295, 0.02522742452655571],
 [0.06376233446880725, 0.8006453026878781],
 [0.401962846126884, 0.5260772985705469]]

In [23]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_sq, p_value = chisquare(obs,exp)
    chi_squared.append([chi_sq,p_value])
chi_squared

[[2.487792117195675, 0.11473257634454047],
 [0.401962846126884, 0.5260772985705469],
 [811.2215856469384, 1.9602319793164306e-178],
 [25.382574006812842, 4.70145960342027e-07],
 [0.401962846126884, 0.5260772985705469]]