In [2]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [5]:
import re

# This function will take in a string and return the lower case version of that string
def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text


In [6]:
# Convert the question and answer fields to lower case and assign to new fields
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)


In [7]:
# This function takes in string and if it's a number converts it to an int, otherwise it returns 0
def normalize_values(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text


In [8]:
# Convert value field to int and assign to clean_value
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [9]:
# Convert Air Date field to date/time
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [10]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [11]:
# This function takes in a row and checks to see if parts of the answer are in the question
def check_ans_in_question (row):
    # Split the clean answer and question fields on the space character
    split_answer =row["clean_answer"].split(" ")
    split_question =row["clean_question"].split(" ")
    # Remove the word "the" from split answer
    if "the" in split_answer:
        split_answer.remove("the")
    # Return 0 if split answer is empty    
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for ans in split_answer:
        if ans in split_question:
            match_count += 1
    return match_count/len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(check_ans_in_question, axis=1)

 

In [12]:
jeopardy["answer_in_question"].mean()

0.060493257069335872

Questions in Answer:
In only 6% of the questions, the answer appears in the question. This percentage means that using the terms in the questions to derive the answers is a flawed strategy.  

In [13]:
jeopardy.sort("Air Date")

  if __name__ == '__main__':


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.000000
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.000000
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.000000
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.000000
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.000000
19305,10,1984-09-21,Double Jeopardy!,HOMONYMS,$200,Hindu hierarchy or a play's actors,a caste (cast),hindu hierarchy or a plays actors,a caste cast,200,0.333333
19306,10,1984-09-21,Double Jeopardy!,TV TRIVIA,$200,"Last season, this series mourned the loss of S...",Hill Street Blues,last season this series mourned the loss of sg...,hill street blues,200,0.000000
19307,10,1984-09-21,Double Jeopardy!,1789,$400,Why April 28th was a bad day for Capt. Bligh,the day of the mutiny on the Bounty,why april 28th was a bad day for capt bligh,the day of the mutiny on the bounty,400,0.142857
19308,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$400,Seaside resort that has a monopoly on East Coa...,"Atlantic City, New Jersey",seaside resort that has a monopoly on east coa...,atlantic city new jersey,400,0.000000
19309,10,1984-09-21,Double Jeopardy!,LITERATURE,$400,"He wrote ""The 3 Musketeers""; his son wrote ""Ca...",(Alexandre) Dumas,he wrote the 3 musketeers his son wrote camille,alexandre dumas,400,0.000000


In [30]:
question_overlap = [] 
terms_used= set()
for i, row in jeopardy.iterrows():
    # Split the question on the space character
    split_question = row["clean_question"].split(" ")
    # Remove any words that are less than 6 characters long
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    # Loop thru each word, if it is in terms_used, add 1 to match_count
    for word in split_question:
        if word in terms_used:
            match_count += 1
    # Loop thru and add each word to terms_used set        
    for word in split_question:
        terms_used.add(word)   
    # If length of split_question is greater than 0, divide match_count by length        
    if len(split_question) > 0:
        match_count /= len(split_question)
    # Add # of matches to question overlap
    question_overlap.append(match_count)
# Append question overlap to jeopardy field question overlap
jeopardy["question_overlap"] = question_overlap
# Get the mean
jeopardy["question_overlap"].mean()    
        

0.69087373156719623

Question Overlap: 
There is a 69% overlap in words within questions. Since this looks at words, and not phrases or even entire question, this data is suspect. 

In [28]:
# This function takes in a row and determines if it is a high value ( > 800) question or a low value question (<=800)
def determine_value (row):
    if row["clean_value"] > 800:
        value = 1 # High value
    elif row["clean_value"] <= 800:
        value = 0 # Low value
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis = 1) 

In [29]:
# This function takes in a word and counts how many times it is on a high value question and how many times it is on a low value question.
def count_value (word):
    low_count = 0
    high_count = 0
    # Loop thru each row in jeopardy
    for i, row in jeopardy.iterrows():
        # if word is in clean question, increment high or low count based on what the value is
        if word in  row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else :
                low_count +=1               
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    observed_expected.append(count_value(term))
    
observed_expected

[(0, 1), (1, 0), (2, 0), (2, 14), (2, 3)]

In [45]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=4.9755842343913503, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=2.0459928943532475, pvalue=0.15260738863448364),
 Power_divergenceResult(statistic=0.31376681678493112, pvalue=0.57537786229446908)]

There is not much difference between the observed and expected values, the frequency is lower than 5. So we should not assume we will increase our chances of winning by studyng only high or low value questions.  