## Winning Jeopardy

In [1]:
import pandas as pd

jep = pd.read_csv("jeopardy.csv")
jep.head()


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
jep.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

#### Fixing column names

In [3]:
jep.rename(columns={' Air Date':'Air Date', ' Round':'Round', ' Category':'Category', 
                    ' Value':'Value',' Question':'Question', ' Answer':'Answer'}, 
           inplace=True)
jep.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

#### Question and answer: Converting to lowercase and removing punctuation

In [4]:
jep['clean_answer'] = jep['Answer'].str.lower()
jep['clean_question'] = jep['Question'].str.lower()

import string
def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    return s

jep['clean_answer'] = jep['clean_answer'].apply(remove_punctuation)
jep['clean_question'] = jep['clean_question'].apply(remove_punctuation)

#### Value: removing dollar sign and converting to int

In [5]:
def fix_val_col(v):
    v = ''.join([i for i in v if i not in frozenset(string.punctuation)])
    try:
        v = int(v)
    except Exception:
        v = 0
    return v

jep['clean_value'] = jep['Value'].apply(fix_val_col)

#### Air date: Converting to datetime type

In [6]:
jep['Air Date'] = pd.to_datetime(jep['Air Date'])
jep.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_answer              object
clean_question            object
clean_value                int64
dtype: object

In [7]:
def count_qa_matches(row):
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    match_count = float(0.0)
    if 'the' in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for sa in split_answer:
        if sa in split_question:
            match_count+=1
    return match_count/len(split_answer)

jep['answer_in_question'] = jep.apply(count_qa_matches, axis=1)

In [8]:
print(jep['answer_in_question'].mean())

0.0603527738547


## Are answers in past questions?
#### Answers were only found in the questions 6% of the time. Therefore, it would be wise to study material.

In [9]:
jep.sort_values("Air Date", ascending=True, inplace=True)
terms_used = set()
question_overlap = []

for index, row in jep.iterrows():
    match_count = 0
    split_question = row['clean_question'].split(" ")
    split_question = [word for word in split_question if len(word)>5]
    for word in split_question:
        if word in terms_used:
            match_count+=1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)

jep['question_overlap'] = question_overlap
print(jep['question_overlap'].mean())


0.687124288097


### Are past questions recycled?
#### 68.7% of words were recycled from past questions. This only looks at a small set of questions, and it doesn't look at phrases, making it relatively insignificant.  The recycling of questions needs further investigation.

In [10]:
def high_or_low_value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

jep['high_value'] = jep.apply(high_or_low_value, axis=1)

def find_word_value(word):
    low_count = 0
    high_count = 0
    
    for index, row in jep.iterrows():
        split_question = row['clean_question'].split(" ")
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count
        
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:5]

for term in comparison_terms:
    observed_expected.append(find_word_value(term))
    
print(observed_expected)

[(0, 1), (0, 2), (4, 1), (4, 10), (0, 1)]


In [22]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jep[jep["high_value"] == 1].shape[0]
low_value_count = jep[jep["high_value"] == 1].shape[0]
chi_squared = []

for lst in observed_expected:
    total_word_appearances = sum(lst)
    total_wa_proportion = total_word_appearances / jep.shape[0]
    exp_high_val = total_wa_proportion * high_value_count
    exp_low_val = total_wa_proportion * low_value_count
    observed = np.array([lst[0], lst[1]])
    expected = np.array([exp_high_val, exp_low_val])
    chi_squared.append(chisquare(observed, expected))

chi_squared
    

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=6.4413273442515369, pvalue=0.011149530589799395),
 Power_divergenceResult(statistic=6.8463415656923036e-05, pvalue=0.993398169235444),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]

Only one P-value was significant, suggesting that most words are not more likely to appear in high value questions. Frequencies of less than five on four of the chi-square values makes the validity questionable.

Further tests are warranted on words with higher frequencies