# Initialization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

# Data sanitizing

In [4]:
# Some column names have spaces in front, trim the spaces.
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [5]:
# Normalize 'Question' and 'Answer' columns
import re

def normalize_str(s):
    _s = s.lower()
    _s = re.sub("[^a-zA-Z0-9\s]", "", _s)
    return _s

jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_str)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_str)

In [6]:
# Normalize 'Value' column, turning into numerical for better manipulation
def normalize_val(s):
    _s = s.lower()
    _s = re.sub("[^a-zA-Z0-9\s]", "", _s)
    try:
        return int(_s)
    except Exception:
        return 0
    
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_val)

# Convert 'Air Date' column's type to datetime
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [7]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

## Answer terms in question

In [8]:
def count_matches(row):
    '''Counts how many words in the answer also occurs in the question
    '''
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy['answer_in_question'] = jeopardy.apply(count_matches, axis=1)
print(jeopardy['answer_in_question'].mean())

0.0590019652498


The average number of words that occurs in both the question and the answer is fairly low (about 6%), which means it will be difficult to deduce an answer from the question. Studying is therefore recommended.

## Question overlap

In [9]:
question_overlap = []
terms_used = set()

for _, row in jeopardy.iterrows():
    split_question = row['clean_question'].split()
    split_question = [w for w in split_question if len(w) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        else:
            terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())

0.692596005734


There is a high percentage (about 70%) of overlapping between terms used in old questions and terms in newer questions. However, since we're only looking with a small set of data and not looking at phrases, only term, this make the result less significant.

More of less, looking at old questions bank is still recommended.

# Chi-square

In [13]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [11]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for _, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

print(observed_expected)

[(0, 1), (1, 0), (0, 1), (0, 1), (1, 0)]


In [14]:
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]

## Findings

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.