In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")

#print(jeopardy[:5])
#print(jeopardy.columns)


In [2]:
jeopardy.columns = [c[1:] if c[0]==" " else c for c in jeopardy.columns]

In [3]:
jeopardy.dtypes

Show Number     int64
Air Date       object
Round          object
Category       object
Value          object
Question       object
Answer         object
dtype: object

In [4]:
import re

def normalize_text(s):
    s = s.lower()
    s = re.sub("[^A-Za-z0-9\s]", "", s)
    return s

def normalize_dollar_vals(col):
    val = re.sub("[^A-Za-z0-9\s]", "", col)
    try:
        val = int(val)
    except Exception:
        val = 0
    return val

In [5]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_dollar_vals)

In [6]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [7]:
def split_cols(row):
    split_question = row['clean_question'].split(" ")
    split_answer = row['clean_answer'].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for a in split_answer:
        if a in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy['answer_in_question'] = jeopardy.apply(split_cols, axis=1)

In [8]:
jeopardy['answer_in_question'].mean()

0.060493257069335872

# Mean of Answers in Questions
I wouldn't spend my time studying answers in questions, as this occurs only 6% of the time based on the mean.

In [9]:
jeopardy.sort_values('Air Date', inplace=True)
terms_used = set()
question_overlap = []
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    split_question = [s for s in split_question if len(s) > 5]
    match_count=0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap        

In [10]:
jeopardy['question_overlap'].mean()

0.68762605921698017

# Question Overlap
Interestingly, there's close to 70% of overlap in words between questions, though this is not a complete data set. Also, only looking at words themselves have no meaning without context in sentences/phrases.

In [11]:
def hi_lo_val(row):
    value = 0
    if row['clean_value'] > 800:
        value=1
    return value

jeopardy['high_value'] = jeopardy.apply(hi_lo_val, axis=1)

In [12]:
def hi_lo_count(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if word in row['clean_question'].split(" "):
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    observed_expected.append(hi_lo_count(term))

In [13]:
observed_expected

[(0, 1), (6, 11), (1, 1), (1, 0), (0, 1)]

In [16]:
high_value_count = len([v for v in jeopardy['high_value'] if v == 1])
low_value_count = len([v for v in jeopardy['high_value'] if v == 0])

In [22]:
from scipy.stats import chisquare
import numpy as np
chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    exp_term_count_high = total_prop * high_value_count
    exp_term_count_low = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([exp_term_count_high, exp_term_count_low])
    chi_squared.append(chisquare(observed, expected))
    
chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.36458944708219171, pvalue=0.54596835640997887),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]