# Jeopardy Questions

In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
 Air Date      19999 non-null object
 Round         19999 non-null object
 Category      19999 non-null object
 Value         19999 non-null object
 Question      19999 non-null object
 Answer        19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [11]:
jeopardy.columns #need to fix the spacing for the columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [12]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [14]:
jeopardy["Category"].value_counts()

TELEVISION                          51
U.S. GEOGRAPHY                      50
LITERATURE                          45
AMERICAN HISTORY                    40
HISTORY                             40
BEFORE & AFTER                      40
AUTHORS                             39
WORD ORIGINS                        38
WORLD CAPITALS                      37
SPORTS                              36
BODIES OF WATER                     36
SCIENCE                             35
RHYME TIME                          35
SCIENCE & NATURE                    35
MAGAZINES                           35
WORLD GEOGRAPHY                     33
ANNUAL EVENTS                       32
HISTORIC NAMES                      32
WORLD HISTORY                       32
IN THE DICTIONARY                   31
BIRDS                               31
FICTIONAL CHARACTERS                31
POTPOURRI                           30
ISLANDS                             30
MEDICINE                            30
U.S. PRESIDENTS          

# Normalizing Text

* convert to lowercase
* remove all punctuation

In [33]:
import string
import re
#function to normalize string
def normalize(val):
    val = val.lower()
    val = re.sub('['+string.punctuation+']', '', val)
    return(val)

In [35]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)

jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

In [36]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


# Normalizing Columns

In [38]:
def normalize_values(value):
    value = re.sub('['+string.punctuation+']', '', value)
    try:
        value = int(value)
    except Exception:
        value = 0
    return value

In [40]:
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_values)

In [43]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [44]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 10 columns):
Show Number       19999 non-null int64
Air Date          19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null object
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.5+ MB


# Answers in Questions ?

In [59]:
test = jeopardy.loc[0]["clean_question"].split(" ")

In [60]:
test.remove("the")
test

['for',
 'last',
 '8',
 'years',
 'of',
 'his',
 'life',
 'galileo',
 'was',
 'under',
 'house',
 'arrest',
 'for',
 'espousing',
 'this',
 'mans',
 'theory']

In [61]:
def answerinquestion(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)


In [64]:
jeopardy["answer_in_question"] = jeopardy.apply(answerinquestion, axis = 1)

In [65]:
jeopardy["answer_in_question"].mean()

0.060352773854698942

## Answer terms in the question

The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

# Recycled Questions

In [70]:
question_overlap = []
terms_used = set()
#iterate over data frame rows as (index, series) pairs
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) >= 6]
    match_count = 0
    for p in split_question:
        if p in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

In [73]:
jeopardy["question_overlap"] = question_overlap

In [74]:
jeopardy["question_overlap"].mean()

0.69021171433935069

# Question overlap

There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

# Low Value vs High Value Questions

In [75]:
def determine_value(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value

In [78]:
jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [80]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 1), (0, 1), (1, 0), (0, 1), (1, 0)]

# Applying The Chi-Squared Test

In [88]:
high_value_count = sum(jeopardy["high_value"] == 1)
low_value_count = sum(jeopardy["high_value"] == 0)
chi_squared = []
print(high_value_count, low_value_count)

5734 14265


In [86]:
from scipy.stats import chisquare
import numpy as np

In [90]:
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]

# Chi-squared results
None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.