In [1]:
import pandas as pd
import csv

# Load the Jeopardy question data from the CSV next to the notebook.
jeopardy = pd.read_csv('jeopardy.csv')
# A bare name on the last line of a cell displays the frame.
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams
...,...,...,...,...,...,...,...
1995,3461,1999-09-27,Jeopardy!,MORTAL MATTERS,$400,This saint's remains were in a box atop a ward...,Saint Valentine
1996,3461,1999-09-27,Jeopardy!,BIRDS,$400,"In captivitiy, these wading birds are fed caro...",Flamingo
1997,3461,1999-09-27,Jeopardy!,AUTHORS' RHYME TIME,$400,Anne's bad habits,Rice's vices
1998,3461,1999-09-27,Jeopardy!,WARNER BROS.,$500,"He outlasted his brothers Sam, Albert & Harry ...",Jack Warner


In [2]:
# Inspect the raw column labels exactly as they were read from the file.
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
# Most headers carry a leading space (e.g. ' Air Date'); strip surrounding
# whitespace so columns can be addressed by their plain names.
jeopardy.columns = jeopardy.columns.str.strip()
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [4]:
import re

def normalize_text(text):
    """Return a lowercase, punctuation-free version of ``text``.

    Every character that is not an ASCII letter, digit or whitespace is
    removed (not replaced by a space, so "ninety-nine" becomes
    "ninetynine"), and each run of whitespace is collapsed to one space.

    Parameters
    ----------
    text : str
        Raw question or answer text.

    Returns
    -------
    str
        The normalized text.
    """
    text = text.lower()
    # Raw strings: "\s" in a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def normalize_values(text):
    """Convert a dollar-value entry such as ``"$2,000"`` to an integer.

    Punctuation is stripped from string input before conversion. Anything
    that cannot be interpreted as an integer (for example ``"None"``, an
    empty string, ``None`` or NaN) yields 0.

    Parameters
    ----------
    text : str or number or None
        Raw entry from the ``Value`` column.

    Returns
    -------
    int
        The parsed value, or 0 when the entry is not numeric.
    """
    if isinstance(text, str):
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    try:
        return int(text)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; other errors still surface.
        return 0

In [5]:
# Add normalized copies of the text and value columns next to the raw ones.
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_values)

In [6]:
# Display the frame to check the newly added clean_* columns.
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
...,...,...,...,...,...,...,...,...,...,...
1995,3461,1999-09-27,Jeopardy!,MORTAL MATTERS,$400,This saint's remains were in a box atop a ward...,Saint Valentine,this saints remains were in a box atop a wardr...,saint valentine,400
1996,3461,1999-09-27,Jeopardy!,BIRDS,$400,"In captivitiy, these wading birds are fed caro...",Flamingo,in captivitiy these wading birds are fed carot...,flamingo,400
1997,3461,1999-09-27,Jeopardy!,AUTHORS' RHYME TIME,$400,Anne's bad habits,Rice's vices,annes bad habits,rices vices,400
1998,3461,1999-09-27,Jeopardy!,WARNER BROS.,$500,"He outlasted his brothers Sam, Albert & Harry ...",Jack Warner,he outlasted his brothers sam albert harry in ...,jack warner,500


In [7]:
# Parse the 'Air Date' strings into pandas datetime values.
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [9]:
# Check the column dtypes after the date conversion.
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [10]:
def count_matches(row):
    """Return the fraction of answer words that also occur in the question.

    Every occurrence of the article "the" is dropped from the answer before
    comparing, because it matches almost any question and carries no
    information.

    Parameters
    ----------
    row : mapping
        Must provide the ``'clean_answer'`` and ``'clean_question'`` strings.

    Returns
    -------
    float or int
        Matched answer words divided by the number of answer words, or 0
        when the answer has no words left after removing "the".
    """
    # list.remove() would drop only the first "the"; filter out all of them.
    answer_terms = [term for term in row['clean_answer'].split() if term != 'the']
    if not answer_terms:
        return 0
    # A set makes each membership check constant time.
    question_terms = set(row['clean_question'].split())
    matched = sum(term in question_terms for term in answer_terms)
    return matched / len(answer_terms)

# Score every row (axis = 1 passes one row at a time to count_matches).
jeopardy['answer_in_question'] = jeopardy.apply(count_matches, axis = 1)

In [11]:
# Average, over all rows, of the per-row answer_in_question score.
jeopardy.answer_in_question.mean()

0.05665595238095238

## Answer terms in the question

On average, only about 6% of the words in an answer also appear in its question. This isn't a huge number, and it means that we probably can't just hope that hearing a question will enable us to figure out the answer.

In [12]:
question_overlap = []
terms_used = set()

# Walk the questions in air-date order so "already used" means "seen in an
# earlier row of the sorted frame".
jeopardy = jeopardy.sort_values('Air Date')

for question in jeopardy['clean_question']:
    # Only words longer than five characters are treated as terms.
    long_words = [word for word in question.split(' ') if len(word) > 5]
    # Count repeats before registering this question's own words.
    repeated = sum(1 for word in long_words if word in terms_used)
    terms_used.update(long_words)
    if long_words:
        question_overlap.append(repeated / len(long_words))
    else:
        question_overlap.append(repeated)

jeopardy['question_overlap'] = question_overlap

jeopardy['question_overlap'].mean()

0.3855813172938168

## Question overlap

There is about 40% overlap between terms (words longer than five characters) in new questions and terms in older questions. This only looks at a small set of questions, and it looks at single terms rather than phrases. This makes the result relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

In [13]:
def determine_value(row, threshold=800):
    """Flag a row as high value.

    Parameters
    ----------
    row : mapping
        Must provide a numeric ``'clean_value'`` entry.
    threshold : int, optional
        A row is high value when its ``clean_value`` is strictly greater
        than this amount. Defaults to 800.

    Returns
    -------
    int
        1 for a high-value row, otherwise 0.
    """
    return int(row['clean_value'] > threshold)

# Flag every row (axis = 1 passes one row at a time to determine_value).
jeopardy['high_value'] = jeopardy.apply(determine_value, axis = 1)

In [14]:
# Display the frame, now sorted by air date, with the derived columns added.
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap,high_value
1138,1279,1990-03-08,Jeopardy!,THE MIDDLE AGES,$300,It's estimated this dread 14th century epidemi...,Black Death,its estimated this dread 14th century epidemic...,black death,300,0.0,0.000000,0
1154,1279,1990-03-08,Jeopardy!,SPORTS EQUIPMENT,$500,This apparatus used in women's gymnastics is a...,Balance Beam,this apparatus used in womens gymnastics is ab...,balance beam,500,0.0,0.000000,0
1153,1279,1990-03-08,Jeopardy!,AUSTRALIA,$500,This flightless bird is featured on Australia'...,Emu,this flightless bird is featured on australias...,emu,500,0.0,0.000000,0
1152,1279,1990-03-08,Jeopardy!,JEWELRY,$500,Tahiti & French Polynesia are famous for pearl...,Black,tahiti french polynesia are famous for pearls ...,black,500,0.0,0.000000,0
1151,1279,1990-03-08,Jeopardy!,MANIAS,$500,"From the Greek for ""great"", it's the delusion ...",Megalomania,from the greek for great its the delusion of w...,megalomania,500,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1962,6294,2012-01-19,Double Jeopardy!,AMERICAN HISTORY,$1600,His foes said that in 1877 he agreed to withdr...,(Rutherford B.) Hayes,his foes said that in 1877 he agreed to withdr...,rutherford b hayes,1600,0.0,0.625000,1
1963,6294,2012-01-19,Double Jeopardy!,WHAT'S YOUR BEEF?,$1600,The second word in the French name of this bon...,filet mignon,the second word in the french name of this bon...,filet mignon,1600,0.0,0.500000,1
1964,6294,2012-01-19,Double Jeopardy!,WEAPONS OF WORLD WAR II,$1600,It was the alphanumeric designation of the U.S...,the M1,it was the alphanumeric designation of the us ...,the m1,1600,0.0,0.333333,1
1950,6294,2012-01-19,Double Jeopardy!,4 N,$400,"Number of ""beers on the wall"" at the beginning...",ninety-nine,number of beers on the wall at the beginning o...,ninetynine,400,0.0,1.000000,0


In [15]:
def count_usage(term, data=None):
    """Count how often ``term`` occurs in high- and low-value questions.

    A question counts once if ``term`` equals one of the tokens obtained by
    splitting its ``clean_question`` on single spaces.

    Parameters
    ----------
    term : str
        The word to look for.
    data : DataFrame or mapping of columns, optional
        Source providing ``'clean_question'`` and ``'high_value'`` columns.
        Defaults to the notebook-level ``jeopardy`` frame.

    Returns
    -------
    tuple of int
        ``(high_count, low_count)``.
    """
    if data is None:
        data = jeopardy  # fall back to the frame built earlier in the notebook
    high_count = 0
    low_count = 0
    # Iterate only the two needed columns; iterrows() builds a Series per row.
    for question, is_high in zip(data['clean_question'], data['high_value']):
        if term in question.split(' '):
            if is_high == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [17]:
import random

# Fixed seed so the sampled terms, and every statistic derived from them,
# are the same on each run of the notebook.
SAMPLE_SEED = 1
N_COMPARISON_TERMS = 10

# A set of strings has no stable iteration order across interpreter sessions,
# so sort before sampling; otherwise the seed alone would not be enough.
terms_used_list = sorted(terms_used)
term_rng = random.Random(SAMPLE_SEED)
comparison_terms = [term_rng.choice(terms_used_list) for _ in range(N_COMPARISON_TERMS)]

# Despite the name, each entry holds the observed (high_count, low_count)
# pair for one sampled term.
observed_expected = [count_usage(term) for term in comparison_terms]

observed_expected

[(0, 1),
 (1, 1),
 (1, 0),
 (0, 1),
 (0, 1),
 (1, 0),
 (0, 1),
 (1, 0),
 (0, 2),
 (0, 5)]

In [19]:
from scipy.stats import chisquare
import numpy as np

# Number of rows in each value bucket across the whole data set.
high_value_count = len(jeopardy[jeopardy['high_value'] == 1])
low_value_count = len(jeopardy[jeopardy['high_value'] == 0])

n_rows = len(jeopardy)
chi_squared = []

for high_obs, low_obs in observed_expected:
    # Share of all rows whose question contains this term.
    term_share = (high_obs + low_obs) / n_rows
    # Expected counts if the term were spread across the buckets in
    # proportion to bucket size.
    expected = np.array([term_share * high_value_count, term_share * low_value_count])
    observed = np.array([high_obs, low_obs])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.38504155124653744, pvalue=0.5349173571192949),
 Power_divergenceResult(statistic=0.49108192670240525, pvalue=0.48344507831370875),
 Power_divergenceResult(statistic=2.597122302158273, pvalue=0.1070579459659198),
 Power_divergenceResult(statistic=0.38504155124653744, pvalue=0.5349173571192949),
 Power_divergenceResult(statistic=0.38504155124653744, pvalue=0.5349173571192949),
 Power_divergenceResult(statistic=2.597122302158273, pvalue=0.1070579459659198),
 Power_divergenceResult(statistic=0.38504155124653744, pvalue=0.5349173571192949),
 Power_divergenceResult(statistic=2.597122302158273, pvalue=0.1070579459659198),
 Power_divergenceResult(statistic=0.7700831024930749, pvalue=0.38019134513275776),
 Power_divergenceResult(statistic=1.9252077562326873, pvalue=0.16528369542875193)]

## Chi-squared results

None of the terms showed a statistically significant difference in usage between high-value and low-value rows (every p-value is above 0.05). Additionally, the expected frequencies were all lower than 5, so the chi-squared test isn't valid for these terms. It would be better to run this test only on terms that occur frequently.