# Jeopardy  (TV show) questions 

In [1]:
import pandas as pd
import numpy as np
jeopardy = pd.read_csv('jeopardy.csv')

In [2]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
# removes blanks in front of each column name
jeopardy.columns.str.strip()

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
jeopardy.rename(dict(zip(jeopardy.columns, jeopardy.columns.str.strip())), axis=1, inplace=True)

In [6]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [7]:
# normalize questions and answers, first checkout the original questions and answers
print(jeopardy['Question'][0])
print(jeopardy['Answer'][0])

For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory
Copernicus


In [8]:
# define a function to edit the text strings and apply on Question and Answer columns
import string
def clean_str(str):
    str_lower = str.lower()
    for c in string.punctuation:
        str_lower = str_lower.replace(c, '')
    return str_lower


In [9]:
jeopardy['clean_question'] = jeopardy['Question'].apply(clean_str)


In [10]:
jeopardy['clean_question'][0]

'for the last 8 years of his life galileo was under house arrest for espousing this mans theory'

In [11]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(clean_str)
print(jeopardy['clean_answer'][0])

copernicus


In [13]:
jeopardy.head(3)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona


In [16]:
## normalize value column by removing the $ sign
jeopardy['Value'].unique()

array(['$200', '$400', '$600', '$800', '$2,000', '$1000', '$1200',
       '$1600', '$2000', '$3,200', 'None', '$5,000', '$100', '$300',
       '$500', '$1,000', '$1,500', '$1,200', '$4,800', '$1,800', '$1,100',
       '$2,200', '$3,400', '$3,000', '$4,000', '$1,600', '$6,800',
       '$1,900', '$3,100', '$700', '$1,400', '$2,800', '$8,000', '$6,000',
       '$2,400', '$12,000', '$3,800', '$2,500', '$6,200', '$10,000',
       '$7,000', '$1,492', '$7,400', '$1,300', '$7,200', '$2,600',
       '$3,300', '$5,400', '$4,500', '$2,100', '$900', '$3,600', '$2,127',
       '$367', '$4,400', '$3,500', '$2,900', '$3,900', '$4,100', '$4,600',
       '$10,800', '$2,300', '$5,600', '$1,111', '$8,200', '$5,800',
       '$750', '$7,500', '$1,700', '$9,000', '$6,100', '$1,020', '$4,700',
       '$2,021', '$5,200', '$3,389'], dtype=object)

In [34]:
def clean_val(inp):
    inp_nopunc = inp
    try:
        for c in string.punctuation:
            inp_nopunc = inp_nopunc.replace(c, '')
        return int(inp_nopunc)
    except Exception:
        return 0


In [35]:
jeopardy['clean_value'] = jeopardy['Value'].apply(clean_val)

In [36]:
jeopardy['clean_value'].unique()

array([  200,   400,   600,   800,  2000,  1000,  1200,  1600,  3200,
           0,  5000,   100,   300,   500,  1500,  4800,  1800,  1100,
        2200,  3400,  3000,  4000,  6800,  1900,  3100,   700,  1400,
        2800,  8000,  6000,  2400, 12000,  3800,  2500,  6200, 10000,
        7000,  1492,  7400,  1300,  7200,  2600,  3300,  5400,  4500,
        2100,   900,  3600,  2127,   367,  4400,  3500,  2900,  3900,
        4100,  4600, 10800,  2300,  5600,  1111,  8200,  5800,   750,
        7500,  1700,  9000,  6100,  1020,  4700,  2021,  5200,  3389])

In [37]:
jeopardy['Air Date']

0        2004-12-31
1        2004-12-31
2        2004-12-31
3        2004-12-31
4        2004-12-31
5        2004-12-31
6        2004-12-31
7        2004-12-31
8        2004-12-31
9        2004-12-31
10       2004-12-31
11       2004-12-31
12       2004-12-31
13       2004-12-31
14       2004-12-31
15       2004-12-31
16       2004-12-31
17       2004-12-31
18       2004-12-31
19       2004-12-31
20       2004-12-31
21       2004-12-31
22       2004-12-31
23       2004-12-31
24       2004-12-31
25       2004-12-31
26       2004-12-31
27       2004-12-31
28       2004-12-31
29       2004-12-31
            ...    
19969    2009-05-14
19970    2009-05-14
19971    2009-05-14
19972    2009-05-14
19973    2009-05-14
19974    2009-05-14
19975    2009-05-14
19976    2009-05-14
19977    2009-05-14
19978    2009-05-14
19979    2009-05-14
19980    2009-05-14
19981    2009-05-14
19982    2009-05-14
19983    2009-05-14
19984    2009-05-14
19985    2009-05-14
19986    2009-05-14
19987    2009-05-14


In [40]:
# convert air date to a datetime column
jeopardy['date'] = pd.to_datetime(jeopardy['Air Date'])

In [43]:
jeopardy_clean = jeopardy.copy().drop(['Air Date', 'Value','Question','Answer'],axis=1)

In [44]:
jeopardy_clean.head()

Unnamed: 0,Show Number,Round,Category,clean_question,clean_answer,clean_value,date
0,4680,Jeopardy!,HISTORY,for the last 8 years of his life galileo was u...,copernicus,200,2004-12-31
1,4680,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,2004-12-31
2,4680,Jeopardy!,EVERYBODY TALKS ABOUT IT...,the city of yuma in this state has a record av...,arizona,200,2004-12-31
3,4680,Jeopardy!,THE COMPANY LINE,in 1963 live on the art linkletter show this c...,mcdonalds,200,2004-12-31
4,4680,Jeopardy!,EPITAPHS & TRIBUTES,signer of the dec of indep framer of the const...,john adams,200,2004-12-31


### We have cleaned the data and saved it to another DataFrame jeopardy_clean

## How often the answer is deducible from the question, i.e. the answer is already in the question? 

In [49]:
# define a function that calculates how many times each word in answer occurs in question
def count_answer_in_question(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    else:
        for word in split_answer:
            if word in split_question:
                match_count += 1
        return match_count /len(split_answer)

    

In [50]:
count_answer_in_question(jeopardy_clean.loc[0])

0.0

In [53]:
jeopardy_clean['answer_in_question'] = jeopardy_clean.apply(count_answer_in_question, axis=1)

In [56]:
print(jeopardy_clean['answer_in_question'].mean())
print(jeopardy_clean['answer_in_question'].median())

0.06035277385469894
0.0


### Although typically the answer is not in the question (median count = 0), the average count of answer appearing in question is 0.06

## How many questions are repeated from old question? Find the key words of each question and count the occurrance. 

In [77]:
question_overlap = []
terms_used = set()
for index, row in jeopardy_clean.iterrows():
    split_question = row['clean_question'].split(' ')
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
#        print(row)
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)

jeopardy_clean['question_overlap']  = question_overlap
print(jeopardy_clean['question_overlap'].mean())
        



0.8035657890528735


### The rate of repeating questions is quite high. 

## Focus on high-value questions

In [80]:
def high_val(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value
jeopardy_clean['high_value'] = jeopardy_clean.apply(high_val, axis=1)

In [128]:
def high_low(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy_clean.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count


        

In [122]:
comparison_terms = list(terms_used)

In [123]:
comparison_terms.remove('')

In [124]:
comparison_terms[:5]

['seltzer',
 'threepenny',
 'pa',
 'continue',
 'nairobi',
 'unknowns',
 'hungary',
 'quindlen',
 'ecological',
 'presbyterian',
 'einheriar',
 'amar',
 '4inch',
 'wolfhound',
 'annuus',
 'budd',
 'winkies',
 'afghanistan',
 'lydians',
 'lords',
 'ebenezer',
 'plaza',
 'had',
 'jersey',
 'hrefhttpwwwjarchivecommedia20070719dj02ajpg',
 'final',
 'dillon',
 'kettles',
 'delite',
 'operators',
 'smelter',
 'aureum',
 'irrigated',
 '1815',
 'charlaine',
 'childlike',
 'costar',
 'hearing',
 'everywhere',
 'nursery',
 'oscarsa',
 'neigh',
 'norwest',
 'harrys',
 'galilee',
 'jerzy',
 'single',
 'sewards',
 'specifically',
 'lookst',
 'maid',
 'speciale',
 'struggle',
 'front',
 'khrushchev',
 'morial',
 'vertebral',
 'oesterreich',
 'waterpower',
 'reordering',
 'krakauer',
 'macdui',
 'thisand',
 'these',
 'pajama',
 'spellings',
 'hollyhock',
 'amherst',
 'drilling',
 'ross',
 'strangers',
 'professionals',
 'chinos',
 'lowtemperature',
 'addams',
 'spouse',
 'buddy',
 'wide',
 'walla',
 

In [130]:
observed_expected = []
for word in comparison_terms[:5]:
    observed_expected.append(high_low(word))

In [131]:
observed_expected



[(0, 1), (1, 0), (2, 4), (0, 2), (1, 3)]

In [132]:
high_value_count = jeopardy_clean[jeopardy_clean['high_value']==1].shape[0]
low_value_count = jeopardy_clean[jeopardy_clean['high_value']==0].shape[0]

In [136]:
chi_squared = []
from scipy.stats import chisquare
#from scipy.stats import chi2_contigency

for ls in observed_expected:
    total = np.sum(ls)
    total_prop = total / jeopardy_clean.shape[0]
    high_exp = high_value_count * total_prop
    low_exp = low_value_count * total_prop

    chi_squared.append(chisquare(ls, [high_exp, low_exp]))
    

In [137]:
chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.06376233446880725, pvalue=0.8006453026878781),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.02636443308440769, pvalue=0.871013484688921)]

### The first 5 terms have big p values, meaning their reoccurrances are statistically insignificant. 