In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import string

In [2]:
jeopardy=pd.read_csv("jeopardy.csv")

In [3]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [6]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [8]:
jeopardy.columns=[c.lstrip() for c in jeopardy.columns]

In [9]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

striped the spaces in front of the column names...

In [21]:
#function to convert all strings to lowercase and remove punctuation
def nor(s):
    s=s.lower()
    translator = str.maketrans('', '', string.punctuation)
    s=s.translate(translator)
    return s

In [25]:
#Clean question and answer column
jeopardy["clean_question"]=jeopardy["Question"].apply(nor)
jeopardy["clean_answer"]=jeopardy["Answer"].apply(nor)

In [26]:
#function to normalize dollar values. 
#strip dollar sign at the front, convert it to numeric
#return 0 if there is error
def stripdollar(s):
    translator = str.maketrans('', '', string.punctuation)
    s=s.translate(translator)
    try:
        return int(s)
    except:
        return 0

In [29]:
jeopardy["clean_value"]=jeopardy["Value"].apply(stripdollar)

In [30]:
#function to convert the time column to datetime objects
from dateutil import parser
def dt(d):
    return parser.parse(d)

In [32]:
jeopardy["Air Date"]=pd.to_datetime(jeopardy["Air Date"])

In [47]:
#
def match_QA(S):
    split_answer=S["clean_answer"].split(" ")
    split_question=S["clean_question"].split(" ")
    match_count=0
    try:
        split_answer=split_answer.remove("the")
    except:
        pass
    if split_answer is None:
        return 0
    for w in split_answer:
        if w in split_question:
            match_count+=1
    return match_count/len(split_answer)

In [48]:
jeopardy["answer_in_question"]=jeopardy.apply(match_QA,axis=1)

In [51]:
mean_AinQ=jeopardy["answer_in_question"].mean()
print(mean_AinQ)

0.048654748061


Only 4% of the words in the answers are also in the corresponding questions. It is therefore not practical to deduce the answer from the question. The game requires previous knowledge of what's going on, instead of logical deductions. 

In [56]:
question_overlap=[]
terms_used=set([])

for ind,r in jeopardy.iterrows():
    match_count=0
    split_question=r["clean_question"].split(" ")
    split_question=[w for w in split_question if len(w)>=6]
    if split_question:
        for w in split_question:
            if w in terms_used:
                match_count+=1
            else:
                terms_used.add(w)
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)

In [57]:
jeopardy['question_overlap']=question_overlap

In [58]:
print(jeopardy['question_overlap'].mean())

0.69195779922


This 69% means there is possibly recycling of the questions.

In [61]:
#Perform X-square test for terms v.s. value

#1.Take each row in jeopardy, if value > 800, mark the "high_value" as 1
def v(R):
    if R["clean_value"]>800:
        return 1
    else:
        return 0
jeopardy["high_value"]=jeopardy.apply(v,axis=1)    

In [62]:
#2.generate high-value count and low-value count for each word

def highlow(w):
    low_count=0
    high_count=0
    for i,row in jeopardy.iterrows():
        if w in row['clean_question'].split(" "):
            if row["high_value"]==1:
                high_count+=1
            else:
                low_count+=1
    return high_count,low_count

In [63]:
#3. Sample 5 words and see their values

observed_expected={}
comparison_terms=list(terms_used)[0:5]
for w in comparison_terms:
    observed_expected[w]=highlow(w)

In [64]:
print(observed_expected)

{'newborn': (1, 0), 'knotts': (1, 0), 'beliefs': (0, 3), 'geronimo': (3, 4), 'quarantines': (0, 1)}


In [70]:
#4. Construct the Expected v.s. Observed Table

In [71]:
high_value_count=len([v for v in jeopardy["high_value"] if v==1])
low_value_count=len([v for v in jeopardy["high_value"] if v==0])
print(high_value_count,low_value_count)

5734 14265


In [74]:
total=0
for k,v in observed_expected.items():
    total+=v[0]
    total+=v[1]
print(total)
total_prop=total/jeopardy.shape[0]

13


In [79]:
Observed_high=0
Observed_low=0
for k,v in observed_expected.items():
    Observed_high+=v[0]
    Observed_low+=v[1]

In [80]:
from scipy.stats import chisquare
E=[total_prop*high_value_count,total_prop*low_value_count]
chisq,pvalue=chisquare([Observed_high,Observed_low],E)

In [81]:
print(chisq,pvalue)

0.609263467847 0.435065262343


None of them is significant. Also, the numbers are too small to use chi-square test, which usually requires ~100+ data points.