# Jeopardy 

<strong>This project looks at 20000 jeopardy questions, trying to determine if it's possible to extract any insights in terms of the best possible catagories to study.</strong>

# Initial Import and Cleanup

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

jeopardy = pd.read_csv('jeopardy.csv')

In [2]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
colnames = {}
for i,cn in enumerate(jeopardy.columns):
        colnames[cn]=cn.strip(' ')
jeopardy.rename(columns=colnames,inplace=True)
print(jeopardy.columns)
print()
jeopardy.info()

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
Air Date       19999 non-null object
Round          19999 non-null object
Category       19999 non-null object
Value          19999 non-null object
Question       19999 non-null object
Answer         19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [4]:
import re
def normalizer(s):
    """This function takes in an input string and then normalises it by converting it to lowercase
    and removing all the punctutation"""
    return re.sub(r'[^\w\s]','',s.lower())

print(normalizer("This is a test sentence, it's very long!"))

this is a test sentence its very long


In [5]:
#clean up the answer and question columns
jeopardy['clean_question'] = jeopardy['Question'].apply(lambda x:normalizer(x))
jeopardy['clean_answer'] = jeopardy['Answer'].apply(lambda x:normalizer(x))

In [6]:
#convert values to numeric
jeopardy['Value']=jeopardy['Value'].apply(lambda x: normalizer(x[1:]))
jeopardy['clean_value'] = jeopardy['Value'].replace({'one':1}).astype(int)

In [7]:
#convert air dates to datetime
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

# Analyse for trends

<strong>Does the answer ever occur in the question?</strong>

In [8]:
def wordrep(ir):
    """Function takes in an input row, splits the question and answer on whitespace and then looks to 
    see how often words from the answer are repeated in the question"""
    split_question = ir['clean_question'].split(' ')
    split_answer = ir['clean_answer'].split(' ')
    
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer)==0:
        return 0
    else:
        for e in split_answer:
            if e in split_question:
                match_count += 1
        return match_count/len(split_answer)

jeopardy['answer_in_question'] = jeopardy.apply(lambda row: wordrep(row), axis=1)

In [9]:
jeopardy['answer_in_question'].describe()

count    19999.000000
mean         0.060493
std          0.167014
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: answer_in_question, dtype: float64

It's rare that the answer is a repeat of the question.

<strong>How often do questions repeat?</strong>

In [10]:
#sort by air date to put questions in chronological order
jeopardy.sort_values('Air Date',ascending=True,inplace=True)

In [49]:
question_overlap = []
terms_used = {}

for i,r in jeopardy.iterrows():
    split_question = r['clean_question'].split(' ')
    split_question = [e for e in split_question if len(e)>=6]
    match_count = 0
    for w in split_question:
        if w in terms_used.keys():
            terms_used[w] += 1
            match_count += 1
        else:
            terms_used[w] = 1
    if len(split_question)>0:
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)

jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()    

0.6894006357823182

There appears to be a high degree of repetition among the larger words used in the questions.  This is promising with regards to studying...  Now, look at the impact of question value on word occurence to get a weighted importance level...

In [50]:
#identify all questions of high value (>800)
jeopardy['high_value'] = (jeopardy['clean_value']>800)
jeopardy['high_value'].replace({False:0,True:1},inplace=True) 

In [51]:
def relativeWeight(w,df):
    """Determines how often a particular word repeats given the question's value
    returns a tuple with the number of occurences in low value questions and high value questions
    """
    low_count = 0
    high_count = 0
    for i,r in df.iterrows():
        split_question = r['clean_question'].split(' ')
        if w in split_question:
            if r['high_value']==1:
                high_count += 1
            else:
                low_count += 1
    return (low_count,high_count)

In [53]:
observed_expected = []
comparison_terms = []

for i in range(0,10):
    k = max(terms_used, key=terms_used.get)
    comparison_terms.append(k)
    terms_used.pop(k)

for t in comparison_terms:
    observed_expected.append(relativeWeight(t,jeopardy))
    
observed_expected

[(133, 108),
 (134, 73),
 (124, 57),
 (124, 55),
 (134, 42),
 (108, 55),
 (119, 50),
 (110, 54),
 (108, 52),
 (102, 58)]

In [62]:
high_value_count = jeopardy['high_value'].sum()
low_value_count = jeopardy['high_value'].count() - high_value_count

chi_squared = []
cnt = 0 
for l,h in observed_expected:
    total = l + h
    total_prop = total/(high_value_count+low_value_count)
    elow = low_value_count*total_prop
    ehigh = high_value_count*total_prop    
    chi2,pv = stats.chisquare([l,h],[elow,ehigh])
    chi_squared.append([comparison_terms[cnt],chi2,pv])
    cnt += 1

chi_squared

[['french', 30.70509560211122, 3.003751982853918e-08],
 ['island', 4.401396413478652, 0.03590951591318824],
 ['people', 0.7039630431807515, 0.4014552498370205],
 ['national', 0.36956355622281933, 0.5432422635312689],
 ['largest', 1.9892622715198827, 0.15841803672554888],
 ['little', 2.049483384913445, 0.15225784710260734],
 ['around', 0.06908968634314422, 0.7926668351740855],
 ['british', 1.4521478773041714, 0.22818361990918334],
 ['author', 1.1467782632567483, 0.28422458885200585],
 ['meaning', 4.4934633334396965, 0.03402468062121473]]

There seems to be a statistically significant increase in 3 of the top ten words:
French, Island, and Meaning.  In particular, France seems to be a good thing to study in order to high higher value questions.