In [39]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#Reading the dataset 
jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head(5))
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value', ' Question', ' Answer'], dtype='object')


In [40]:
fixed_columns = ['Show Number','Air Date','Round','Category','Value','Question','Answer']
jeopardy.columns= fixed_columns
print(jeopardy.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer'], dtype='object')


In [41]:
#Creating a function that normalizes text
import re
def normalize_text(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    return string

In [42]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)

In [43]:
#Creating a function that normalizes the VALUE column 
def normalize_values(string):
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    try:
        string = int(string)
    except Exception:
        string = 0
    return string

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_values)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [44]:
#Creating a function to determine general trends
def find_prediction(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    the = 'the'
    if "the" in split_answer:
        split_answer.remove("the")
    if(len(split_answer)==0):
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
        else:
            pass
    return(match_count/len(split_answer))

answer_in_question = jeopardy.apply(find_prediction,axis=1)

In [45]:
jeopardy['answer_in_question'] = answer_in_question
mean = jeopardy['answer_in_question'].mean()
print(mean)

0.0604932570693


From the mean value given above, we can see that only 6% of the time we can deduce the answer from the question. The value is very low to be used as a method for predicting the answer for Jeopardy.

In [46]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question_updated = []
    for item in split_question:
        if(len(item)>5):
            split_question_updated.append(item)
        else:
            pass
    match_count = 0
    for item in split_question_updated:
        if item in terms_used:
            match_count += 1
    for item in split_question_updated:
        terms_used.add(item)
    if(len(split_question_updated)>0):
        match_count = match_count / len(split_question_updated)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())

0.690873731567


From the abover value, we can see that about 70% of the questions are repeated from the older ones, although this might be a little exaggerated since the dataset that we are operating on is only 10% of the original full Jeopardy question dataset.

In [47]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [48]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 4), (0, 1), (0, 1), (0, 2), (1, 0)]

In [55]:
import numpy as np
from scipy.stats import chisquare
high_value_1_count = 0
high_value_0_count = 0
high_value_1_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
high_value_0_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []
for row in observed_expected:
    total = sum(row)
    total_prop = total / jeopardy.shape[0]
    exp_high_value = total_prop * high_value_1_count
    exp_low_value = total_prop * high_value_0_count
    observed_values = np.array([row[0],row[1]])
    expected_values = np.array([exp_high_value,exp_low_value])
    chi_squared.append(chisquare(observed_values,expected_values))
    
print(chi_squared)

[(1.607851384507536, 0.20479409439225948), (0.40196284612688399, 0.52607729857054686), (0.40196284612688399, 0.52607729857054686), (0.80392569225376798, 0.36992223780795708), (2.4877921171956752, 0.11473257634454047)]


As can be seen from the chi-squared test done above, none of the p-values are less than 0.05, meaning that we can easily say that our question in deducing that studying questions that pertain to high value questions instead of low value questions will help us to earn more money is not true and also that we need to look at other values too.