# Guided Project: Winning Jeopardy 

The goal of this guided project is to explore data from Jeopardy questions. 

In [178]:
import pandas as pd

In [179]:
import re

## Exploring the Data

In [180]:
questions = pd.read_csv('Datasets/jeopardy.csv')

In [181]:
questions.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [182]:
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Show Number  19999 non-null  int64 
 1    Air Date    19999 non-null  object
 2    Round       19999 non-null  object
 3    Category    19999 non-null  object
 4    Value       19999 non-null  object
 5    Question    19999 non-null  object
 6    Answer      19999 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [183]:
questions.shape

(19999, 7)

## Cleaning the Data

In [184]:
questions.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [185]:
questions.columns = questions.columns.str.strip().str.lower().str.replace(' ','_')

In [186]:
questions.columns

Index(['show_number', 'air_date', 'round', 'category', 'value', 'question',
       'answer'],
      dtype='object')

In [187]:
def normalize(words):
    """Lowercase ``words`` and replace every non-word character with a space.

    Parameters
    ----------
    words : str
        Raw text (a question or an answer).

    Returns
    -------
    str
        The lowercased text in which each character that is not a letter,
        digit or underscore has been replaced by a single space. Runs of
        punctuation therefore become runs of spaces; callers that need
        tokens should use ``str.split()`` on the result.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.sub(r"\W", " ", words.lower())

In [188]:
questions['clean_answer'] = questions['answer'].apply(normalize)

In [189]:
questions['clean_question'] = questions['question'].apply(normalize)

In [190]:
def normalize_money(val):
    """Convert a dollar-amount string such as ``"$2,000"`` to an integer.

    Parameters
    ----------
    val : str
        Raw value text. ``None`` and float NaN are accepted and treated as
        "no value".

    Returns
    -------
    int
        The numeric amount, or 0 when ``val`` is missing or does not contain
        a plain number (for example the text ``"None"``, an empty string, or
        a string made only of punctuation).
    """
    # A missing value carries no dollar amount. NaN is the only float that
    # is not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return 0
    # Drop "$", "," and any other non-word characters (raw string avoids the
    # invalid "\W" escape in a plain literal).
    digits = re.sub(r"\W", "", val)
    try:
        return int(digits)
    except ValueError:
        # Text containing letters, or nothing left after stripping.
        return 0

In [191]:
questions['clean_value'] = questions['value'].apply(normalize_money)

In [192]:
questions['air_date'] = pd.to_datetime(questions['air_date'])

In [193]:
questions.head()

Unnamed: 0,show_number,air_date,round,category,value,question,answer,clean_answer,clean_question,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,copernicus,for the last 8 years of his life galileo was ...,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,jim thorpe,no 2 1912 olympian football star at carlisl...,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,arizona,the city of yuma in this state has a record av...,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,mcdonald s,in 1963 live on the art linkletter show th...,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,john adams,signer of the dec of indep framer of the co...,200


In [194]:
def matcher(row):
    """Return the share of answer words that also occur in the question.

    ``row`` must provide ``'clean_answer'`` and ``'clean_question'`` strings.
    The first occurrence of the word 'the' is dropped from the answer before
    counting. If no answer words remain, 0 is returned.
    """
    answer_words = row['clean_answer'].split()
    question_words = row['clean_question'].split()

    # Discard one 'the' so that it does not count as a match.
    if 'the' in answer_words:
        answer_words.remove('the')

    # Guard against dividing by zero for empty answers.
    if not answer_words:
        return 0

    hits = sum(1 for token in answer_words if token in question_words)
    return hits / len(answer_words)

In [195]:
questions['answer_in_question'] = questions.apply(matcher, axis=1)

In [196]:
questions['answer_in_question'].mean()

0.06294645581984942

Here, we see that, on average, only about 6% of the words in an answer also appear in its question. Consequently, it would be difficult to guess the answer just from the words in the question. 

## Investigating Question Repeats

In [197]:
question_overlap = []
terms_used = set()

In [198]:
questions = questions.sort_values('air_date')

In [199]:
# Visit the questions in the DataFrame's current order and, for each one,
# record the share of its long words (more than 5 characters) that were
# already seen in previously visited questions.
for _idx, row in questions.iterrows():
    long_words = [token for token in row['clean_question'].split() if len(token) > 5]

    # Count matches against earlier questions only, then register this
    # question's words so that later questions can match them.
    already_seen = sum(1 for token in long_words if token in terms_used)
    terms_used.update(long_words)

    # A question without long words keeps an overlap of 0.
    overlap = already_seen / len(long_words) if long_words else already_seen
    question_overlap.append(overlap)

In [200]:
questions['question_overlap'] = question_overlap

In [201]:
questions['question_overlap'].mean()

0.7197989717809659

Above, we see that, on average, around 72% of the words with 6 or more letters in a question had already appeared in an earlier Jeopardy question. However, the analysis above looked at each word individually, so it only tells us that vocabulary is reused; it does not tell us whether entire questions are actually being repeated. 

## Comparing Low and High-Value Questions

In [202]:
def valuer(row):
    """Flag a question as high value: 1 if ``row['clean_value']`` exceeds 800, else 0."""
    return 1 if row['clean_value'] > 800 else 0

In [203]:
questions['high_value'] = questions.apply(valuer, axis=1)

In [222]:
def value_counter(word, data=None):
    """Count the high- and low-value questions that contain ``word``.

    Parameters
    ----------
    word : str
        Term to look for. It must match a whole whitespace-separated token
        of the cleaned question text.
    data : DataFrame, optional
        Frame with ``'clean_question'`` and ``'high_value'`` columns.
        Defaults to the notebook-level ``questions`` frame.

    Returns
    -------
    tuple of (int, int)
        ``(high_count, low_count)``: the number of matching questions whose
        ``high_value`` is 1, and the number of all other matching questions.
        A question counts once even if it contains the word several times.
    """
    if data is None:
        data = questions

    high_count = 0
    low_count = 0
    # Iterate over just the two columns needed instead of building a full
    # Series for every row with iterrows().
    for text, is_high in zip(data['clean_question'], data['high_value']):
        if word in text.split():
            if is_high == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count


In [243]:
import random

In [258]:
# Sample from a sorted list rather than straight from the set: a set of
# strings iterates in an order that depends on per-process hash
# randomization, so seeding alone would not give the same ten words after a
# kernel restart.
random.seed(1)
comparison_terms = random.sample(sorted(terms_used), 10)

In [260]:
# One (high_count, low_count) pair of observed counts per sampled term.
observed_expected = [value_counter(term) for term in comparison_terms]

In [261]:
observed_expected

[(1, 4),
 (0, 1),
 (0, 1),
 (1, 0),
 (0, 1),
 (1, 0),
 (0, 1),
 (0, 1),
 (11, 33),
 (0, 2)]

In [262]:
high_value_count = questions[ questions['high_value'] == 1].shape[0]

In [264]:
low_value_count = questions[ questions['high_value'] == 0].shape[0]

In [275]:
from scipy.stats import chisquare

# Despite its name, this list collects the p-value (second element of the
# chisquare result) for each term, not the chi-squared statistic itself.
chi_squared = []
n_questions = questions.shape[0]
for high_obs, low_obs in observed_expected:
    # Share of all questions that contain the term.
    term_share = (high_obs + low_obs) / n_questions
    # Expected counts if the term were spread evenly across both groups.
    expected_counts = [high_value_count * term_share, low_value_count * term_share]
    _stat, p_value = chisquare([high_obs, low_obs], expected_counts)
    chi_squared.append(p_value)

In [276]:
chi_squared

[0.6680941623250602,
 0.5260772985705469,
 0.5260772985705469,
 0.11473257634454047,
 0.5260772985705469,
 0.11473257634454047,
 0.5260772985705469,
 0.5260772985705469,
 0.5902149141987725,
 0.3699222378079571]

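Above, we see that none of the words we randomly selected occur at different rates in high and low-value questions at a statistically significant level (p-value < .05). Note, however, that most of these words appear in only a handful of questions, so the expected counts are very small and the chi-squared test is not reliable for them; words that occur more frequently would give a more trustworthy comparison. 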