In [435]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
Show Number    19999 non-null int64
 Air Date      19999 non-null object
 Round         19999 non-null object
 Category      19999 non-null object
 Value         19999 non-null object
 Question      19999 non-null object
 Answer        19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [436]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [437]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [438]:
names = []
for each in jeopardy.columns:
    names.append(each.replace(" ", ""))

jeopardy.columns = names

In [439]:
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [440]:
import re

def norma(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    string = re.sub("\s+", " ", string)
    return string

def norma_dollar(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    string = re.sub("\s+", " ", string)
    try:
        string = int(string)
    except Exception:
        string = 0
    return string

In [441]:
jeopardy['clean_question'] = jeopardy['Question'].apply(norma)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(norma)

In [442]:
jeopardy['clean_answer'].head(20)

0              copernicus
1              jim thorpe
2                 arizona
3               mcdonalds
4              john adams
5                 the ant
6          the appian way
7          michael jordan
8              washington
9            crate barrel
10         jackie gleason
11                the cud
12    ceylon or sri lanka
13              jim brown
14           the uv index
15                 bulova
16            jesse james
17                    imp
18      the international
19             lou gehrig
Name: clean_answer, dtype: object

In [443]:
jeopardy['clean_value'] = jeopardy['Value'].apply(norma_dollar)

In [444]:
jeopardy["AirDate"] = pd.to_datetime(jeopardy["AirDate"])


In [445]:
def count_matches(row):
    split_answer = row["clean_answer"].split()
    split_question = row["clean_question"].split()
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

In [446]:
jeopardy["answer_in_question"].mean()

0.05900196524977763

In [447]:
question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values('AirDate')

for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for each in split_question:
        if each in terms_used:
            match_count +=1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count / len(split_question)
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
question_overla_mean = jeopardy["question_overlap"].mean()
print(question_overla_mean)

3.0658032901645083


In [448]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [449]:
def count_word(word):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row['high_value'] == 1:
                high_count +=1
            else:
                low_count +=1
    return high_count, low_count
                

In [450]:
from random import choice

terms_used_list = list(terms_used)
comparison_terms = [choice(terms_used_list) for _ in range(10)]

observed_expected = []

for term in comparison_terms:
    observed_expected.append(count_word(term))

observed_expected

[(0, 1),
 (0, 1),
 (2, 6),
 (0, 4),
 (0, 4),
 (0, 1),
 (0, 1),
 (0, 2),
 (0, 1),
 (5, 7)]

In [451]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]



chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.05272886616881538, pvalue=0.818381104912348),
 Power_divergenceResult(statistic=1.607851384507536, pvalue=0.20479409439225948),
 Power_divergenceResult(statistic=1.607851384507536, pvalue=0.20479409439225948),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.9909151991757656, pvalue=0.31951879465803057)]