# Analysis and chi-squared test on jeopardy questions data

In [147]:
import pandas as pd
import string
import random
from scipy.stats import chisquare

In [154]:
jeopardy = pd.read_csv('./data/jeopardy.csv')

In [155]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [12]:
jeopardy.columns = jeopardy.columns.str.strip() #some of the column names had spaces in front

## Normalizing the text columns

In [62]:
def normalize(text):
    text = str(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [63]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)

## Normalizing the value column

In [70]:
def normalize_2(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    try:
        output = int(text)
    except ValueError:
        output = 0
    return output

In [72]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_2)

## Converting date column to datetime format

In [73]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

---

## How often the answer is deducible from the question?

In [92]:
def get_match_counts(row):
    
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    
    try:
        while True:
            split_answer.remove('the')
    except ValueError:
        pass
    
    if len(split_answer) == 0:
        output = 0
    else:
        for item in split_answer:
            if item in split_question:
                match_count +=1
        output = match_count/len(split_answer)
        
    return output

In [93]:
jeopardy['answer_in_question'] = jeopardy.apply(get_match_counts, axis=1)

In [94]:
jeopardy['answer_in_question'].mean()

0.05876463943179808

The mean value above tells us that on average 5% of the words in the answer already occured in the question. This is a low percentage and means that a good studying strategy should not include the question that have to be answered.  

## How often new questions are repeats of older ones?

In [117]:
question_overlap = []
terms_used = []
terms_used_set = set(terms_used)

jeopardy.sort_values(by='Air Date', inplace=True)

for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    
    for item in split_question:
        if len(item) < 6:
            split_question.remove(item)
            
    match_count = 0
    
    for item in split_question:
        terms_used_set.add(item)
        if item in terms_used_set:
            match_count +=1
            
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
        
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap

In [119]:
jeopardy['question_overlap'].mean()

0.9976444014198128

The mean value above means that on average, 99% of the words in a given question already occured in previous question.

## Low value vs. high value questions & applying chi-squared test on some random words

In [120]:
def high_value(row):
    if row['clean_value']>800:
        value = 1
    else:
        value = 0
    return value

In [121]:
jeopardy['high_value'] = jeopardy.apply(high_value, axis=1)

In [122]:
def high_count_low_count(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_clean_question = row['clean_question'].split(' ')
        if word in split_clean_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [136]:
comparison_terms = random.sample(list(terms_used_set),10)
comparison_terms

['bags',
 'takers',
 'borsalino',
 'hrefhttpwwwjarchivecommedia20050209dj07jpg',
 'cares',
 'khlestakov',
 'wasn’t',
 'byssus',
 'neverbeforeseen',
 'caudal']

In [137]:
observed_expected = []
for item in comparison_terms:
    observed_expected.append(high_count_low_count(item))

In [151]:
for term, observed in zip(comparison_terms, observed_expected):
    print (term,observed)

bags (10, 57)
takers (1, 2)
borsalino (1, 0)
hrefhttpwwwjarchivecommedia20050209dj07jpg (0, 1)
cares (3, 9)
khlestakov (1, 0)
wasn’t (1, 3)
byssus (0, 1)
neverbeforeseen (0, 1)
caudal (0, 4)


In [140]:
high_value_count = len(jeopardy[jeopardy['high_value'] == 1])
low_value_count = len(jeopardy[jeopardy['high_value'] == 0])

In [143]:
high_value_count, low_value_count

(61422, 155508)

In [150]:
chi_squared = []
total_rows = len(jeopardy)
for item in observed_expected:
    observed = list(item)
    total = sum(item)
    total_prop = total/total_rows
    
    expected_high_value_count = total_prop*high_value_count
    expected_low_value_count = total_prop*low_value_count
    expected = [expected_high_value_count, expected_low_value_count]
    print(observed,expected)
    
    chi_squared.append(chisquare(observed, expected))

[10, 57] [18.970515834601024, 48.02948416539898]
[1, 2] [0.8494260821463144, 2.1505739178536856]
[1, 0] [0.2831420273821048, 0.7168579726178952]
[0, 1] [0.2831420273821048, 0.7168579726178952]
[3, 9] [3.3977043285852577, 8.602295671414742]
[1, 0] [0.2831420273821048, 0.7168579726178952]
[1, 3] [1.1325681095284192, 2.8674318904715808]
[0, 1] [0.2831420273821048, 0.7168579726178952]
[0, 1] [0.2831420273821048, 0.7168579726178952]
[0, 4] [1.1325681095284192, 2.8674318904715808]


In [149]:
chi_squared

[Power_divergenceResult(statistic=5.9172861905602865, pvalue=0.014993034601508873),
 Power_divergenceResult(statistic=0.03723409388907139, pvalue=0.846989214486915),
 Power_divergenceResult(statistic=2.5317964247338085, pvalue=0.11157312838169751),
 Power_divergenceResult(statistic=0.3949764642333513, pvalue=0.5296950912486695),
 Power_divergenceResult(statistic=0.06493845212547802, pvalue=0.7988542166146058),
 Power_divergenceResult(statistic=2.5317964247338085, pvalue=0.11157312838169751),
 Power_divergenceResult(statistic=0.021646150708492677, pvalue=0.8830323245068887),
 Power_divergenceResult(statistic=0.3949764642333513, pvalue=0.5296950912486695),
 Power_divergenceResult(statistic=0.3949764642333513, pvalue=0.5296950912486695),
 Power_divergenceResult(statistic=1.5799058569334052, pvalue=0.2087742545638461)]

For the first term "bag" (that occured 10 times in high value questions and 57 times in low value questions) we have a significant difference between expected and observed values. This is confirmed by the the corresponding pvalue of 0.014 which means that there is a 1,4% chance that this difference is due to pure chance.