analyzing Jeopardy questions to identify patterns in the questions

dataset obtained contains 20000 Jeopardy questions

import libraries

In [10]:
import pandas as pd
import re
import scipy.stats
import numpy as np

In [4]:
jeopardy=pd.read_csv('jeopardy.csv')

explore dataset

In [6]:
print(jeopardy.head(5))
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

rename column names to remove spaces

In [9]:
new_columns=['show_number', 'air_date', 'round', 'category', 'value', 'question', 'answer']
jeopardy.columns=new_columns

normalize text columns

In [11]:
def normalize_text(input_string):
    input_string=input_string.lower()
    input_string=re.sub(r'[^A-Za-z0-9\s]', '', input_string)
    return input_string

In [16]:
jeopardy['clean_question'] = jeopardy['question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['answer'].apply(normalize_text)

normalize value columns

In [13]:
def normalize_value(input_string):
    input_string=re.sub("[^A-Za-z0-9\s]", "", input_string)
    try:
        input_string=int(input_string)
    except Exception:
        #default to 0 if not numeric
        input_string=0
    return input_string

In [15]:
jeopardy['clean_value'] = jeopardy['value'].apply(normalize_value)
jeopardy["air_date"] = pd.to_datetime(jeopardy["air_date"])

in order to figure out whether to study past questions, study general knowledge, or not study it all, it would be helpful to figure out two things:
-  How often the answer is deducible from the question.
-  How often new questions are repeats of older questions.

question 1: 

In [None]:
def count_matches(row):
    split_answer=row["clean_answer"].split(" ")
    split_question=row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count=0
    for item in split_answer:
        if item in split_question:
            match_count+=1
    return match_count / len(split_answer)

jeopardy["answer_in_question"]=jeopardy.apply(count_matches, axis=1)
print(jeopardy["answer_in_question"].mean())

question 2:

In [17]:
question_overlap=[]
terms_used=set()
for i, row in jeopardy.iterrows():
        split_question=row["clean_question"].split(" ")
        split_question=[q for q in split_question if len(q) > 5]
        match_count=0
        for word in split_question:
            if word in terms_used:
                match_count+=1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            match_count/=len(split_question)
        question_overlap.append(match_count)
jeopardy["question_overlap"]=question_overlap
print(jeopardy["question_overlap"].mean())

0.6908737315671962

identify high-value questions using chi-squared test - find the words with the biggest differences in usage between high and low value questions, by selecting the words with the highest associated chi-squared values

limiting the usage of the above to reduce time

In [18]:
def determine_value(row):
    value=0
    if row["clean_value"] > 800:
        value=1
    return value
jeopardy["high_value"]=jeopardy.apply(determine_value, axis=1)
print(jeopardy["high_value"])

In [19]:
def count_usage(term):
    low_count=0
    high_count=0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count+=1
            else:
                low_count+=1
    return high_count, low_count
comparison_terms=list(terms_used)[:5]
observed_expected=[]
for term in comparison_terms:
    observed_expected.append(count_usage(term))
print(observed_expected)

[(3, 10), (0, 1), (0, 1), (0, 2), (1, 1)]

In [None]:
high_value_count=jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count=jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared=[]
for obs in observed_expected:
    total=sum(obs)
    total_prop=total / jeopardy.shape[0]
    high_value_exp=total_prop * high_value_count
    low_value_exp=total_prop * low_value_count
    observed=np.array([obs[0], obs[1]])
    expected=np.array([high_value_exp, low_value_exp])
    chi_squared.append(scipy.stats.chisquare(observed, expected))
print(chi_squared)

no terms
none of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.