In [2]:
import pandas as pd
import string
import numpy as np

#increase default number of rows possible to display
pd.options.display.max_rows=100

In [3]:
jeopardy=pd.read_csv('jeopardy.csv')

In [4]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
jeopardy.rename(columns={' Air Date':'Air Date',' Round':'Round',\
' Category':'Category',' Value':'Value',' Question':'Question',\
' Answer':'Answer'},inplace=True)

In [7]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [8]:
exclude=set(string.punctuation)

def norm_string(s):
    s=s.lower()
    s=''.join(char for char in s if char not in exclude)
    return s

In [9]:
#remove punctuation/caps from target column using function: norm_string
jeopardy['clean_question']=jeopardy['Question'].apply(norm_string)
jeopardy['clean_answer']=jeopardy['Answer'].apply(norm_string)

In [10]:
def norm_dollar_string(s):
    s=s.replace('$','')
    s=s.replace(',','')
    try:
        s=int(s)
    except:
        s=0
    return s

In [11]:
#remove punctuation from 'Value' column and convert to integer
jeopardy['clean_value']=jeopardy['Value'].apply(norm_dollar_string)

In [12]:
jeopardy['Air Date']=pd.to_datetime(jeopardy['Air Date'])

In [13]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [14]:
# calc percentage of answer words in question
# Ex. In answer "John Adams", John is in question:
# =1/2 = 0.5
def count_matches(row):
    split_answer=row['clean_answer'].split(' ')
    split_question=row['clean_question'].split(' ')
    match_count=0
    #remove all 'the' from split_answer, not just first one
    split_answer=[item for item in split_answer if item != 'the']
    if len(split_answer)==0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count+=1
    return match_count/len(split_answer)        

In [15]:
#Use pd.Dataframe.apply() method
#apply count_matches() function to jeopardy dataframe row
#axis=1 applies function to each row
jeopardy['answer_in_question']=jeopardy.apply(count_matches,axis=1)

In [16]:
answer_in_q_mean=jeopardy['answer_in_question'].mean()
print(round(answer_in_q_mean,4))

0.0597


In [17]:
#About 6% of the time, the answer is in the question.
#Therefore, this should be practiced.

In [18]:
#Sort the dataframe by ascending Air Date
#Jeopardy questions are now in chronological order
jeopardy.sort_values(by='Air Date',inplace=True)

In [19]:
#Are longer words (over 5 characters) in questions repeated in
#an older question?  Character minimum excludes common words like
#the and than that don't reveal a lot about the question.

#df.iterrows() is a generator that returns index and row
#A generator is a function which returns an object on which 
#you can call next().  next() returns one value each time it
#is called.

question_overlap=[]
terms_used=set()

for index, row in jeopardy.iterrows():
    split_question=row['clean_question'].split(' ')
    #exclude words shorter than 6 characters
    split_question=[element for element in split_question if len(element)>5]
    match_count=0
    #tally words repeated in older terms_used
    #tally done BEFORE any current split_question terms are added
    for word in split_question:
        if word in terms_used:
            match_count+=1  
    #add unique words to terms_used set
    for word in split_question:
        terms_used.add(word)
    #percent of split_question words that overlap with older questions
    if len(split_question) > 0:
        match_count=match_count/len(split_question)
    question_overlap.append(match_count)           

jeopardy['question_overlap']=question_overlap
jeopardy['question_overlap'].mean()

0.687124288096678

In [20]:
# A large number of questions are recycled over ~30 years.  About 69%.

In [21]:
def tag_high_low_val(dfrow):
    if dfrow['clean_value']>800:
        val=1
    else:
        val=0
    return val

In [22]:
#greate high_value column (high=1; low=0)
jeopardy['high_value']=jeopardy.apply(tag_high_low_val,axis=1)

In [23]:
#Find frequency of word in high/low value questions
#Low value question is $0-$800.  High value question is 
#greater than $800.
def word_freq(word):
    low_count=0
    high_count=0
    #Determine whether comparison word exists in question.
    #If it exist, is it high or low value question?
    for index, row in jeopardy.iterrows():
        if word in row['clean_question'].split(' '):
            if row['high_value']==1:
                high_count+=1
            else:
                low_count+=1
    return high_count, low_count

observed_expected=[]
comparison_terms=list(terms_used)[0:5]

#fetch word frequency and append to list
for word in comparison_terms:
    observed_expected.append(word_freq(word))

print(comparison_terms)    
print(observed_expected)

['brynner', 'baseballcincinnati', 'austern', 'gaseous', '“purely']
[(1, 1), (0, 1), (0, 1), (1, 0), (0, 1)]


In [24]:
high_value_count=jeopardy[jeopardy['high_value']==1]
high_value_count=high_value_count.shape[0]

low_value_count=jeopardy[jeopardy['high_value']==0]
low_value_count=low_value_count.shape[0]

jeopardy_tot_rows=jeopardy.shape[0]

print("High value count: ",high_value_count)
print("Low value count: ",low_value_count)
print('Jeopardy total rows: ',jeopardy_tot_rows)
print('\n','We expect ~29% of words are high value and 71% low value')

High value count:  5734
Low value count:  14265
Jeopardy total rows:  19999

 We expect ~29% of words are high value and 71% low value


In [25]:
from scipy.stats import chisquare
chi_squared=[]

for item in observed_expected:
    total=sum(item)
    #word proportion out of all questions
    #Ex. proportion of 'displace' in all clean_questions
    total_prop=total/jeopardy_tot_rows
    exp_cnt_high=total_prop*high_value_count
    exp_cnt_low=total_prop*low_value_count
    
    observed=np.array([item[0],item[1]])
    expected=np.array([exp_cnt_high,exp_cnt_low])
    chisqr, pval=chisquare(observed,expected)
    chi_squared.append(chisquare(observed,expected))
    print('word chisqr: ',round(chisqr,3),' ','p-value',round(pval,3))

word chisqr:  0.445   p-value 0.505
word chisqr:  0.402   p-value 0.526
word chisqr:  0.402   p-value 0.526
word chisqr:  2.488   p-value 0.115
word chisqr:  0.402   p-value 0.526


In [26]:
#The p-values are greater than .05 so there no significance.
#No word had a significant usage difference between high value
#and low value questions.