In [2]:
import pandas as pd
import string
import numpy as np
import scipy.stats

In [3]:
#read the input file
jeopardy = pd.read_csv("jeopardy.csv")

In [4]:
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [7]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [8]:
# Define function to normalise the text (make lower case and remove punctuation)
def text_norm(x):
    lower = x.lower()
    # fromkeys will generate a dict with keys as punctuation and values as space
    # maketrans will create transition values for all
    # translate will replace all the characters with space based on the maketrans table
    n_lower = lower.translate(str.maketrans(dict.fromkeys(string.punctuation,'')))
    return(n_lower)

In [9]:
clean_question = jeopardy['Question'].apply(text_norm)
clean_answer = jeopardy['Answer'].apply(text_norm)

In [10]:
#Define function to normalise values, ie remove the dollor sign and convert to int and handle errors as zero
def value_norm(x):
    # Remove dollar sign
    temp = x.translate(str.maketrans(dict.fromkeys(string.punctuation,'')))
    # Convert to int
    try:
        temp = int(temp)
    except:
        temp = 0
    return(temp)

In [11]:
clean_value = jeopardy['Value'].apply(value_norm)

In [12]:
clean_Air_Date = pd.to_datetime(jeopardy['Air Date'])

In [13]:
#Create columns for clean values
jeopardy.insert(len(jeopardy.columns),'clean_question',clean_question,True)
jeopardy.insert(len(jeopardy.columns),'clean_answer',clean_answer,True)
jeopardy.insert(len(jeopardy.columns),'clean_value',clean_value,True)
jeopardy.insert(len(jeopardy.columns),'clean_Air_Date',clean_Air_Date,True)

In [14]:
#Calculate the % of words in answer that also appear in question
def tokenize(x):
    split_question = x['clean_question'].split(" ") 
    split_answer = x['clean_answer'].split(" ") 
    match_count = 0
    while 'the' in split_answer:
        split_answer.remove('the')
        
    if len(split_answer) == 0:
        return(0)
    else:
        for i in split_answer:
             if i in split_question:
                    match_count = match_count+1 
    
    return(match_count/len(split_answer))

In [15]:
answer_in_question = jeopardy.apply(tokenize,axis=1)
jeopardy.insert(len(jeopardy.columns),'answer_in_question',answer_in_question,True)

In [16]:
answer_in_question_mean = answer_in_question.mean()
answer_in_question_mean

0.05973712438535679

### So, we might say that, from the dataset, on an average there is a 5.9 percent chance that we can find the answer in the question as well....
### To see how often the questions are repeated, we might check the question for words with more than 6 characters and see how often these are used

In [17]:
#Sort the dataset datewise
jeopardy.sort_values(['clean_Air_Date'],axis = 0,ascending = True, inplace = True)

In [135]:
question_overlap = []
terms_used = set()
#Get each row in jeopardy
for index,row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    terms = [i for i in split_question if len(i)>5]
    match_count = 0
    #Count the number of terms that are not already available in terms_used
    for i in terms:
        if i in terms_used:
            match_count = match_count+1
    #Add all the terms in the new question to the terms_used
    #It is very important that you count the words per question. 
    #So add all the words to the terms_used after counting
    for i in terms:
        terms_used.add(i)
    #If there are cases with zero length, the number of new words added to the list should be zero
    if len(terms) > 0:
        temp = match_count/len(terms)
    else:
        temp = match_count = 0
    #Append the value for every question   
    question_overlap.append(temp)


jeopardy['question_overlap'] = question_overlap  
question_overlap_mean = jeopardy['question_overlap'].mean()
print('question_overlap_mean:',question_overlap_mean)

question_overlap_mean: 0.687124288096678


### From the question_overlap_mean, we can assume that 68.7% of the words  in every new question added are already available in the list of words used. 
### This might mean that the questions are repeated quite a lot. But we are not very sure, since we are only checking for individual words and not prases...

### Lets not try and identify some high value questions with the help of Chi Sqared Test

In [136]:
def value(x):
    value = 0 if x['clean_value']<800 else 1
    return (value)

In [137]:
high_value = jeopardy.apply(value,axis = 1)
high_value.size,len(jeopardy)

(19999, 19999)

In [138]:
jeopardy['high_value'] = high_value

In [139]:
# Function to count the high and low value questions for each word in the terms_used
def que_count(x):
    low_count = 0
    high_count = 0
    for index,row in jeopardy.iterrows():
        split_question = row['clean_question'].split(" ")
        if (x in split_question) and row['high_value'] == 1 :
            high_count = high_count+1
        if (x in split_question) and row['high_value'] == 0 :
            low_count = low_count+1
    return (high_count,low_count)

In [140]:
#picking 10 random terms from terms_used
terms_list = list (terms_used)
comparison_terms = []
for i in range(10):
    n = np.random.randint(0,len(terms_list))
    comparison_terms.append(terms_list[n])

In [141]:
comparison_terms

['vivien',
 'couple',
 'turgenevs',
 'stempel',
 'mutineers',
 'berries',
 'cambrai',
 'albondigas',
 'savonius',
 'nursery']

In [143]:
observed_expected = []
for i in comparison_terms:
    h,l = que_count(i)
    observed_expected.append([h,l])

In [161]:
#Observe the frequencies of occurances of words
observed_expected

[[0, 1],
 [5, 9],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 4],
 [0, 1],
 [1, 0],
 [0, 1],
 [2, 7]]

In [150]:
# Get the total percentages count the high value and low value questions
count = pd.crosstab(jeopardy['high_value'],columns = ['high_value'])
count

col_0,high_value
high_value,Unnamed: 1_level_1
0,11285
1,8714


In [152]:
low_value_count,high_value_count

(11285, 8714)

In [145]:
low_value_count = count['high_value'][0]
high_value_count = count['high_value'][1]

In [158]:
chi_squared = []

for i in observed_expected:
    #Get the total number of questions in which the term appears
    total = sum(i)
    #Get the % of questions containing the term w.r.t the total dataset
    total_prop = total/len(jeopardy)
    #Get the expected count of a particular term can be got by multiplying the propotion with the total count of high and low values questions respectively
    high_value_exp = total_prop* high_value_count
    low_value_exp = total_prop * low_value_count
    expected = [high_value_exp,low_value_exp]
    #use scipy.stats.chisqare to calculate the observed/expected values
    chi_squared.append(scipy.stats.chisquare(i,expected))

In [159]:
chi_squared

[Power_divergenceResult(statistic=0.7721754541426672, pvalue=0.3795448984353682),
 Power_divergenceResult(statistic=0.35159094969782123, pvalue=0.5532139107905136),
 Power_divergenceResult(statistic=1.295042460408538, pvalue=0.25512076479610835),
 Power_divergenceResult(statistic=1.295042460408538, pvalue=0.25512076479610835),
 Power_divergenceResult(statistic=1.295042460408538, pvalue=0.25512076479610835),
 Power_divergenceResult(statistic=3.088701816570669, pvalue=0.07883768176626049),
 Power_divergenceResult(statistic=0.7721754541426672, pvalue=0.3795448984353682),
 Power_divergenceResult(statistic=1.295042460408538, pvalue=0.25512076479610835),
 Power_divergenceResult(statistic=0.7721754541426672, pvalue=0.3795448984353682),
 Power_divergenceResult(statistic=1.6685296771805387, pvalue=0.1964555878559714)]

### This shows that the mean of sqared differences between the observed and expected frequencies of these terms are all very small. So ideally, if the words were a deciding factor in the value of the question, there would be significant differences in the observed and expected values, which is not the case here.