In [3]:
import re
import string
import numpy as np
import scipy.stats as stats
import pandas as pd


In [4]:
df = pd.read_csv('jeopardy.csv')
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
columns = []
pattern = re.compile(r'\s+')
for each in df.columns:
    sentence = re.sub(pattern, '', each)
    columns.append(sentence)
df.columns = columns
print(df.columns)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [6]:
def norm_text(col):
    col = col.lower()
    col = re.sub(r'[^\w\s]','',col)
    return col
df["clean_question"] = df["Question"].apply(norm_text) 

In [7]:
df["clean_answer"] = df["Answer"].apply(norm_text)

In [15]:
def norm_int(col):
    col = re.sub(r'[\w\s]',"",col)
    try:
        col = int(col)
    except Exception:
        col = 0
    return col
df["clean_value"] = df["Value"].apply(norm_int)

In [16]:
air_date = df["AirDate"]
pd.to_datetime(air_date).head()

19325   1984-09-21
19301   1984-09-21
19302   1984-09-21
19303   1984-09-21
19304   1984-09-21
Name: AirDate, dtype: datetime64[ns]

In [17]:
def answer_in_ques(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0 
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for each in split_answer:
        if each in split_question:
            match_count += 1
    res = match_count / len(split_answer)
    return res
df["answer_in_question"] = df.apply(answer_in_ques,axis=1)

df["answer_in_question"].mean()

0.060493257069335879

In [18]:
df.sort_values('AirDate', ascending=True, inplace=True)
df.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer,answer_in_question,question_overlap,clean_value
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0.0,0.0,0
19274,10,1984-09-21,Jeopardy!,GEOGRAPHY,$100,Formerly Formosa,Taiwan,formerly formosa,taiwan,0.0,0.0,0
19275,10,1984-09-21,Jeopardy!,DOUBLE TALK,$100,"Not a Hawaiian cow, but a dress worn by Hawaii...",a muumuu,not a hawaiian cow but a dress worn by hawaiia...,a muumuu,0.5,0.5,0
19276,10,1984-09-21,Jeopardy!,"""JACKS"" OF ALL TRADES",$100,He celebrated his 39th birthday 41 times,Jack Benny,he celebrated his 39th birthday 41 times,jack benny,0.0,0.0,0
19277,10,1984-09-21,Jeopardy!,SHIPS,$100,"""Unsinkable"" for most of its maiden voyage in ...",the Titanic,unsinkable for most of its maiden voyage in 1912,the titanic,0.0,0.333333,0


In [19]:
question_overlap = []
terms_used = []
for i, row in df.iterrows():
  split_question = row['clean_question'].split(' ')
  split_question = [q for q in split_question if len(q) > 5]
  match_count = 0
  for q in split_question:
    if q in terms_used:
      match_count += 1
    else: terms_used.append(q)
  if len(split_question) > 0:
    question_overlap.append(match_count / len(split_question))
  else:
    question_overlap.append(0)
df['question_overlap'] = question_overlap
df['question_overlap'].mean()

0.68949473172267706

In [20]:
def value_800(row):
  value = 0
  if row['clean_value'] > 800:
    value = 1
  else:
    value = 0
  return(value)

In [21]:
df['high_value'] = df.apply(lambda row: value_800(row), axis=1)
df['high_value'].head()

19325    0
19274    0
19275    0
19276    0
19277    0
Name: high_value, dtype: int64

In [22]:
def question_repeat(str):
  low_count = 0
  high_count = 0
  for i, row in df.iterrows():
    if str in row['clean_question'].split(' '):
      if row['high_value']:
        high_count += 1
      else:
        low_count += 1
  return(high_count, low_count)

In [23]:
observed_expected = []
comparison_terms = terms_used[:10]
for term in comparison_terms:
  observed_expected.append(question_repeat(term))
observed_expected

[(0, 3),
 (0, 249),
 (0, 5),
 (0, 6),
 (0, 13),
 (0, 3),
 (0, 11),
 (0, 21),
 (0, 24),
 (0, 2)]

In [24]:
high_value_count = len(df[df['high_value'] == 1])
high_value_count

0

In [26]:
low_value_count = len(df[df['high_value'] == 0])


In [28]:
chi_squared = []
for e in observed_expected:
  total = sum(e)
  total_prob = total / df.shape[0]
  expected_high_count = total_prob * high_value_count
  expected_low_count = total_prob * low_value_count
  expected = np.array([expected_high_count, expected_low_count])
  observed = np.array([e[0], e[1]])
  print(observed, expected)
  chi_squared.append(stats.chisquare(e, expected))
#chi_squared

[0 3] [ 0.  3.]
[  0 249] [   0.  249.]
[0 5] [ 0.  5.]
[0 6] [ 0.  6.]
[ 0 13] [  0.  13.]
[0 3] [ 0.  3.]
[ 0 11] [  0.  11.]
[ 0 21] [  0.  21.]
[ 0 24] [  0.  24.]
[0 2] [ 0.  2.]


  terms = (f_obs - f_exp)**2 / f_exp


[Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan),
 Power_divergenceResult(statistic=nan, pvalue=nan)]