# Project: Winning Jeopardy

Dataset of Jeopardy questions to figure out some patterns in the questions

In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')

Start cleaning the data

In [2]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [3]:
jeopardy.columns.str.strip()

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [4]:
jeopardy.columns = jeopardy.columns.str.strip()

In [5]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [6]:
jeopardy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel


In [7]:
import re

def texts(item):
    item = item.lower()
    item = re.sub("[^A-Za-z0-9\s]", "", item)
    return(item)

def values(item):

    item = re.sub("[^A-Za-z0-9\s]", "", item)
    if item == 'None':
        item = 0
    item = int(item)
    return(item)

In [8]:
jeopardy['clean_question'] = jeopardy['Question'].apply(texts)

In [9]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(texts)

In [10]:
jeopardy["clean_value"] = jeopardy["Value"].apply(values)

In [11]:
jeopardy["clean_value"].unique()


array([  200,   400,   600,   800,  2000,  1000,  1200,  1600,  3200,
           0,  5000,   100,   300,   500,  1500,  4800,  1800,  1100,
        2200,  3400,  3000,  4000,  6800,  1900,  3100,   700,  1400,
        2800,  8000,  6000,  2400, 12000,  3800,  2500,  6200, 10000,
        7000,  1492,  7400,  1300,  7200,  2600,  3300,  5400,  4500,
        2100,   900,  3600,  2127,   367,  4400,  3500,  2900,  3900,
        4100,  4600, 10800,  2300,  5600,  1111,  8200,  5800,   750,
        7500,  1700,  9000,  6100,  1020,  4700,  2021,  5200,  3389])

In [12]:
def dates(item):
    # 2004-12-31
    item = pd.to_datetime(item)
    return(item)
jeopardy["Air Date"] = jeopardy["Air Date"].apply(dates)


Data is clean now

In [13]:
def answers(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return(0)
    for word in split_answer:
        if word in split_question:
            match_count +=1
    return (match_count / len(split_answer))
            
jeopardy["answer_in_question"] = jeopardy.apply(answers, axis=1)


In [14]:
jeopardy["answer_in_question"].mean()

0.06049325706933587

In [15]:

jeopardy.sort_values(by="Air Date", ascending=True, inplace=True)

In [16]:
question_overlap = []
terms_used = set()

for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [elemen for elemen in split_question if len(elemen) > 5]
        match_count = 0
        for word in split_question:
            if word in terms_used:
                match_count += 1
                
        for word in split_question:
            terms_used.add(word)
            
        if len(split_question) > 0:
            match_count /= len(split_question)
        question_overlap.append(match_count)

In [17]:
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.6876260592169802

In [18]:
jeopardy["question_overlap"].describe()

count    19999.000000
mean         0.687626
std          0.299117
min          0.000000
25%          0.500000
50%          0.750000
75%          1.000000
max          1.000000
Name: question_overlap, dtype: float64

Question overlap is about 70%

In [19]:

jeopardy[jeopardy['question_overlap']==1]

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap
18000,423,1986-04-23,Double Jeopardy!,BIRDS,$200,"According to Aesop, greed killed the goose tha...",laid the golden egg,according to aesop greed killed the goose that...,laid the golden egg,200,0.000000,1.0
17994,423,1986-04-23,Jeopardy!,THE KENNEDYS,$500,"Rose’s maiden name, it became the middle name ...",Fitzgerald,roses maiden name it became the middle name of...,fitzgerald,500,0.000000,1.0
17575,731,1987-11-09,Jeopardy!,UTAH,$500,"It's the world's largest natural stone arch, t...",Rainbow Bridge,its the worlds largest natural stone arch thou...,rainbow bridge,500,0.000000,1.0
7025,732,1987-11-10,Double Jeopardy!,"STARTS WITH ""P""",$200,Movie that featured the following music,Picnic,movie that featured the following music,picnic,200,0.000000,1.0
7024,732,1987-11-10,Double Jeopardy!,PRESIDENTS,$400,All elected Presidents who are members of this...,Whig,all elected presidents who are members of this...,whig,400,0.000000,1.0
6367,776,1988-01-11,Jeopardy!,THIS IS JEOPARDY!,$400,"In 1984, he made the music video ""I Lost On Je...","""Weird Al"" Yankovic",in 1984 he made the music video i lost on jeop...,weird al yankovic,400,0.000000,1.0
6914,958,1988-11-02,Double Jeopardy!,U.S. STATES,$800,The last major land battle of the Revolutionar...,Virginia,the last major land battle of the revolutionar...,virginia,800,0.000000,1.0
6916,958,1988-11-02,Double Jeopardy!,TECHNOLOGY,$800,"The ""D"" in radar stands for this",detection,the d in radar stands for this,detection,800,0.000000,1.0
6886,958,1988-11-02,Jeopardy!,"""FOR"" WORDS",$400,Usually it's the last thing you're served in a...,a fortune cookie,usually its the last thing youre served in a c...,a fortune cookie,400,0.333333,1.0
6889,958,1988-11-02,Jeopardy!,TELEVISION,$400,"The pilot of this show, set in North Carolina,...",The Andy Griffith Show,the pilot of this show set in north carolina p...,the andy griffith show,400,0.333333,1.0


In [20]:
jeopardy["clean_value"].describe()

count    19999.000000
mean       748.336267
std        653.988299
min          0.000000
25%        400.000000
50%        600.000000
75%       1000.000000
max      12000.000000
Name: clean_value, dtype: float64

In [21]:
terms_used

{'fertig',
 'grandcamp',
 'sundays',
 'portugal',
 'soupedup',
 'hrefhttpwwwjarchivecommedia20091014j08wmvkelly',
 'firecracker',
 'kismets',
 'rebecca',
 'centurys',
 'taller',
 'prizewinner',
 'costome',
 'shilling',
 'producers',
 'armistice',
 'hrefhttpwwwjarchivecommedia20040628j11ajpg',
 'speaks',
 'starved',
 'allegiance',
 'honest',
 'manhattanmanhattan',
 'downcourt',
 'meegeren',
 'glimpse',
 'experienced',
 'pavlova',
 'fabled',
 'bugeaters',
 'radioactive',
 'ferroalloy',
 'tombaugh',
 'afraid',
 'artist',
 'roosevelts',
 'boltoni',
 'sovietmade',
 'augustus',
 'readily',
 'parris',
 'strands',
 'bather',
 'booths',
 'lithophone',
 'daddyo',
 '52foot',
 'ganges',
 'directing',
 'excavation',
 'molvania',
 'khadijah',
 'pignford',
 'austriaa',
 'switzer',
 'savoycarignano',
 'hotshot',
 'ranelagh',
 'wubbulous',
 'practicing',
 'fantastique',
 'baltic',
 'ideology',
 'corniche',
 'moppet',
 'absorbed',
 'willis',
 'spelled',
 'pollenated',
 'bitesized',
 'jangle',
 'hrefhttp

In [22]:
def values(item):
    if item > 800:
        return(1)
    else:
        return(0)
    
jeopardy["high_value"] = jeopardy["clean_value"].apply(values)


In [23]:
jeopardy["high_value"].describe()

count    19999.000000
mean         0.286714
std          0.452238
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: high_value, dtype: float64

In [28]:


def counts(word):
    low_count = 0
    high_count =0
    for i, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1

    return(high_count, low_count)

In [25]:
comparison_terms = list(terms_used)[:5]

In [26]:
comparison_terms

['fertig', 'grandcamp', 'sundays', 'portugal', 'soupedup']

In [29]:
observed_expected = []
for a in comparison_terms:
    observed_expected.append(counts(a))

In [30]:
observed_expected

[(0, 1), (0, 1), (1, 0), (1, 3), (0, 2)]

compute the expected counts and the chi-squared value.

In [41]:
high_value_count = jeopardy["high_value"].value_counts()[1]
low_value_count = jeopardy["high_value"].value_counts()[0]

In [42]:
jeopardy["high_value"].value_counts()

0    14265
1     5734
Name: high_value, dtype: int64

In [44]:
chi_squared=[]

In [52]:

from scipy.stats import chisquare
import numpy as np
for a in observed_expected:
    high = a[0]
    low = a[1]
    total= high + low
    total_prop = total/len(jeopardy)
    high_value_rows = total_prop * high_value_count
    low_value_rows = total_prop * low_value_count
    
    observed = np.array([high, low])
    expected = np.array([high_value_rows, low_value_rows])
    chi_squared.append(chisquare(observed, expected))

In [53]:
chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.02636443308440769, pvalue=0.871013484688921),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571)]

Very weak p-values not proving so much