## Analyzing and Cleaning Jeopardy! Dataset

In [2]:
# Opens and reads csv file
import pandas as pd
import random
pd.set_option('display.max_colwidth', None)

df = pd.read_csv("jeopardy.csv")
df1 = df.rename(columns={"Show Number":"show_number", " Air Date":"air_date", " Round":"round", " Category":"category", " Value": "value", " Question":"question", " Answer":"answer"})
print(df1.head())

   show_number    air_date      round                         category value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY  $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE  $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200   

                                                                                                      question  \
0             For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory   
1  No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves   
2                     The city of Yuma in this state has a record average of 4,055 hours of sunshine each year   
3                         In 1963, live on "The Art Linkletter 

In [3]:
# Convert value column from str to float
df1["value"] = df1.value.apply(lambda x: "0" if x == "None" else x.replace(",","")) 
df1["value"] = df1.value.apply(lambda x:float(x.replace("$",""))) 
print(df1.value)

0          200.0
1          200.0
2          200.0
3          200.0
4          200.0
           ...  
216925    2000.0
216926    2000.0
216927    2000.0
216928    2000.0
216929       0.0
Name: value, Length: 216930, dtype: float64


In [4]:
# Creates a dataframe with the questions that contain the words in input_list.
input_list = ["king"]
boolean = df1.question.apply(lambda x: all(" " + word.lower() + " " in x.lower() for word in input_list))
filtered_df = df1[boolean]
print(filtered_df)


        show_number    air_date             round                    category  \
40             4680  2004-12-31  Double Jeopardy!  DR. SEUSS AT THE MULTIPLEX   
781            4335  2003-06-06         Jeopardy!                   MY PLACE?   
811            4335  2003-06-06  Double Jeopardy!                 "S"-OTERICA   
896            3834  2001-04-12         Jeopardy!          AIN'T THAT AMERICA   
1074           4085  2002-05-10         Jeopardy!                CENTRAL PARK   
...             ...         ...               ...                         ...   
216210         1251  1990-01-29         Jeopardy!                   THE BIBLE   
216232         1251  1990-01-29         Jeopardy!                THANKS, GUYS   
216445         3644  2000-06-08   Final Jeopardy!                   THE BIBLE   
216752         5070  2006-09-29         Jeopardy!                 SIGNING OFF   
216777         5070  2006-09-29  Double Jeopardy!             ANCIENT HISTORY   

         value  \
40      1

In [5]:
# Finds the average value of questions containing the words in input_list.
average_value = filtered_df.value.mean()
print(average_value)

806.9708846584547


In [6]:
# Number of unique answers to the questions containing the words in input_list.
nunique_ans = filtered_df.answer.nunique()
print(nunique_ans)

1040


In [7]:
# Most common category in whole dataset
most_common_cat = df1.category.mode()
print(most_common_cat)

0    BEFORE & AFTER
dtype: object


In [11]:
categories = df1.category.nunique()
print(categories)

27995


## Jeopardy! Quiz Yourself
 The game will ask questions to which the user must input an answer.

In [132]:
total_value = 0
randomizer = round(random.random()*216929)
random_question = df1.question.iloc[randomizer]
print("The category is {} and the question is worth ${}".format(df1.category.iloc[randomizer],df1.value.iloc[randomizer])) 
print("Question: " + random_question)
user_ans = input("Answer: ")

if user_ans.lower() == df1.answer.iloc[randomizer].lower():
    total_value = total_value + df1.value.iloc[randomizer]
    print("Correct Answer")
    print("Your total value is ${}".format(total_value))
else:
    print("Incorrect Answer")
    print("Your total value is ${}".format(total_value))
    print("The correct answer was {}".format(df1.answer.iloc[randomizer]))
    

The category is SEE "NN" and the question is worth $800.0
Question: Get thee to this synonym for a convent
Answer: na
Incorrect Answer
Your total value is $0
The correct answer was a nunnery
