# Cleaning the raw dataset

In this notebook, I take a quick look at the raw Jeopardy! clue data, do some basic cleaning, engineer a few simple features, and export the cleaned result.

## Importing libraries

In [1]:
import pandas as pd

## Loading the data

In [2]:
# The source file is tab-separated (.tsv), so read_csv must be told to split
# on tabs instead of its default comma.
df = pd.read_csv(
    '../data/master_season1-35.tsv',
    sep='\t',
)

In [3]:
# Preview the first few rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes
0,1,100,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-
1,1,200,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-
2,1,400,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-
3,1,500,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-
4,1,100,no,INVENTIONS,-,Marconi's wonderful wireless,the radio,1984-09-10,-


In [4]:
# (rows, columns) of the raw frame, recorded before any rows are dropped below.
df.shape

(349641, 9)

## Cleaning

In [5]:
# The file's "answer" column holds the clue text and its "question" column
# holds the contestant's response, so give both columns descriptive names.
df.rename(
    columns={'answer': 'clue', 'question': 'correct_response'},
    inplace=True,
)

In [6]:
# Round 3 is Final Jeopardy, which is not needed here, so discard those rows.
in_final_round = df['round'] == 3
df.drop(index=df.loc[in_final_round].index, inplace=True)

In [7]:
# Scoring changed on November 26, 2001
# (source: https://en.wikipedia.org/wiki/Jeopardy!), so clues from games
# played under the earlier system are removed.
# air_date is loaded as plain 'YYYY-MM-DD' text (no date parsing is requested
# when the file is read), so this string comparison orders the same way the
# dates do.
new_system_start = '2001-11-26'
aired_before_change = df['air_date'] < new_system_start
df.drop(index=df[aired_before_change].index, inplace=True)

In [8]:
# Any note other than the '-' placeholder marks a special game, which may
# have been played under alternate rules; remove those clues. Once they are
# gone every remaining note is the placeholder, so the notes column is
# dropped in the same call.
has_note = df['notes'] != '-'
df.drop(index=df.loc[has_note].index, columns=['notes'], inplace=True)

## Feature engineering

In [9]:
# Encode daily_double as an integer flag: 1 for the literal 'yes', 0 for
# anything else.
df['is_daily_double'] = [
    1 if answer == 'yes' else 0
    for answer in df['daily_double']
]

# The original text column is now redundant.
df.drop(columns=['daily_double'], inplace=True)

In [10]:
# Alex Trebek commented on some categories; the comments column holds the
# '-' placeholder when he did not.
# Flag rows whose category came with a comment (1) versus none (0).
df['host_commented'] = df['comments'].map(
    lambda remark: 1 if remark != '-' else 0
)

# Only the flag is kept; the comment text itself is discarded.
df.drop(columns=['comments'], inplace=True)

In [11]:
# Flag clues whose dollar value is not one of the regular Jeopardy clue values.
# NOTE(review): the check ignores the `round` column, so a value counts as
# regular if it appears anywhere in this list, whichever round the clue is from.
regular_values = [200, 400, 600, 800, 1000, 1200, 1600, 2000]
df['is_irregular_value'] = df['value'].map(
    lambda amount: int(amount not in regular_values)
)

In [12]:
# Create feature for if a category is common, i.e. its name appears on more
# than 20 rows of the cleaned data.
# value_counts tallies every category in a single pass over the column;
# calling list.count() once per distinct category rescans the whole column
# each time, which is quadratic on a dataset this size.
# dropna=False keeps a missing category counted like any other value.
category_counts = df['category'].value_counts(dropna=False)
common_categories = set(category_counts[category_counts > 20].index)

# isin does a hashed membership test per row; the boolean result is stored
# as 1/0 integers to match the other flag columns.
df['is_common_category'] = df['category'].isin(common_categories).astype('int64')

## Export clean dataframe

In [15]:
# Write the cleaned frame to CSV. index=False leaves the row index out of the
# file; after the row drops above it is no longer contiguous.
df.to_csv('../data/clean.csv', index=False)