In [100]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

%config InlineBackend.figure_format = 'retina';

## Load in the Questions

In [101]:
# Load the Jeopardy! questions JSON file into a DataFrame.
df = pd.read_json('JEOPARDY_QUESTIONS1.json')
# Total rows minus the number of distinct question texts = rows whose text duplicates another row.
f"{df.shape[0] - df['question'].nunique()} questions are repeats."

'798 questions are repeats.'

In [91]:
# Count the distinct category names; the `:,` format spec adds thousands separators.
f"There are {df['category'].nunique():,} unique categories in Jeopardy!"

'There are 27,995 unique categories in Jeopardy!'

In [92]:
# Number of questions in each category (nothing is plotted here, only summarised).
counts = df['category'].value_counts()

# Summary statistics of the per-category question counts.
counts.describe()

count    27995.000000
mean         7.748884
std         17.001310
min          1.000000
25%          5.000000
50%          5.000000
75%          5.000000
max        547.000000
Name: category, dtype: float64

In [93]:
# Peek at one randomly chosen question/answer pair.
# DataFrame.sample replaces the hand-rolled np.random.choice(range(n), 1) row lookup, and the
# fixed random_state makes the displayed pair reproducible on Restart & Run All
# (change the seed to look at a different question).
print(df[['question', 'answer']].sample(n=1, random_state=0).values)

[["'These nodes are glands, & they aren't just in the head & neck; there are more under the arms,too'"
  'lymph nodes']]


## Data Preprocessing

In [107]:
# Count the missing values in each column.
# NOTE(review): if this runs after the missing values have been filled, every count is 0 —
# run it on a fresh kernel before any filling to see the real gaps.
df.isna().sum()

category       0
air_date       0
question       0
value          0
answer         0
round          0
show_number    0
dtype: int64

In [105]:
# Replace every missing value, in every column, with 0.
# NOTE(review): this writes the integer 0 into text columns as well — confirm that a blanket
# fill is intended rather than a per-column one.
df = df.fillna(0)

Convert `value` from a dollar-formatted string (e.g. `$2,000`) to a float. A float is used instead of an int because missing values are stored as NaN.

In [109]:
# Parse dollar strings such as "$2,000" into float32 numbers.
# Entries that are not strings have no .str result and come out as NaN, which is why the
# column is float rather than int.
# The dtype guard keeps the cell re-runnable: .str raises on an already-numeric column.
# Plain column assignment is used because `df.loc[:, 'value'] = ...` writes into the existing
# object-dtype column in place on pandas >= 2.0, leaving the dtype unchanged.
if not pd.api.types.is_numeric_dtype(df['value']):
    # lstrip('$') only removes a dollar sign; slicing off the first character would also
    # eat a digit from any value that lacks the sign.
    value_text = df['value'].str.lstrip('$').str.replace(',', '', regex=False)
    df['value'] = value_text.astype(np.float32)

Convert air_date to datetime and extract year

In [146]:
# Parse air_date into a datetime column and derive the broadcast year from it.
# Plain column assignment replaces the column (and its dtype). `df.loc[:, 'air_date'] = ...`
# sets values in place on pandas >= 2.0, so the column would stay object-dtype and the
# `.dt` accessor on the next line would raise.
df['air_date'] = pd.to_datetime(df['air_date'])
df['year'] = df['air_date'].dt.year

TFIDF

In [65]:
# Text to vectorise: one entry per question.
questions = df['question']

# Vectoriser settings: fold accents to ASCII, drop English stop words, and ignore terms that
# appear in more than 95% of the questions.
# we may need to find better stop words
tfidf_params = dict(
    max_df=0.95,
    stop_words='english',
    strip_accents='ascii',
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and build the question-by-term TF-IDF matrix in one pass.
tfidf_qs = vectorizer.fit_transform(questions)

In [66]:
# (number of questions, number of terms in the learned vocabulary)
tfidf_qs.shape

(216930, 90165)

In [135]:
# First 25 vocabulary terms in alphabetical order.
# Iterating vocabulary_ yields the learned terms; this avoids get_feature_names(), which was
# removed in scikit-learn 1.2, and works on both old and new versions.
sorted(vectorizer.vocabulary_)[:25]

['00',
 '000',
 '0000',
 '0003',
 '0005',
 '000529',
 '000m',
 '000th',
 '001',
 '002',
 '0025',
 '003',
 '004',
 '006',
 '00601',
 '00698',
 '007',
 '00th',
 '01',
 '01000001',
 '011',
 '012',
 '013',
 '014',
 '015']

In [68]:
# Look at the terms that carry a non-zero TF-IDF weight in one individual question.
# Only the first row is densified: calling .toarray() on the whole matrix would allocate
# n_questions x n_terms floats just to read a single row (the dense `np_tfidf` copy is
# therefore no longer created).
first_question_weights = tfidf_qs[0].toarray().ravel()
q1 = np.argwhere(first_question_weights > 0).flatten()

# Invert the term -> column-index vocabulary so column indices can be mapped back to terms.
idx_to_word = {idx: word for word, idx in vectorizer.vocabulary_.items()}
list(map(idx_to_word.get, q1))

['arrest', 'espousing', 'galileo', 'house', 'life', 'man', 'theory', 'years']

In [71]:
# Raw text of the first question, for comparison with the terms extracted from it.
# Selecting the column by label instead of by position 2 keeps this correct if the column
# order ever changes.
df['question'].iloc[0]

"'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'"

In [73]:
# Factorise the TF-IDF matrix into 25 topics with NMF.
# init='random' starts from randomly drawn factors, so random_state is pinned: without it
# the topic numbering (and every topic id stored downstream) changes on each run.
model = NMF(n_components=25, init='random', max_iter=400, random_state=0)
W = model.fit_transform(tfidf_qs)  # document-topic, rows=document, cols=topics
H = model.components_  # topic-term, rows=topics, cols=terms



### Create topic column

In [74]:
# (documents, topics)
W.shape

(216930, 25)

In [75]:
# (topics, terms)
H.shape

(25, 90165)

In [76]:
# Topic weights of the first document (one value per topic).
W[0,:] # first document

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.04512061e-04,
       2.47479757e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.92268296e-04,
       0.00000000e+00, 1.27673479e-05, 3.26884704e-02, 0.00000000e+00,
       6.99097024e-05, 0.00000000e+00, 0.00000000e+00, 5.00778299e-02,
       9.16585117e-04])

In [136]:
# Label each question with its strongest topic: the column index of the largest weight in
# its row of W. ndarray.argmax(axis=1) is the vectorised equivalent of applying np.argmax
# row by row, without a Python-level call per document.
topics = W.argmax(axis=1)
df['topic'] = topics

# 2: Difficulty of Questions Over Time
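The `value` of every question in the 'Double Jeopardy!' round needs to be divided by 2 (that round's dollar values are doubled), so that values are comparable across rounds.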

In [111]:
# Halve the dollar value of every question that belongs to the 'Double Jeopardy!' round.
# Gotcha: this is not idempotent — each re-run of the cell halves those values again.
is_double_round = df['round'].eq('Double Jeopardy!')
df.loc[is_double_round, 'value'] = df.loc[is_double_round, 'value'] / 2

Add a difficulty rank: the dense rank of each question's value within its show.

In [133]:
# Difficulty = dense rank of a question's value among all questions of the same show
# (1.0 for the lowest value in that show, increasing by 1 for each distinct higher value).
# Only the `value` column is ranked; ranking the whole frame and then picking `value`
# needlessly ranks every other column and fails on non-numeric ones in recent pandas.
df['difficulty'] = df.groupby('show_number')['value'].rank(method='dense')

In [147]:
# Preview the first rows, including the derived columns added to df above.
df.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number,difficulty,topic,year
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",200.0,Copernicus,Jeopardy!,4680,1.0,23,2004
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,200.0,Jim Thorpe,Jeopardy!,4680,1.0,13,2004
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,200.0,Arizona,Jeopardy!,4680,1.0,9,2004
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",200.0,McDonald\'s,Jeopardy!,4680,1.0,8,2004
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",200.0,John Adams,Jeopardy!,4680,1.0,8,2004
