In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import os
import re
import pandas as pd
from string import punctuation

qa = pd.read_json('jeopardy.json')

english_stopwords = set(stopwords.words('english') + list(punctuation))

lemmatizer = WordNetLemmatizer()


In [2]:
qa

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680
...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999


In [3]:
qa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     216930 non-null  object
 1   air_date     216930 non-null  object
 2   question     216930 non-null  object
 3   value        213296 non-null  object
 4   answer       216930 non-null  object
 5   round        216930 non-null  object
 6   show_number  216930 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


Getting an idea of what the value column looks like.

In [4]:
qa['value'].value_counts()

value
$400       42244
$800       31860
$200       30455
$600       20377
$1000      19539
           ...  
$5,401         1
$1,183         1
$1,203         1
$11,600        1
$11,200        1
Name: count, Length: 149, dtype: int64

In [5]:
qa

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680
...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999


Changing value column datatype to string

In [6]:
qa['value'] = qa['value'].astype(str)

Adding column "value indicator" where 1 = high value and 0 = low value. Everything less than or equal to $600 is considered low value. Everything over $600 is high value.

In [7]:
qa['value indicator'] = qa['value'].apply(lambda x: 0 if x == '$200' or x == '$400' or x == 'None' or x == '$600' else 1)


Checking new column

In [8]:
qa

Unnamed: 0,category,air_date,question,value,answer,round,show_number,value indicator
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680,0
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680,0
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680,0
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680,0
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680,0
...,...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999,1
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999,1
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999,1
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999,1


Cleaning and lemmatizing the text in "question" column. 

In [9]:
def clean_qs(question):
    question = question.lower()
    toke_q = word_tokenize(question)
    
    wordlist = []
    for word in toke_q:
        if word not in english_stopwords:
            wordlist.append(word)
    
    wordlist2 = []
    for eachword in wordlist:
        wordlist2.append(lemmatizer.lemmatize(eachword))
    question = [' '.join(wordlist2)]
    question = str(question)
    
    return question

In [10]:
qa['question'] = qa['question'].apply(clean_qs)

In [11]:
qa

Unnamed: 0,category,air_date,question,value,answer,round,show_number,value indicator
0,HISTORY,2004-12-31,"[""'for last 8 year life galileo house arrest e...",$200,Copernicus,Jeopardy!,4680,0
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,"[""'no 2 1912 olympian football star carlisle i...",$200,Jim Thorpe,Jeopardy!,4680,0
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,"[""'the city yuma state record average 4,055 ho...",$200,Arizona,Jeopardy!,4680,0
3,THE COMPANY LINE,2004-12-31,"[""'in 1963 live `` art linkletter show '' comp...",$200,McDonald\'s,Jeopardy!,4680,0
4,EPITAPHS & TRIBUTES,2004-12-31,"[""'signer dec. indep. framer constitution mass...",$200,John Adams,Jeopardy!,4680,0
...,...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,"[""'this puccini opera turn solution 3 riddle p...",$2000,Turandot,Double Jeopardy!,4999,1
216926,"""T"" BIRDS",2006-05-11,"[""'in north america term properly applied 4 sp...",$2000,a titmouse,Double Jeopardy!,4999,1
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"[""'in penny lane `` hellraiser '' grew barber ...",$2000,Clive Barker,Double Jeopardy!,4999,1
216928,QUOTATIONS,2006-05-11,"[""'from ft. sill okla. made plea arizona land ...",$2000,Geronimo,Double Jeopardy!,4999,1


Creating train and test sets for use with naive bayes.

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(qa.question, qa['value indicator'], random_state=1)

In [13]:
tfidf_vectorizer = TfidfVectorizer(use_idf = True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)


In [14]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tf, Y_train)
predictions = naive_bayes.predict(X_test_tf)


Looking at accuracy of the prediction of value based on contents of the quesion.

In [15]:
print('Accuracy: ', accuracy_score(Y_test, predictions))

Accuracy:  0.5525418103368798
