In [1]:


import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv


In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalMaxPooling1D, LSTM, Bidirectional, Embedding, Dropout

In [4]:
df = pd.read_csv("/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv")
df = df[df[' Value'] != 'None']

In [5]:
display(df.head(3))
display(df.tail(3))

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,In North America this term is properly applied...,a titmouse
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,"In Penny Lane, where this ""Hellraiser"" grew up...",Clive Barker
216928,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,$2000,"From Ft. Sill, Okla. he made the plea, Arizona...",Geronimo


In [6]:
print("Dataset has " + str(df.shape[0]) + " rows \nAnd " + str(df.shape[1]) + " columns")
print("\nFeatures :\n", df.columns.tolist())
print("\nMissing Values :", df.isnull().values.sum())

Dataset has 213296 rows 
And 7 columns

Features :
 ['Show Number', ' Air Date', ' Round', ' Category', ' Value', ' Question', ' Answer']

Missing Values : 2


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213296 entries, 0 to 216928
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  213296 non-null  int64 
 1    Air Date    213296 non-null  object
 2    Round       213296 non-null  object
 3    Category    213296 non-null  object
 4    Value       213296 non-null  object
 5    Question    213296 non-null  object
 6    Answer      213294 non-null  object
dtypes: int64(1), object(6)
memory usage: 13.0+ MB


In [8]:
df.isnull().sum()

Show Number    0
 Air Date      0
 Round         0
 Category      0
 Value         0
 Question      0
 Answer        2
dtype: int64

## Answer

In [9]:
df[' Answer'].value_counts().to_frame()

Unnamed: 0,Answer
China,212
Australia,211
Japan,193
France,191
Chicago,189
...,...
Anna Livia Plurabelle,1
Soho,1
the Epstein-Barr virus,1
the Second Vatican Council (Vatican II),1


In [10]:
df[' Answer'] = df[' Answer'].fillna(df[' Answer'].mode()[0])

## Air Date

In [11]:
df[' Air Date'] = pd.to_datetime(df[' Air Date'])

## Round

In [12]:
df[' Round'].value_counts().to_frame()

Unnamed: 0,Round
Jeopardy!,107384
Double Jeopardy!,105912


## Category

In [13]:
df[' Category'].value_counts().to_frame()

Unnamed: 0,Category
BEFORE & AFTER,547
SCIENCE,513
LITERATURE,486
AMERICAN HISTORY,401
POTPOURRI,401
...,...
GOING TO THE LOO-VRE,1
9-LETTER POTPOURRI,1
I'M MORE THAN JUST A SECRETARY,1
19th CENTURY LIFE,1


## Answer

In [14]:
df[' Answer'].value_counts().to_frame()

Unnamed: 0,Answer
China,214
Australia,211
Japan,193
France,191
Chicago,189
...,...
Anna Livia Plurabelle,1
Soho,1
the Epstein-Barr virus,1
the Second Vatican Council (Vatican II),1


## Value

In [15]:
df['ValueNum'] = df[' Value'].apply(
    lambda value: int(value.replace(',', '').replace('$', ''))
)

In [16]:
def binning(value):
    if value < 1000:
        return np.round(value, -2)
    elif value < 10000:
        return np.round(value, -3)
    else:
        return np.round(value, -4)

df['ValueBins'] = df['ValueNum'].apply(binning)

In [17]:
print("Total number of categories:", df[' Value'].unique().shape[0])
print("Number of categories after binning:", df['ValueBins'].unique().shape[0])
print("\nBinned Categories:", df['ValueBins'].unique())

Total number of categories: 149
Number of categories after binning: 21

Binned Categories: [  200   400   600   800  2000  1000  3000  5000   100   300   500  4000
  7000   700  8000  6000 10000   900  9000     0 20000]


In [18]:
show_numbers = df['Show Number'].unique()
train_shows, test_shows = train_test_split(show_numbers, test_size=0.2, random_state=2019)

train_mask = df['Show Number'].isin(train_shows)
test_mask = df['Show Number'].isin(test_shows)

train_labels = df.loc[train_mask, 'ValueBins']
train_questions = df.loc[train_mask, ' Question']
test_labels = df.loc[test_mask, 'ValueBins']
test_questions = df.loc[test_mask, ' Question']

## Simple Linear Model

In [19]:
%%time
bow = CountVectorizer(stop_words='english', max_features=2000)
bow.fit(df[' Question'])

CPU times: user 4.93 s, sys: 70.2 ms, total: 5 s
Wall time: 5.01 s


CountVectorizer(max_features=2000, stop_words='english')

In [20]:
X_train = bow.transform(train_questions)
X_test = bow.transform(test_questions)

y_train = train_labels
y_test = test_labels

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (170704, 2000)
Shape of X_test: (42592, 2000)
Shape of y_train: (170704,)
Shape of y_test: (42592,)


###  Logistic Regression

In [21]:
%%time
lr = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=200)
lr.fit(X_train, y_train)

CPU times: user 1min 15s, sys: 59.3 ms, total: 1min 15s
Wall time: 1min 15s


LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga')

In [22]:
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
         100       0.05      0.00      0.01      1863
         200       0.18      0.14      0.15      6132
         300       0.06      0.00      0.01      1801
         400       0.21      0.57      0.30      8425
         500       0.10      0.01      0.02      1827
         600       0.11      0.01      0.02      4099
         700       0.00      0.00      0.00        41
         800       0.15      0.10      0.12      6279
         900       0.00      0.00      0.00        28
        1000       0.19      0.20      0.20      6720
        2000       0.19      0.10      0.13      4938
        3000       0.00      0.00      0.00       198
        4000       0.00      0.00      0.00       121
        5000       0.00      0.00      0.00        61
        6000       0.00      0.00      0.00        21
        7000       0.00      0.00      0.00         9
        8000       0.00    

## Tokenization

In [23]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(df[' Question'])

train_sequence = tokenizer.texts_to_sequences(train_questions)
test_sequence = tokenizer.texts_to_sequences(test_questions)

print("Original text:", train_questions[0])
print("Converted sequence:", train_sequence[0])

Original text: For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory
Converted sequence: [7, 1, 112, 272, 102, 4, 14, 189, 7842, 9, 226, 173, 5422, 7, 41554, 2, 571, 1552]


In [24]:
X_train = pad_sequences(train_sequence, maxlen=50)
X_test = pad_sequences(test_sequence, maxlen=50)

print(X_train.shape)
print(X_test.shape)

(170704, 50)
(42592, 50)


In [25]:
le = LabelEncoder()
le.fit(df['ValueBins'])

y_train = le.transform(train_labels)
y_test = le.transform(test_labels)

print(y_train.shape)
print(y_test.shape)

(170704,)
(42592,)


In [26]:
num_words = tokenizer.num_words
output_size = len(le.classes_)

In [27]:
model = Sequential([
    Embedding(input_dim=num_words, 
              output_dim=200, 
              mask_zero=True, 
              input_length=50),
    Bidirectional(LSTM(150, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(300, activation='relu'),
    Dropout(0.5),
    Dense(output_size, activation='softmax')
    
])

model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 200)           10000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 300)           421200    
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               90300     
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 21)                6321      
Total params: 10,517,821
Trainable params: 10,517,821
Non-trainable params: 0
____________________________________________

In [28]:
model.fit(X_train, y_train, epochs=10, batch_size=1024, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f32866a7990>

In [29]:
y_pred = model.predict(X_test, batch_size=1024).argmax(axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.07      0.05      0.06      1863
           2       0.18      0.18      0.18      6132
           3       0.05      0.03      0.03      1801
           4       0.21      0.23      0.22      8425
           5       0.06      0.05      0.05      1827
           6       0.11      0.11      0.11      4099
           7       0.00      0.00      0.00        41
           8       0.16      0.19      0.17      6279
           9       0.00      0.00      0.00        28
          10       0.19      0.18      0.19      6720
          11       0.17      0.18      0.17      4938
          12       0.01      0.01      0.01       198
          13       0.00      0.00      0.00       121
          14       0.00      0.00      0.00        61
          15       0.00      0.00      0.00        21
          16       0.00      0.00      0.00         9
          17       0.00    