In [51]:
import json
import pandas as pd
import numpy as np
import missingno
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [52]:
data_df = pd.read_csv('JEOPARDY_CSV.csv')

In [53]:
data_df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [54]:
data_df = data_df[data_df[' Value']!='None']

In [55]:
data_df.isnull().sum()

Show Number    0
 Air Date      0
 Round         0
 Category      0
 Value         0
 Question      0
 Answer        2
dtype: int64

In [56]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213296 entries, 0 to 216928
Data columns (total 7 columns):
Show Number    213296 non-null int64
 Air Date      213296 non-null object
 Round         213296 non-null object
 Category      213296 non-null object
 Value         213296 non-null object
 Question      213296 non-null object
 Answer        213294 non-null object
dtypes: int64(1), object(6)
memory usage: 13.0+ MB


In [57]:
data_df['ValueNum'] = data_df[' Value'].apply(
    lambda value: int(value.replace(',', '').replace('$', ''))
)

In [58]:
data_df.shape

(213296, 8)

In [59]:
data_df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,ValueNum
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200


In [60]:
data_df

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,ValueNum
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200
...,...,...,...,...,...,...,...,...
216924,4999,2006-05-11,Double Jeopardy!,OFF-BROADWAY,$2000,In 2006 the cast of this long-running hit emba...,Stomp,2000
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,$2000,This Puccini opera turns on the solution to 3 ...,Turandot,2000
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,In North America this term is properly applied...,a titmouse,2000
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,"In Penny Lane, where this ""Hellraiser"" grew up...",Clive Barker,2000


In [61]:
def binning(value):
    if value < 1000:
        return np.round(value,-2)
    elif value < 10000:
        return np.round(value,-3)
    else:
        return np.round(value,-4)
    
data_df['ValueBins'] = data_df['ValueNum'].apply(binning)

In [62]:
data_df.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,ValueNum,ValueBins
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,200,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,200,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200,200


In [63]:
show_numbers = data_df['Show Number'].unique()
train_shows,test_shows = train_test_split(show_numbers,test_size=0.2,random_state=2019)

train_mask = data_df['Show Number'].isin(train_shows)
test_mask = data_df['Show Number'].isin(test_shows)

train_labels = data_df.loc[train_mask,'ValueBins']
train_questions = data_df.loc[train_mask,' Question']
test_labels = data_df.loc[test_mask,'ValueBins']
test_questions = data_df.loc[test_mask,' Question']

In [64]:
%%time
bow = CountVectorizer(stop_words='english',max_features=2000)
bow.fit(data_df[' Question'])

Wall time: 7.42 s


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=2000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [65]:
X_train = bow.transform(train_questions)
X_test = bow.transform(test_questions)

y_train = train_labels
y_test = test_labels

print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of y_train:',y_train.shape)
print('Shape of y_test:',y_test.shape)

Shape of X_train: (170704, 2000)
Shape of X_test: (42592, 2000)
Shape of y_train: (170704,)
Shape of y_test: (42592,)


In [66]:
%%time
lr = LogisticRegression(solver='saga',multi_class='multinomial',max_iter=200)
lr.fit(X_train,y_train)

Wall time: 48.5 s
Parser   : 151 ms




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [67]:
y_pred = lr.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
         100       0.05      0.00      0.01      1863
         200       0.18      0.14      0.16      6132
         300       0.06      0.00      0.01      1801
         400       0.21      0.57      0.30      8425
         500       0.10      0.01      0.02      1827
         600       0.11      0.01      0.02      4099
         700       0.00      0.00      0.00        41
         800       0.15      0.10      0.12      6279
         900       0.00      0.00      0.00        28
        1000       0.19      0.20      0.20      6720
        2000       0.19      0.10      0.13      4938
        3000       0.00      0.00      0.00       198
        4000       0.00      0.00      0.00       121
        5000       0.00      0.00      0.00        61
        6000       0.00      0.00      0.00        21
        7000       0.00      0.00      0.00         9
        8000       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [69]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(data_df[' Question'])

train_sequence = tokenizer.texts_to_sequences(train_questions)
test_sequence = tokenizer.texts_to_sequences(test_questions)

print('Question text:',train_sequence[0])
print('Converted sequence:',train_sequence[0])

Question text: [7, 1, 112, 272, 102, 4, 14, 189, 7842, 9, 226, 173, 5422, 7, 41554, 2, 571, 1552]
Converted sequence: [7, 1, 112, 272, 102, 4, 14, 189, 7842, 9, 226, 173, 5422, 7, 41554, 2, 571, 1552]


In [70]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [71]:
X_train = pad_sequences(train_sequence,maxlen=50)
X_test = pad_sequences(test_sequence,maxlen=50)

print(X_train.shape)
print(X_test.shape)

(170704, 50)
(42592, 50)


In [72]:
from sklearn.preprocessing import LabelEncoder

In [73]:
le = LabelEncoder()
le.fit(data_df['ValueBins'])

y_train = le.transform(train_labels)
y_test = le.transform(test_labels)

print(y_train.shape)
print(y_test.shape)

(170704,)
(42592,)


In [74]:
num_words = tokenizer.num_words
output_size = len(le.classes_)

In [75]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalMaxPooling1D, LSTM, Bidirectional, Embedding, Dropout

In [76]:
model = Sequential([
        Embedding(input_dim=num_words,
            output_dim=200,
            mask_zero = True,
            input_length=50),
        Bidirectional(LSTM(150,return_sequences=True)),
        GlobalMaxPooling1D(),
        Dense(300,activation='relu'),
        Dropout(0.5),
        Dense(output_size,activation='softmax')
])

model.compile('adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 200)           10000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 300)           421200    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 300)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 300)               90300     
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 21)                6321      
Total params: 10,517,821
Trainable params: 10,517,821
Non-trainable params: 0
__________________________________________

In [77]:
model.fit(X_train,y_train,epochs=10,batch_size=1024,validation_split=0.1)

Train on 153633 samples, validate on 17071 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2885b2f9188>

In [78]:
y_pred = model.predict(X_test,batch_size=1024).argmax(axis=1)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.07      0.04      0.05      1863
           2       0.18      0.19      0.18      6132
           3       0.05      0.03      0.04      1801
           4       0.21      0.19      0.20      8425
           5       0.06      0.06      0.06      1827
           6       0.11      0.12      0.11      4099
           7       0.00      0.00      0.00        41
           8       0.15      0.14      0.14      6279
           9       0.00      0.00      0.00        28
          10       0.18      0.25      0.21      6720
          11       0.17      0.17      0.17      4938
          12       0.02      0.01      0.01       198
          13       0.00      0.00      0.00       121
          14       0.00      0.00      0.00        61
          15       0.00      0.00      0.00        21
          16       0.00      0.00      0.00         9
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
