In [1]:
import json
import pandas as pd
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.similarities.annoy import AnnoyIndexer

%config InlineBackend.figure_format = 'retina';

In [2]:
# Load the Jeopardy! question archive from the JSON file next to the notebook.
df = pd.read_json("JEOPARDY_QUESTIONS1.json")

In [3]:
# Preview the frame: category, air_date, question, value, answer, round, show_number.
df

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680
...,...,...,...,...,...,...,...
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999


In [8]:
# Distinct category names, lower-cased so differently-cased duplicates collapse.
# sorted() instead of list(): the iteration order of a set of strings changes
# between kernel sessions (hash randomisation), and this list goes on to become
# the training corpus, so a fixed order keeps the pipeline reproducible.
categories = sorted({name.lower() for name in df['category']})

In [5]:
# Number of distinct category names once case differences are ignored.
len(categories)

27916

In [6]:
# Same count on the raw names, i.e. with differently-cased names kept distinct.
len(set(df.category))

27995

In [7]:
# Normalise every category name in a single pass:
#   1. replace anything that is not an ASCII letter with a space,
#   2. collapse runs of whitespace into one space,
#   3. trim leading/trailing whitespace.
categories = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', name)).strip()
    for name in categories
]

In [8]:
# Split each cleaned category name into sentences, then each sentence into
# word tokens; all_words ends up as one token list per sentence.
category = [nltk.sent_tokenize(c) for c in categories]

all_words = [nltk.word_tokenize(c) for cat in category for c in cat]

# Removing Stop Words
# Fetch the stop-word list once and keep it in a set: calling
# stopwords.words('english') inside the filter re-evaluates it for every
# single token and tests membership against a list.
english_stopwords = set(stopwords.words('english'))
all_words = [
    [w for w in words if w not in english_stopwords]
    for words in all_words
]

In [7]:
# all_words

In [10]:
# Train word embeddings on the tokenised category names.
# min_count=2 keeps tokens that occur only once out of the vocabulary, so
# looking such a token up later raises KeyError.
# NOTE(review): no seed/workers are set, so the learned vectors may differ
# from run to run — confirm whether reproducibility matters here.
word2vec = Word2Vec(all_words, min_count=2)

In [11]:
# Keep a handle on the trained KeyedVectors; the cells below use it for
# vector lookup and similarity queries.
vocabulary = word2vec.wv
vocabulary

<gensim.models.keyedvectors.KeyedVectors at 0x7f95303e95e0>

In [12]:
# Exact nearest neighbours of 'history' as (word, cosine similarity) pairs.
vocabulary.most_similar('history')

[('movie', 0.9045835137367249),
 ('tv', 0.9029242396354675),
 ('american', 0.9016271233558655),
 ('names', 0.8976449966430664),
 ('world', 0.8957473635673523),
 ('name', 0.894204318523407),
 ('old', 0.8903971314430237),
 ('u', 0.8894762396812439),
 ('national', 0.8835700154304504),
 ('film', 0.8817870020866394)]

In [13]:
# Exact nearest neighbours of 'world' as (word, cosine similarity) pairs.
vocabulary.most_similar('world')

[('movie', 0.9339285492897034),
 ('u', 0.9282433986663818),
 ('tv', 0.9108729958534241),
 ('words', 0.910126805305481),
 ('names', 0.9076410531997681),
 ('state', 0.9053557515144348),
 ('sports', 0.9039403796195984),
 ('american', 0.8988625407218933),
 ('first', 0.8968532085418701),
 ('category', 0.8960384726524353)]

In [9]:
# Build an Annoy index over the trained model's vectors (100 = number of trees)
# to compare approximate nearest-neighbour search with the exact search.
annoy_index = AnnoyIndexer(word2vec, 100)
# Derive the vector for the word "history" in our model
vector = vocabulary["history"]
# Passing the AnnoyIndexer we just created switches most_similar to approximate search.
# topn=11 because the query word itself comes back as the first hit.
approximate_neighbors = vocabulary.most_similar([vector], topn=11, indexer=annoy_index)
# Print each approximate neighbour with the score reported by the index
# (these scores come out lower than the exact cosine similarities printed afterwards).
print("Approximate Neighbors")
for neighbor in approximate_neighbors:
    print(neighbor)

# Same query without an indexer: exact search over the whole vocabulary.
normal_neighbors = vocabulary.most_similar([vector], topn=11)
print("\nExact Neighbors")
for neighbor in normal_neighbors:
    print(neighbor)

Approximate Neighbors
('history', 0.9998273665114539)
('american', 0.7853281199932098)
('tv', 0.7845088094472885)
('movie', 0.7809436619281769)
('old', 0.7762740403413773)
('names', 0.7698773592710495)
('u', 0.7675094604492188)
('world', 0.7663684487342834)
('sports', 0.7643802464008331)
('name', 0.7629969716072083)
('presidential', 0.7625374346971512)

Exact Neighbors
('history', 1.0)
('american', 0.9078319668769836)
('tv', 0.9071270227432251)
('movie', 0.9040284752845764)
('old', 0.8998932838439941)
('names', 0.8940871953964233)
('u', 0.8918962478637695)
('world', 0.8908324837684631)
('sports', 0.8889665007591248)
('name', 0.8876591324806213)
('presidential', 0.8872230052947998)


In [None]:
def summary(x: str, model=None, vectors=None, topn: int = 11, num_trees: int = 100, indexer=None):
    """Print the approximate (Annoy) and exact nearest neighbours of a word.

    Parameters
    ----------
    x : str
        Word to look up; it must be present in the vocabulary of ``vectors``.
    model : optional
        Trained Word2Vec model from which the Annoy index is built.
        Defaults to the notebook-level ``word2vec``.
    vectors : optional
        KeyedVectors to query. Defaults to ``model.wv`` when ``model`` is
        given, otherwise to the notebook-level ``vocabulary``.
    topn : int, default 11
        Number of neighbours printed per list. The query word itself is
        normally the first hit, so 11 yields ten real neighbours.
    num_trees : int, default 100
        Number of trees for the Annoy index built when ``indexer`` is None.
    indexer : optional
        Pre-built indexer to reuse. Building the index is the expensive part
        of this function, so pass one in when summarising many words.

    Returns
    -------
    None
        The two neighbour lists are printed, not returned.

    Raises
    ------
    KeyError
        If ``x`` is not in the vocabulary.
    """
    if vectors is None:
        vectors = vocabulary if model is None else model.wv

    # Look the word up before building the index so that an unknown word
    # fails immediately instead of after the expensive index construction.
    try:
        vector = vectors[x]
    except KeyError:
        raise KeyError(f"{x!r} is not in the model vocabulary") from None

    if indexer is None:
        indexer = AnnoyIndexer(word2vec if model is None else model, num_trees)

    # Passing the indexer switches most_similar to approximate search.
    approximate_neighbors = vectors.most_similar([vector], topn=topn, indexer=indexer)
    print("Approximate Neighbors")
    for neighbor in approximate_neighbors:
        print(neighbor)

    # Without an indexer the search is exact over the whole vocabulary.
    normal_neighbors = vectors.most_similar([vector], topn=topn)
    print("\nExact Neighbors")
    for neighbor in normal_neighbors:
        print(neighbor)

In [None]:
# Approximate vs. exact neighbours of 'science' via the summary helper.
summary('science')

## Using the functions from `jeopardy_funcs.py`

In [3]:
# NOTE(review): the star import rebinds any notebook name that jeopardy_funcs
# also exports. `summary`, defined earlier in this notebook, is presumably
# replaced by the module's version here (the call below passes the model and
# vectors explicitly). Explicit imports of the names actually used
# (w2v, summary) would make that visible.
from jeopardy_funcs import *

In [4]:
# w2v comes from jeopardy_funcs; it presumably wraps the preprocessing and
# training steps done by hand above — confirm against the module. Either way,
# this rebinds the word2vec and vocabulary names created earlier in the notebook.
word2vec, vocabulary = w2v(df)

In [6]:
# Approximate vs. exact neighbours of 'history', passing the model and its vectors explicitly.
summary('history', word2vec, vocabulary)

Approximate Neighbors
('history', 0.9998273665114539)
('american', 0.7853281199932098)
('tv', 0.7845088094472885)
('movie', 0.7809436619281769)
('old', 0.7762740403413773)
('names', 0.7698773592710495)
('u', 0.7675094604492188)
('world', 0.7663684487342834)
('sports', 0.7643802464008331)
('name', 0.7629969716072083)
('presidential', 0.7625374346971512)

Exact Neighbors
('history', 1.0)
('american', 0.9078319668769836)
('tv', 0.9071270227432251)
('movie', 0.9040284752845764)
('old', 0.8998932838439941)
('names', 0.8940871953964233)
('u', 0.8918962478637695)
('world', 0.8908324837684631)
('sports', 0.8889665007591248)
('name', 0.8876591324806213)
('presidential', 0.8872230052947998)
