In [8]:
# Import required packages
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
# Setup file names
inFileName = 'JeopardyQuestions_Clean-Numbers.csv'
# Load cleaned dataset
df = pd.read_csv(inFileName)
# Look at data header
df.head(5)

Unnamed: 0.1,Unnamed: 0,category,air_date,question,value,answer,round,show_number,questionClean,categoryClean,combinedClean
0,0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",200.0,Copernicus,Jeopardy!,4680,last 8 year life galileo house arrest espousin...,history,history last 8 year life galileo house arrest ...
1,1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,200.0,Jim Thorpe,Jeopardy!,4680,2 1912 olympian football star carlisle indian ...,espn top 10 alltime athlete,espn top 10 alltime athlete 2 1912 olympian fo...
2,2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,200.0,Arizona,Jeopardy!,4680,city yuma state record average 4055 hour sunsh...,everybody talk,everybody talk city yuma state record average ...
3,3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",200.0,McDonald\'s,Jeopardy!,4680,1963 live art linkletter show company served b...,company line,company line 1963 live art linkletter show com...
4,4,EPITAPHS & TRIBUTES,2004-12-31,"Signer of the Declaration of Independence, fra...",200.0,John Adams,Jeopardy!,4680,signer declaration independence framer constit...,epitaph tribute,epitaph tribute signer declaration independenc...


In [10]:
# Look at data shape
df.shape

(206407, 11)

In [11]:
# Drop NA values from the cleaned questions 
df.dropna(subset=['questionClean'], inplace=True)
# Reset the dataframe index
df.reset_index(drop=True, inplace=True)
# Create a list of documents to perform LDA on. Each document is a cleaned/preprocessed/lemmatized question
docs = df['questionClean'].tolist()

In [12]:
# Review data shape after removing NA questions
df.shape

(206383, 11)

In [13]:
# Convert a collection of text documents to a matrix of token counts
# 2000 unigrams and bigrams will be used as tokens for LDA topic modeling
tf_vectorizer = CountVectorizer(max_features=2000, 
                                ngram_range=(1, 2),
                                stop_words = 'english')
# Vectorize the corpus of cleaned Jeopardy! questions. Creates document term matrix (tf)
tf = tf_vectorizer.fit_transform(docs)
# Get feature names (i.e., the 2000 unigrams and bigrams that will be used in the model)
tf_feature_names = tf_vectorizer.get_feature_names()

In [14]:
# LDA topic model to discover 10 topics 
# Document term matrix (tf) is used to fit the model
lda_10 = LatentDirichletAllocation(n_components=10, 
                                max_iter=200, 
                                learning_method='online', 
                                learning_offset=50.,
                                random_state=0,
                                verbose=1).fit(tf)

iteration: 1 of max_iter: 200
iteration: 2 of max_iter: 200
iteration: 3 of max_iter: 200
iteration: 4 of max_iter: 200
iteration: 5 of max_iter: 200
iteration: 6 of max_iter: 200
iteration: 7 of max_iter: 200
iteration: 8 of max_iter: 200
iteration: 9 of max_iter: 200
iteration: 10 of max_iter: 200
iteration: 11 of max_iter: 200
iteration: 12 of max_iter: 200
iteration: 13 of max_iter: 200
iteration: 14 of max_iter: 200
iteration: 15 of max_iter: 200
iteration: 16 of max_iter: 200
iteration: 17 of max_iter: 200
iteration: 18 of max_iter: 200
iteration: 19 of max_iter: 200
iteration: 20 of max_iter: 200
iteration: 21 of max_iter: 200
iteration: 22 of max_iter: 200
iteration: 23 of max_iter: 200
iteration: 24 of max_iter: 200
iteration: 25 of max_iter: 200
iteration: 26 of max_iter: 200
iteration: 27 of max_iter: 200
iteration: 28 of max_iter: 200
iteration: 29 of max_iter: 200
iteration: 30 of max_iter: 200
iteration: 31 of max_iter: 200
iteration: 32 of max_iter: 200
iteration: 33 of 

In [20]:
# Build function to display topics 
# Code adapted from the following source: https://blog.mlreview.com/topic-modeling-with-scikit-learn-e80d33668730
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        # print "Topic %d:" % (topic_idx)
        print(f'Topic {topic_idx}')
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [21]:
# Display the first n tokens of each topic
no_top_tokens = 100
display_topics(lda_10, tf_feature_names, no_top_tokens)

Topic 0
known novel say star group born character got home school end way english member red museum william story came large nickname named robert run tale poet march street want begin june prize force artist helped includes east thomas need 1994 highest product paris texas royal richard season writer senator giant open sitcom turned money 2006 start store developed christmas lived colony dutch master nobel stop bone mammal bank international met class final 1965 boston established ride bob pop stage lee martin religious future medical crime fan course 23 beer ohio detective instead near leg taken boy attack working 18th die
Topic 1
type time woman make company named life famous place man son water god long took brother popular men right mountain night high st served food introduced light plant hall help area fish charles month magazine horse 1997 dance live director prince originally 15 tom lord player according sun planet hold fame playing stone michael moon secretary hill louis base

In [22]:
# In order to come up with phrases to describe each of these ten topics, we used the following prompt in ChatGPT:
# Please come up with three word summaries that encapsulate the ideas in the following ten topics:
# And then we pasted in the top 100 n tokens for each topic for ChatGPT to summarize.
# The results are below: 
# Topic 0: "Literary legacy, hometown stories, notable characters."
# Topic 1: "Life stories, cultural icons, family moments."
# Topic 2: "State history, geographical tales, cultural significance."
# Topic 3: "Artistic expressions, creative works, cultural impact."
# Topic 4: "City narratives, historical events, cultural identity."
# Topic 5: "Historical periods, political shifts, regional influence."
# Topic 6: "Creative expressions, entertainment milestones, cultural symbols."
# Topic 7: "Leadership roles, cultural symbols, national identity."
# Topic 8: "Cultural touchstones, historical landmarks, identity expressions."
# Topic 9: "Language and leadership, entertainment milestones, historical events."

In [23]:
# Exporting topic predictions for each question using the 10-topic model
docsVStopics10 = lda_10.transform(tf)
docsVStopics10 = pd.DataFrame(docsVStopics10, columns=[str(i+1) for i in range(10)])
most_likely_topics10 = docsVStopics10.idxmax(axis=1)

In [25]:
# Show which topics have the most questions associated with them
most_likely_topics10.groupby(most_likely_topics10).count()

1     29316
10    19103
2     21641
3     22297
4     18433
5     18374
6     18579
7     20231
8     19794
9     18615
dtype: int64

In [26]:
# Add topic prediction labels for each question in the corpus and export labeled dataframe to csv
df['10_topic_predictions'] = most_likely_topics10
df.to_csv('JeopardyQuestions_Labeled.csv')

In [27]:
# Other resources used
# https://www.geeksforgeeks.org/multiclass-classification-using-scikit-learn/
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
# https://stackoverflow.com/questions/35252762/finding-number-of-documents-per-topic-for-lda-with-scikit-learn