In [248]:
# Import necessary packages

import pandas as pd 
import numpy as np 
import os 
import plotly.express as px
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [202]:
# Check working directory

print(os.getcwd())

C:\Users\imacd_0odruq3\Documents\Jeopardy


In [203]:
# Read all Jeopardy logs into a data frame

log_num = 1
log_list = []

while (os.path.exists('jeopardy_log_{}.xlsx'.format(log_num))):
    df = pd.read_excel('jeopardy_log_{}.xlsx'.format(log_num))
    df.columns = ['category', 'type', 'dad_win', 'ian_win', 'elizabeth_win']
    log_list.append(df)
    
    log_num+=1

raw_logs_df = pd.concat(log_list, ignore_index=True)

raw_logs_df['type'] = [x.strip() for x in raw_logs_df['type']]

raw_logs_df

Unnamed: 0,category,type,dad_win,ian_win,elizabeth_win
0,islands,teen,1,1,0.0
1,social media,teen,1,1,1.0
2,famous americans,old,0,0,0.0
3,authors,old,0,0,0.0
4,german americans,old,1,0,0.0
...,...,...,...,...,...
385,awards and honors,regular,0,0,0.0
386,famous do's and don'ts,regular,1,0,0.0
387,fictional languages,regular,0,0,0.0
388,19th century literature,regular,1,1,0.0


### Feature Engineering 

Since the only thing given on Jeopardy is the category (which is too specific to predict just on it's own), we'll be extracting some broader categorical features for each category. The broader categories will be:
- History 
- Music 
- Geography 
- Literature
- Science
- Medicine
- Technology
- Sports
- Movies/TV
- Theater
- Religion
- Business/Economics
- Mythology
- Words/Phrases
- Politics
- Art
- Poetry 
- Zoology
- Miscellaneous
- Famous People
- Famous Americans

Each of these broader categories may then be split into smaller sub categories. Each category can also be a part of multiple broader categories. A category such as 'Olympic Cities' may fall under both Sports and Geography, with a sub category of Olympics under Sports and Cities under Geography. 


In [204]:
# History

def history_categorize(category):
    substrings = ['history', 'historical', 'historic', 'war', 'presidents', 'ancient', 'historian', 'past', 'renaissance',
                 'medieval', 'periods', 'monuments', 'old', 'supreme court', 'landmark', 'diplomacy', 
                  'constitution', 'colonial', 'colonies', 'century', 'presidential']
    
    phrases = ['declaration of independence', 'supreme court cases']
    
    if any(sub in word_tokenize(category) for sub in substrings) or any(sub in category for sub in phrases):
        return 1
    else:
        return 0 
    
# Music

def music_categorize(category):
    substrings = ['music', 'musician', 'bands', 'songs', 'composers', 'singers', 'jazz',
                 'rap', 'blues', 'instruments', 'album', 'albums', 'rock']
    
    phrases = ['rock and roll', 'hip hop']
    
    if any(sub in word_tokenize(category) for sub in substrings) or any(sub in category for sub in phrases):
        return 1
    else:
        return 0 
    
# Music

def classical_music_categorize(category):
    substrings = ['composers', 'mozart']
    
    phrases = ['classical music']
    
    if any(sub in word_tokenize(category) for sub in substrings) or any(sub in category for sub in phrases):
        return 1
    else:
        return 0 
    
# Geography

def geography_categorize(category):
    substrings = ['geography', 'geographic', 'mountains', 'islands', 'country', 'continent', 'countries', 'capital', 
                 'natural', 'nature', 'map', 'lakse', 'rivers', 'oceans', 'volcanoes', 'city', 'landmarks', 'americas',
                 'places', 'antarctica', 'europe', 'asia', 'africa', 'world', 'cities', 'island', 'borders', 'continents',
                 'hemisphere']
    
    phrases = ['north america', 'south america']
    
    if any(sub in word_tokenize(category) for sub in substrings) or any(sub in category for sub in phrases):
        return 1
    else:
        return 0 
    
# Literature 

def literature_categorize(category):
    substrings = ['literature', 'lit', 'author', 'novel', 'book', 'writer', 'writing', 'fiction', 
                 'literary', 'epic', 'shakespeare', 'shakespearean', 'authors', 'bestsellers', 'novels', 'books',
                 'authors', 'writers', 'fictional']
    
    phrases = ['fairy tale', 'non-fiction']
    
    if any(sub in word_tokenize(category) for sub in substrings) or any(sub in category for sub in phrases):
        return 1
    else:
        return 0 
    
# Shakespeare (Shows up so often he should get his own category)

def shakespeare_categorize(category):
    substrings = ['shakespeare', 'shakespearean', 'shakespeare\'s']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Science 

def science_categorize(category):
    substrings = ['science', 'scientist', 'scientific', 'biology', 'biologist', 'biological', 'physics', 'physicist',
                 'chemistry', 'chemist', 'periodic table', 'elements', 'astronomy', 'geology', 'geologist', 'geological', 
                 'psychology', 'psychologist', 'astronomer', 'geological', 'psychological', 'scientists']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Medicine 

def medicine_categorize(category):
    substrings = ['medicine', 'medical', 'doctor', 'nurse', 'doctors', 'nurses']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Technology 

def technology_categorize(category):
    substrings = ['tech', 'computer', 'engineer', 'innovation', 'invention', 'data', 'internet', 'robot',
                 'computing', 'automation', 'technology', 'inventors', 'computers', 'innovations']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Sports 

def sports_categorize(category):
    substrings = ['sport', 'athlete', 'athletic', 'competition', 'football', 'soccer', 'baseball', 'basketball', 
                 'hockey', 'stadium', 'olympic', 'gymnast', 'tennis', 'athletes', 'sports', 'olympic', 'olympics']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Movies/TV

def movies_tv_categorize(category):
    substrings = ['movie', 'tv', 'television', 'actor', 'actress', 'film', 'hollywood', 'animation', 'oscar', 'oscars',
                 'emmy', 'emmys', 'movies', 'actors', 'actresses', 'films', 'screenwriters']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Theater

def theater_categorize(category):
    substrings = ['theater', 'theatrical', 'plays', 'drama', 'broadway', 'play', 'theaters', 'dramas', 
                 'playwright', 'playwrights', 'musicals']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Religion

def religion_categorize(category):
    substrings = ['religion', 'bible', 'christian', 'catholic', 'islam', 'jew', 'muslim', 'hindu', 'buddhism', 
                 'religous', 'judaism', 'christianity', 'catholicism', 'hinduism']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Business/Economics

def business_economics_categorize(category):
    substrings = ['business', 'economy', 'economics', 'industry', 'stock', 'corporation', 'corporate', 'brand', 'brands',
                 'logos', 'businesses', 'economies', 'industries', 'stocks', 'market', 'corporations']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Mythology

def mythology_categorize(category):
    substrings = ['myth', 'myths', 'mythology', 'mythological']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Words and Phrases

def words_phrases_categorize(category):
    substrings = ['phrase', 'word', 'noun', 'verb', 'adjective', 'etymology', 'phrases', 'words', 'nouns', 
                 'verbs', 'adjectives']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Politics

def politics_categorize(category):
    substrings = ['politics', 'political', 'government', 'president', 'senator', 'congress',
                  'senate', 'diplomat', 'diplomacy', 'governments', 'senators', 'diplomats', 'ambassadors']
    
    phrases = ['supreme court', 'secretaries of state', 'house of representatives']
    
    if any(sub in word_tokenize(category) for sub in substrings) or any(sub in category for sub in phrases):
        return 1
    else:
        return 0 
    
# Art 

def art_categorize(category):
    substrings = ['art', 'painting', 'mural', 'renaissance', 'artist', 'artists', 'paintings', 'murals']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Poetry

def poetry_categorize(category):
    substrings = ['poet', 'haikou', 'poetry', 'poets', 'haikous']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0 
    
# Zoology

def zoology_categorize(category):
    substrings = ['animal', 'mammal', 'bird', 'reptile', 'amphibian', 'zoo', 'insect', 'fish', 'vertebrate',
                 'invertebrate', 'dog', 'cat', 'breed', 'animals', 'mammals', 'birds', 'reptiles', 'amphibians', 
                  'zoos', 'insects', 'fishes', 'vertebrates', 'invertebrates', 'dogs', 'cats', 'breeds']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0
    
# Famous Americans

def famous_americans_categorize(category):
    substrings = ['americans', 'american men', 'american women', 'american names']
    
    if any(sub in category for sub in substrings):
        return 1
    else:
        return 0
    
# Famous People
    
def famous_people_categorize(category):
    substrings = ['famous names', 'famous men', 'famous women', 'famous names', 'notable names', 'notable women']
    
    if any(sub in category for sub in substrings):
        return 1
    else:
        return 0
    
# Opera

def opera_categorize(category):
    substrings = ['opera']
    
    if any(sub in word_tokenize(category) for sub in substrings):
        return 1
    else:
        return 0
    
# Function to apply categorization to a data frame

def categorize(original_df):
    
    df = original_df.copy()
    df['history'] = df.category.apply(lambda x: history_categorize(x))
    df['music'] = df.category.apply(lambda x: music_categorize(x))
    df['classical_music'] = df.category.apply(lambda x: classical_music_categorize(x))
    df['geography'] = df.category.apply(lambda x: geography_categorize(x))
    df['literature'] = df.category.apply(lambda x: literature_categorize(x))
    df['science'] = df.category.apply(lambda x: science_categorize(x))
    df['medicine'] = df.category.apply(lambda x: medicine_categorize(x))
    df['technology'] = df.category.apply(lambda x: technology_categorize(x))
    df['sports'] = df.category.apply(lambda x: sports_categorize(x))
    df['movies_tv'] = df.category.apply(lambda x: movies_tv_categorize(x))
    df['theater'] = df.category.apply(lambda x: theater_categorize(x))
    df['religion'] = df.category.apply(lambda x: religion_categorize(x))
    df['business_economics'] = df.category.apply(lambda x: business_economics_categorize(x))
    df['mythology'] = df.category.apply(lambda x: mythology_categorize(x))
    df['words_phrases'] = df.category.apply(lambda x: words_phrases_categorize(x))
    df['politics'] = df.category.apply(lambda x: politics_categorize(x))
    df['art'] = df.category.apply(lambda x: art_categorize(x))
    df['poetry'] = df.category.apply(lambda x: poetry_categorize(x))
    df['zoology'] = df.category.apply(lambda x: zoology_categorize(x))
    df['famous_americans'] = df.category.apply(lambda x: famous_americans_categorize(x))
    df['famous_people'] = df.category.apply(lambda x: famous_people_categorize(x))
    df['opera'] = df.category.apply(lambda x: opera_categorize(x))
    df['shakespeare'] = df.category.apply(lambda x: shakespeare_categorize(x))
    
    return df

In [205]:
# Creating categorized data frame

categorized_df = categorize(raw_logs_df)
categorized_df['miscellaneous'] = (categorized_df.drop(
    columns=['type', 'dad_win', 'ian_win', 'elizabeth_win']).sum(axis=1)==0).apply(lambda x: 1 if x is True else 0)
categorized_df

Unnamed: 0,category,type,dad_win,ian_win,elizabeth_win,history,music,classical_music,geography,literature,...,words_phrases,politics,art,poetry,zoology,famous_americans,famous_people,opera,shakespeare,miscellaneous
0,islands,teen,1,1,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,social media,teen,1,1,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,famous americans,old,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,authors,old,0,0,0.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,german americans,old,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,awards and honors,regular,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
386,famous do's and don'ts,regular,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
387,fictional languages,regular,0,0,0.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
388,19th century literature,regular,1,1,0.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [206]:
# Take a look at the categories that couldnt be categorized

categorized_df[categorized_df.miscellaneous==1].tail(50)

Unnamed: 0,category,type,dad_win,ian_win,elizabeth_win,history,music,classical_music,geography,literature,...,words_phrases,politics,art,poetry,zoology,famous_americans,famous_people,opera,shakespeare,miscellaneous
49,games,regular,1,1,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
50,ranks and titles,regular,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
60,famous russians,regular,0,0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
64,the 1960s,regular,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
68,official state stuff,regular,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
83,currency,regular,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
85,special days,regular,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
86,snack foods,regular,1,1,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
89,royalty,college,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
92,names in the news,college,1,1,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [207]:
# Overall Winning Percentages

win_percentages = categorized_df[['dad_win', 'ian_win', 'elizabeth_win']].mean().reset_index()
win_percentages.columns = ['scenario', 'frequency']
px.bar(win_percentages, x='scenario', y='frequency')

In [208]:
categorized_df[['dad_win', 'ian_win', 'elizabeth_win']].mean().reset_index()

Unnamed: 0,index,0
0,dad_win,0.379487
1,ian_win,0.284615
2,elizabeth_win,0.150649


In [209]:
# Print categories and what final jeopardy clues fall into each one

category_list = [category for category in list(categorized_df.columns) 
                 if category not in ['category', 'dad_win', 'ian_win', 'elizabeth_win', 'type']]
category_list

['history',
 'music',
 'classical_music',
 'geography',
 'literature',
 'science',
 'medicine',
 'technology',
 'sports',
 'movies_tv',
 'theater',
 'religion',
 'business_economics',
 'mythology',
 'words_phrases',
 'politics',
 'art',
 'poetry',
 'zoology',
 'famous_americans',
 'famous_people',
 'opera',
 'shakespeare',
 'miscellaneous']

In [210]:
print_categories = category_list

for category in print_categories:
    print('Category: {}'.format(category))
    print()
    for cat in categorized_df[categorized_df[category]==1].category:
        print(cat)
    print()

Category: history

historic names 
historic places
20th century americans 
pre civil war presidents 
medical history 
19th century europeans 
20th century novels 
21st century music 
historic homes 
moments in history 
19th century books 
19th century americans 
film and war 
u.s. geographic history 
20th century presidential elections 
19th century authors 
presidents: born and died
u.s. presidents 
19th century americans 
20th century artists
diplomacy
on the old map 
historic figures
20th century american music 
20th century art 
19th century supreme court cases
ancient texts 
the 13 colonies 
history in the movies
history of medicine 
u.s. monuments
18th century americans
historic documents 
19th century americans 
the ancient world 
history
20th century authors
british army history
popes and history
historic namesakes
ancient greek philosophers
19th century presidential campaigns
business history
early u.s. history
radio history
historic places
presidential elections
19th century 

In [211]:
# Win percentage by Jeopardy category

categorized_df

Unnamed: 0,category,type,dad_win,ian_win,elizabeth_win,history,music,classical_music,geography,literature,...,words_phrases,politics,art,poetry,zoology,famous_americans,famous_people,opera,shakespeare,miscellaneous
0,islands,teen,1,1,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,social media,teen,1,1,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,famous americans,old,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,authors,old,0,0,0.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,german americans,old,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,awards and honors,regular,0,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
386,famous do's and don'ts,regular,1,0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
387,fictional languages,regular,0,0,0.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
388,19th century literature,regular,1,1,0.0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [212]:
win_percentages = categorized_df[['dad_win', 'ian_win', 'elizabeth_win']].mean().reset_index()
win_percentages.columns = ['scenario', 'overall']

for category in category_list:
    cat_win_percentages = categorized_df[categorized_df[category]==1][['dad_win', 'ian_win', 'elizabeth_win']].mean().reset_index()
    cat_win_percentages.columns = ['scenario', '{}'.format(category)]
    win_percentages = win_percentages.merge(cat_win_percentages, on='scenario')
    
win_percentages

Unnamed: 0,scenario,overall,history,music,classical_music,geography,literature,science,medicine,technology,...,words_phrases,politics,art,poetry,zoology,famous_americans,famous_people,opera,shakespeare,miscellaneous
0,dad_win,0.379487,0.353659,0.3,0.0,0.392405,0.253521,0.333333,0.0,0.5,...,0.533333,0.5,0.333333,0.0,0.666667,0.4,0.0,1.0,0.0,0.448276
1,ian_win,0.284615,0.329268,0.15,0.0,0.367089,0.183099,0.222222,0.0,0.5,...,0.4,0.4,0.5,0.0,0.333333,0.2,0.666667,1.0,0.5,0.293103
2,elizabeth_win,0.150649,0.111111,0.052632,0.0,0.168831,0.098592,0.222222,0.0,0.5,...,0.2,0.2,0.5,0.0,0.333333,0.133333,0.333333,1.0,0.0,0.12069


In [213]:
win_percentages_graphing = win_percentages.melt(id_vars=['scenario'], value_vars=category_list)
win_percentages_graphing.columns = ['scenario', 'category', 'frequency']
win_percentages_graphing

Unnamed: 0,scenario,category,frequency
0,dad_win,history,0.353659
1,ian_win,history,0.329268
2,elizabeth_win,history,0.111111
3,dad_win,music,0.300000
4,ian_win,music,0.150000
...,...,...,...
67,ian_win,shakespeare,0.500000
68,elizabeth_win,shakespeare,0.000000
69,dad_win,miscellaneous,0.448276
70,ian_win,miscellaneous,0.293103


In [214]:
px.bar(win_percentages_graphing, x='category', y='frequency', color='scenario', barmode='group')

In [215]:
counts_per_category = pd.DataFrame(categorized_df.sum()[5:]).reset_index()
counts_per_category.columns = ['category', 'appearances']
px.bar(counts_per_category, x='category', y='appearances')

In [216]:
win_percentages_type = categorized_df.groupby(['type']).mean().reset_index()
win_percentages_type_graphing = win_percentages_type.melt(id_vars=['type'], value_vars=['dad_win', 'ian_win', 'elizabeth_win'])
win_percentages_type_graphing.columns = ['scenario', 'category', 'frequency']
win_percentages_type_graphing

Unnamed: 0,scenario,category,frequency
0,champions,dad_win,0.272727
1,college,dad_win,0.714286
2,old,dad_win,0.5
3,regular,dad_win,0.364146
4,teen,dad_win,1.0
5,champions,ian_win,0.090909
6,college,ian_win,0.571429
7,old,ian_win,0.0
8,regular,ian_win,0.280112
9,teen,ian_win,1.0


In [217]:
px.bar(win_percentages_type_graphing, x='scenario', y='frequency', color='category', barmode='group')

In [251]:
# One-hot encode categorical categories, prepare data frame for modeling

modeling_df = pd.get_dummies(categorized_df, columns = ['type'])
dad_modeling_df = modeling_df.drop(columns = ['category', 'ian_win', 'elizabeth_win'])
ian_modeling_df = modeling_df.drop(columns = ['category', 'dad_win', 'elizabeth_win'])
elizabeth_modeling_df = modeling_df.drop(columns = ['category', 'ian_win', 'dad_win']).dropna()

In [294]:
# Split data into train and test sets

train_dad, test_dad = train_test_split(dad_modeling_df, test_size=.2)
train_ian, test_ian = train_test_split(ian_modeling_df, test_size=.2)
train_elizabeth, test_elizabeth = train_test_split(elizabeth_modeling_df, test_size=.2)

train_dad_x, test_dad_x = train_dad.drop(columns = ['dad_win']).to_numpy(), test_dad.drop(columns = ['dad_win']).to_numpy()
train_ian_x, test_ian_x = train_ian.drop(columns = ['ian_win']).to_numpy(), test_ian.drop(columns = ['ian_win']).to_numpy()
train_elizabeth_x, test_elizabeth_x = train_elizabeth.drop(columns = ['elizabeth_win']).to_numpy(), test_elizabeth.drop(columns = ['elizabeth_win']).to_numpy()

train_dad_y, test_dad_y = train_dad.dad_win, test_dad.dad_win
train_ian_y, test_ian_y = train_ian.ian_win, test_ian.ian_win
train_elizabeth_y, test_elizabeth_y = train_elizabeth.elizabeth_win, test_elizabeth.elizabeth_win
train_dad.columns

Index(['dad_win', 'history', 'music', 'classical_music', 'geography',
       'literature', 'science', 'medicine', 'technology', 'sports',
       'movies_tv', 'theater', 'religion', 'business_economics', 'mythology',
       'words_phrases', 'politics', 'art', 'poetry', 'zoology',
       'famous_americans', 'famous_people', 'opera', 'shakespeare',
       'miscellaneous', 'type_champions', 'type_college', 'type_old',
       'type_regular', 'type_teen'],
      dtype='object')

In [326]:
# Function to print accuracy scores

def print_accuracy(classifier, train_x, train_y, test_x, test_y):
    train_preds = classifier.predict(train_x)
    test_preds = classifier.predict(test_x)
    
    print('Training Accuracy: {}'.format(accuracy_score(train_preds, train_y)))
    print('Test Accuracy: {}'.format(accuracy_score(test_preds, test_y)))
    
# Funtion to predict probability of a new category

def get_prediction(classifier, category, type_name):
    
    pred_df = pd.DataFrame([[category, type_name]], columns=['category', 'type'])
    pred_df = categorize(pred_df)
    pred_df['miscellaneous'] = (pred_df.drop(
        columns=['type']).sum(axis=1)==0).apply(lambda x: 1 if x is True else 0)
    
    pred_df['champions'] = 0
    pred_df['college'] = 0
    pred_df['old'] = 0
    pred_df['regular'] = 0
    pred_df['teen'] = 0
    
    pred_df[type_name] = 1
    
    pred_df = pred_df.drop(columns=['category', 'type'])
    test_x = pred_df.to_numpy()
    return classifier.predict_proba(test_x)[:,1]
    
def get_prediction_all(dad_clf, ian_clf, elizabeth_clf, category, type_name):
    dad_win_prob = get_prediction(dad_clf, category, type_name)
    ian_win_prob = get_prediction(ian_clf, category, type_name)
    elizabeth_win_prob = get_prediction(elizabeth_clf, category, type_name)
    
    print('Dad win probability: {}'.format(dad_win_prob))
    print('Ian win probability: {}'.format(ian_win_prob))
    print('Elizabeth win probability: {}'.format(elizabeth_win_prob))

In [327]:
# Logistic Regression

dad_lr_clf = LogisticRegression()
dad_lr_clf.fit(train_dad_x, train_dad_y)

print('Dad Accuracy Scores:')
print_accuracy(dad_lr_clf, train_dad_x, train_dad_y, test_dad_x, test_dad_y)
print()

ian_lr_clf = LogisticRegression()
ian_lr_clf.fit(train_ian_x, train_ian_y)

print('Ian Accuracy Scores:')
print_accuracy(ian_lr_clf, train_ian_x, train_ian_y, test_ian_x, test_ian_y)
print()

elizabeth_lr_clf = LogisticRegression()
elizabeth_lr_clf.fit(train_elizabeth_x, train_elizabeth_y)

print('Elizabeth Accuracy Scores:')
print_accuracy(elizabeth_lr_clf, train_elizabeth_x, train_elizabeth_y, test_elizabeth_x, test_elizabeth_y)

Dad Accuracy Scores:
Training Accuracy: 0.7019230769230769
Test Accuracy: 0.6153846153846154

Ian Accuracy Scores:
Training Accuracy: 0.7307692307692307
Test Accuracy: 0.6666666666666666

Elizabeth Accuracy Scores:
Training Accuracy: 0.8409090909090909
Test Accuracy: 0.8831168831168831


In [328]:
# Test on new category

category = 'the 1950s'
game_type = 'regular'

get_prediction_all(dad_lr_clf, ian_lr_clf, elizabeth_lr_clf, category, game_type)

Dad win probability: [0.36918152]
Ian win probability: [0.29653945]
Elizabeth win probability: [0.13272406]
