In [68]:
import os
import pickle
from urllib.parse import quote_plus

import numpy as np
import pandas as pd

In [69]:
# Load the comedian-metrics and joke-form dataframes from disk.
# Both pickles live in the same directory, so the location is spelled out once;
# repoint `data_path` to run the notebook against another copy of the data.
data_path = '/Users/johnpapaioannou/Desktop/insight/project/data/'
rating = os.path.join(data_path, 'comedian_metrics.pkl')
joke_form = os.path.join(data_path, 'joke_form.pkl')

# NOTE: unpickling can execute arbitrary code, so only read pickle files that
# come from a trusted source.
df_rating = pd.read_pickle(rating)
df_form = pd.read_pickle(joke_form)

In [70]:
# Preview both frames before modifying them.
print(df_rating.head())
print(df_form.head())

# Clean up a few comics' names.
# NOTE: each fix addresses a row by position (column 0 is 'comedian'), so it
# would silently rename the wrong comedian if the row order of df_rating ever
# changed.
df_rating.iloc[20, 0] = 'Cedric the Entertainer'
df_rating.iloc[27, 0] = 'D.L. Hughley'
df_rating.iloc[81, 0] = 'Louis C.K.'
df_rating.iloc[11, 0] = 'Big Jay Oakerson'

# Threshold on sim_ht that separates the two joke-form classes.
short_long_div = 100
# joke_form is 0 where sim_ht exceeds the threshold and 1 everywhere else
# (which includes any row whose comparison is False, e.g. a missing sim_ht).
# Vectorized equivalent of a row-wise apply with the same condition.
df_form['joke_form'] = np.where(df_form['sim_ht'] > short_long_div, 0, 1)



       comedian  edginess  rating
0   Adam Devine  0.054392       3
1  Adam Sandler  0.068308       3
2    Adel Karam  0.008130       1
3   Al Madrigal  0.019988       2
4      Ali Wong  0.034306       2
       comedian                                         transcript  \
0   adam devine  \n[rock music playing]\n[indistinct chatter]\n...   
1  adam sandler  \n[man] Okay, ready, and… Take your own cue, A...   
2    adel karam  \nA NETFLIX COMEDY SPECIAL\nRecorded at the Ca...   
3   al madrigal  \n[dog barks] [FisherGreen’s Sisters Brothers ...   
4      ali wong  \nLadies and gentlemen, please welcome to the ...   

                                            sim_dist  sim_head  sim_tail  \
0  [0.5672938860919252, 0.057596792635744076, 0.0...  0.567294  0.005439   
1  [0.5400038091172934, 0.06472848023361816, 0.04...  0.540004  0.007653   
2  [0.6872416689717861, 0.08412475608946268, 0.05...  0.687242  0.012416   
3  [0.638331364596691, 0.06769669707868176, 0.037...  0.638331  0.00719

In [71]:
# Consolidate the features of interest into a single per-comedian frame:
# start from a copy of the ratings, then attach the sim_ht score and its
# joke-form bin (pandas aligns both Series on the row index).
sim_ht = df_form.sim_ht
df_comic = df_rating.copy().assign(sim_ht=sim_ht, form=df_form.joke_form)

In [85]:
# methods for binning edginess and sim_ht into content rating and joke form categories

# set PG, PG-13 and R limits

def edge_quant(score, pg_max=0.009, pg13_max=0.02):
    """Bin an edginess score into a content-rating category.

    Parameters
    ----------
    score : float
        Edginess score to classify.
    pg_max : float, optional
        Highest score still rated PG (inclusive). Defaults to 0.009.
    pg13_max : float, optional
        Lowest score rated R (inclusive). Scores strictly between
        ``pg_max`` and ``pg13_max`` are rated PG-13. Defaults to 0.02.

    Returns
    -------
    int
        1 for PG, 2 for PG-13, 3 for R.

    Raises
    ------
    ValueError
        If ``pg_max`` is greater than ``pg13_max``, or if ``score`` cannot be
        ordered against the limits (for example NaN).
    """
    if pg_max > pg13_max:
        raise ValueError(
            "pg_max (%r) must not exceed pg13_max (%r)" % (pg_max, pg13_max)
        )
    if score >= pg13_max:
        return 3
    if score <= pg_max:
        return 1
    if pg_max < score < pg13_max:
        return 2
    # Every comparison above is False only for unordered values such as NaN;
    # fail loudly instead of falling through with no bin assigned.
    raise ValueError("cannot bin edginess score %r" % (score,))

# Replace the 'rating' column with the bin computed from each comic's
# edginess score, then order the comics from least to most edgy.
edge_bin = df_comic['edginess'].map(edge_quant)
df_comic = df_comic.assign(rating=edge_bin).sort_values(by='edginess')


In [86]:
# see how many comics there are in every possible combination of rating and form
# Each entry is (heading to print, rating bin, form bin).
rating_form_bins = [
    ('clean/short', 1, 0),
    ("\nclean/long", 1, 1),
    ("\nmid/short", 2, 0),
    ("\nmid/long", 2, 1),
    ("\nedge/short", 3, 0),
    ("\nedge/long", 3, 1),
]
for heading, rating_bin, form_bin in rating_form_bins:
    in_bin = (df_comic['rating'] == rating_bin) & (df_comic['form'] == form_bin)
    print(heading)
    print(df_comic[in_bin])




clean/short
           comedian  edginess  rating       sim_ht  form
56    Henry Rollins  0.002819       1   102.388370     0
123  Volker Pispers  0.003150       1  6524.619536     0
118      Todd Barry  0.008049       1   101.100172     0
44     Enissa Amani  0.008055       1   118.082053     0
104      Ray Romano  0.008712       1   159.791747     0
83    Maria Bamford  0.008880       1   174.502546     0

clean/long
             comedian  edginess  rating     sim_ht  form
6     Anjelah Johnson  0.000425       1  55.049351     1
93      Nate Bargatze  0.000949       1  97.120238     1
48       Fred Armisen  0.001215       1  33.277902     1
18        Brian Regan  0.001236       1  69.151340     1
109    Rowan Atkinson  0.001379       1  43.785567     1
62       Jim Gaffigan  0.002332       1  40.433451     1
76        Kevin James  0.002568       1  72.773472     1
117       Stewart Lee  0.004158       1  61.695389     1
42    Ellen Degeneres  0.004561       1  61.595194     1
49   Ga

# Export the finalized comedian dataframe: each comedian's edginess and joke-style values, plus the bins assigned from those values

In [87]:
# Write the consolidated comedian frame next to the input pickles.
data_path = '/Users/johnpapaioannou/Desktop/insight/project/data/'
file_path = os.path.join(data_path, 'comedians.pkl')
df_comic.to_pickle(file_path)


In [88]:
# Test the routine that extracts every comic in one rating/form bin
# (here rating 1, form 0).
comics = df_comic[(df_comic.rating == 1) & (df_comic.form == 0)].comedian.tolist()
print(comics)

# Test the routine that builds YouTube search URLs for hyperlinks.
# quote_plus turns spaces into '+', just as a plain str.replace would, and
# also percent-encodes reserved or non-ASCII characters, so a name containing
# e.g. '&' or '#' cannot corrupt the query string.
base_url = 'https://www.youtube.com/results?search_query='
filter_tag = '&sp=EgIYAQ%253D%253D'
comic1 = quote_plus(comics[0])
print('comic1: ', comic1)
comics_url = [base_url + quote_plus(comic + ' stand up') + filter_tag for comic in comics]
print(comics_url)

['Henry Rollins', 'Volker Pispers', 'Todd Barry', 'Enissa Amani', 'Ray Romano', 'Maria Bamford']
comic1:  Henry+Rollins
['https://www.youtube.com/results?search_query=Henry+Rollins+stand+up&sp=EgIYAQ%253D%253D', 'https://www.youtube.com/results?search_query=Volker+Pispers+stand+up&sp=EgIYAQ%253D%253D', 'https://www.youtube.com/results?search_query=Todd+Barry+stand+up&sp=EgIYAQ%253D%253D', 'https://www.youtube.com/results?search_query=Enissa+Amani+stand+up&sp=EgIYAQ%253D%253D', 'https://www.youtube.com/results?search_query=Ray+Romano+stand+up&sp=EgIYAQ%253D%253D', 'https://www.youtube.com/results?search_query=Maria+Bamford+stand+up&sp=EgIYAQ%253D%253D']
