In [197]:
import pandas as pd
import collections

# Load the raw CSV into a DataFrame; the next cell assigns its two column names.
df = pd.read_csv('example.csv')

In [212]:
# Name the two columns, then report the frame's shape before and after each
# row-dropping pass (exact duplicates first, rows with missing values second).
df.columns = ['comedians', 'bios']
print(df.columns)
print(df.shape)
for drop_step in (pd.DataFrame.drop_duplicates, pd.DataFrame.dropna):
    df = drop_step(df)
    print(df.shape)

Index(['comedians', 'bios'], dtype='object')
(1081, 2)
(542, 2)
(491, 2)


In [213]:
def comic_clean(comic):
    """Return the last '/'-separated segment of ``comic`` with hyphens turned into spaces.

    ``comic`` is converted with ``str()`` first, so non-string values are
    accepted. When the text contains no '/', the whole text is used.
    """
    _, _, last_segment = str(comic).rpartition('/')
    return last_segment.replace('-', ' ')

In [214]:
# Work on a copy of df; reduce every 'comedians' entry to a readable name.
df2 = df.copy()
df2['comedians'] = df2['comedians'].astype(str).apply(comic_clean)
# Last expression of the cell: displays the cleaned column.
df2['comedians']

1                                         about lastnight
2                              adam carolla is unprepared
3                                     adam cayton holland
4                                                adam ray
5       adam ruins everything presents mind parasites ...
6                                             adrian mesa
7                                             aisha tyler
8                                               aj finney
9                                              al goodwin
10                                             al jackson
13                                           alex edelman
14                                            alex moffat
15                                               ali wong
16                                       alice wetterlund
17                                          alingon mitra
18                                            amy schumer
19                                            kevin smith
20            

In [215]:
# Scrub comedian names: drop live podcasts and miscellaneous non-solo shows,
# then strip venue words from the names that remain.
df3 = df2.copy()
print('before scrub: ', df3.shape)

# Markers are matched as literal, case-insensitive substrings. The names
# displayed for df2 are lowercase, so the capitalised " Comedy" / " Podcast"
# markers could not match them under a case-sensitive search.
non_solo_markers = [" is ", " Comedy", " Podcast", " live", " and ", "about"]
for marker in non_solo_markers:
    df3 = df3[~df3['comedians'].str.contains(marker, case=False, regex=False)]

# Take an explicit copy so the column assignments below write to df3 itself
# rather than to a filtered view (avoids SettingWithCopyWarning).
df3 = df3.copy()

# regex=False keeps the replacement literal regardless of the pandas version's default.
for venue_word in ("paramount", "theatre"):
    df3['comedians'] = df3['comedians'].str.replace(venue_word, "", regex=False)
df3['comedians'] = df3['comedians'].str.strip()
print('after scrub: ', df3.shape)

before scrub:  (491, 2)
after scrub:  (471, 2)


# let's investigate the bios finally!

In [216]:
def bio_clean(bio):
    """Flatten a raw multi-line bio into one space-separated string.

    The text is split on newlines and the first two lines are discarded
    (presumably a blank line and the performer's name, as in the sample
    printed by the next cell -- confirm against the data). The first
    occurrence of the ticket-resale notice is removed when present, empty
    lines are dropped, and the remaining lines are joined with single spaces.

    Parameters
    ----------
    bio : str
        Raw bio text.

    Returns
    -------
    str
        The cleaned, single-line bio; ``''`` when nothing remains.
    """
    resale_notice = 'Please review our ticket resale policy.'
    lines = bio.split('\n')[2:]
    try:
        lines.remove(resale_notice)
    except ValueError:
        # list.remove raises ValueError when the notice is absent; that is fine.
        pass
    return " ".join(line for line in lines if line != '')

In [217]:
# Clean every bio on a copy of df3, printing one sample before and after.
df4 = df3.copy()
raw_bios = df4['bios'].astype(str)  # bio_clean calls .split, so coerce to str first
print('before cleaning: \n\n', raw_bios.iloc[1])
df4['bios'] = raw_bios.apply(bio_clean)
print('after cleaning: \n\n', df4['bios'].iloc[1])

before cleaning: 

 
Adam Ray
Adam Ray has been very busy between his acting, writing and touring the country doing stand-up.  He was most recently seen on Arrested Development, HBO’s Curb Your Enthusiasm, the Netflix film Game Over Man, HBO’s Ballers, and season two of American Vandal for Netflix.  

On the big screen, Adam is known from his work opposite Sandra Bullock and Melissa McCarthy in Paul Feig's buddy cop comedy The Heat.  He was also featured in Paul Feig’s Spy, in the Jennifer Lopez comedy Second Act, as well as the reboot of Ghostbusters.  Adam was a cast member in the reboot of the sketch comedy franchise MADtv for the CW and played Joey McIntyre’s manager on the Pop Network show Return Of The Mac.  On the voiceover front, Adam has been the voice of CONAN for the past two years, has featured in cartoons like Trolls, Rick & Morty, Supermansion, and most recently in the Dreamworks show She-Ra on Netflix.  

Adam co-hosted the comedy game show Separation Anxiety on TBS with

# Let's do some exploratory analysis / modeling for once

In [218]:
import nltk
# Fetch the NLTK data packages 'punkt' and 'averaged_perceptron_tagger'.
# Presumably these back the word_tokenize / pos_tag calls used in later
# cells -- confirm against the NLTK documentation.
nltk.download('punkt')
# Kept as the cell's last expression, so its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johnpapaioannou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/johnpapaioannou/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [219]:
# Tokenize the first cleaned bio and tag it once; the tagged list is reused
# for both the printout and the type inspection instead of tagging twice.
tokens = nltk.word_tokenize(df4.iloc[0,1])
print(tokens)
token_pos = nltk.pos_tag(tokens)
print('\n\n parts of speech: ', token_pos)
print(type(token_pos))
print(type(token_pos[0]))

['Join', 'Denver', "'s", 'Newest', 'Dad', 'This', 'Father', "'s", 'Day', 'at', 'Comedy', 'Works', 'Downtown', '!', 'Adam', 'Cayton-Holland', 'is', 'a', 'national', 'touring', 'headliner', 'who', 'was', 'named', 'one', 'of', '25', '“', 'Comics', 'to', 'Watch', '”', 'by', 'Esquire', 'Magazine', 'and', 'one', 'of', '“', '10', 'Comics', 'to', 'Watch', '”', 'by', 'Variety', '.', 'Along', 'with', 'his', 'co-hosts', 'in', 'the', 'The', 'Grawlix', 'comedy', 'troupe', ',', 'Adam', 'created', ',', 'writes', 'and', 'stars', 'in', 'truTV', '’', 's', 'Those', 'Who', 'Can', '’', 't', ',', 'in', 'which', 'Adam', 'plays', 'Spanish', 'teacher', 'and', 'bon', 'vivant', 'Loren', 'Payton', '.', 'He', 'has', 'appeared', 'on', 'Conan', ',', 'Comedy', 'Central', 'Presents', ',', 'The', 'Meltdown', 'with', 'Jonah', 'and', 'Kumail', ',', 'Happy', 'Endings', ',', 'Deadbeat', ',', 'Flophouse', ',', 'Hidden', 'America', ',', 'and', '@', 'midnight', '.', 'He', 'was', 'also', 'featured', 'on', 'the', 'Nerdist', ','

In [228]:
def pos_parser(blurb, pos_of_interest=('JJ',)):
    """Return the tokens of ``blurb`` whose part-of-speech tag is of interest.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag`` once; tokens are returned in order of appearance,
    duplicates included.

    Parameters
    ----------
    blurb : str
        Text to analyse.
    pos_of_interest : str or iterable of str, optional
        Tag or tags to keep. The default, ``('JJ',)``, returns exactly what
        this function has always returned. Earlier versions also gathered
        'JJR', 'JJS', 'NNP', 'NNPS', 'RB', 'RBR' and 'RBS' tokens but
        discarded them; pass those tags here to get them.

    Returns
    -------
    list of str
        The matching tokens.
    """
    # A bare string would otherwise be split into single characters by set().
    if isinstance(pos_of_interest, str):
        wanted_tags = {pos_of_interest}
    else:
        wanted_tags = set(pos_of_interest)

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(blurb))

    # TODO: clean up list, count duplicates? remove punctuation?
    return [token for token, tag in tagged_tokens if tag in wanted_tags]

In [295]:
# New frame whose 'bios' column holds pos_parser's token list for each bio.
df5 = df4.assign(bios=df4['bios'].apply(pos_parser))

In [312]:
# Spot check: does any token kept for row 5 contain the substring?
sub = 'clean'
any(sub in word for word in df5['bios'].iloc[5])

True

In [313]:
def check_word(sub, str_list):
    """Return True when ``sub`` occurs inside at least one item of ``str_list``.

    Matching is by containment (``sub in item``), so 'clean' also matches
    'cleaner'. An empty ``str_list`` yields False.
    """
    for candidate in str_list:
        if sub in candidate:
            return True
    return False

In [314]:
# Collect the names (first column) of every row whose token list (second
# column) contains a token with 'clean' in it.
df6 = df5.copy()
com_str = [
    name
    for name, words in zip(df6.iloc[:, 0], df6.iloc[:, 1])
    if check_word('clean', words)
]
print(com_str)

['al goodwin', 'henry cho', 'kermet apio', 'louie anderson', 'nate bargatze', 'rocky laporte', 'ryan hamilton']


In [319]:
import os
import pickle

# Persist df6 in the current working directory, once as CSV and once as a pickle.
cwd = os.getcwd()
# to_csv returns None when it is given a path; export_csv is kept only so any
# later cell that references the name keeps working.
export_csv = df6.to_csv(os.path.join(cwd, 'cw.csv'), index=False, header=True)
# file_path is left pointing at the pickle because the following cell reloads from it.
file_path = os.path.join(cwd, 'cw.pkl')
df6.to_pickle(file_path)

In [324]:
# Reload the pickled frame and repeat the 'clean' lookup as a round-trip check.
# NOTE: read_pickle must only be used on files you trust.
df_cw = pd.read_pickle(file_path)
com_str = [
    name
    for name, words in zip(df_cw.iloc[:, 0], df_cw.iloc[:, 1])
    if check_word('clean', words)
]
print(com_str)

['al goodwin', 'henry cho', 'kermet apio', 'louie anderson', 'nate bargatze', 'rocky laporte', 'ryan hamilton']


[]
