---
# Determine which episodes each Shark was present for
---

In [96]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import requests
import json

## Load the Shark Tank data

In [145]:
# https://www.quora.com/What-have-you-learned-from-watching-the-television-program-Shark-Tank/answer/Halle-Tecco
df = pd.read_csv('./data/SharkTankData.csv').dropna(subset=['Entrepreneur Gender'])
df.rename(columns = {'Entrepreneur Gender':'gender'}, inplace = True)
df = df[df.gender != 'Mixed Team']

In [146]:
# Add columns to dataframe to record Shark attendance
shark_names = {'Mark': 'Cuban', 'Daymond': 'John', 'Barbara': 'Corcoran', 'Robert': 'Herjavec', 'Kevin': "O'Leary", 'Lori': 'Greiner'}
# shark_names = dict(zip(shark_names.values(), shark_names.keys()))
for val in shark_names.values():
    df[val+'_present'] = pd.Series(np.zeros(df.shape[0]), index=df.index)

## Scrape the Wikipedia page

In [27]:
# Get the HTML for the Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_Shark_Tank_episodes'
page = requests.get(url)
html = bs(page.text, 'html.parser')

In [46]:
# Get the season descriptions
seasons = html.find_all('table', class_="wikitable plainrowheaders wikiepisodetable")

In [161]:
shark_names['John']

'Daymond'

In [177]:
shark_names

{'Mark': 'Cuban',
 'Daymond': 'John',
 'Barbara': 'Corcoran',
 'Robert': 'Herjavec',
 'Kevin': "O'Leary",
 'Lori': 'Greiner'}

In [178]:
sharks

['Mark', 'Barbara', 'Kevin', 'Lori', 'Robert']

In [175]:
for num_season,season in enumerate(seasons[:5]):
    print("-"*30)
    print("Season: {}".format(num_season+1))
    episodes = season.find_all('td', class_='description')
    for num_episode,episode in enumerate(episodes):
        # Determine which Sharks were on the episode
        if num_season==0:
            sharks = ['Daymond','Kevin','Barbara','Robert']
        else:
            sharks = episode.text.split('Sharks: ')[1].split('\n')[0].split(', ')
            sharks = [shark.split(' ')[0] for shark in sharks]
            
        # Update Shark attendance rates in the dataframe
        mask_episode = (df.Season==num_season+1) & (df['No. in series']==num_episode+1)
        for shark in sharks:
            if shark in shark_names:
                df.loc[mask_episode, shark_names[shark]+'_present'] = 1

------------------------------
Season: 1
------------------------------
Season: 2
------------------------------
Season: 3
------------------------------
Season: 4
------------------------------
Season: 5


In [179]:
df.to_csv('SharkTankData_WithAttendance.csv')

In [92]:
# First and last names of each shark
shark_counts = dict(zip(shark_names.keys(),[0]*len(shark_names)))

# Count the number of times each shark appeared on the show
episode_count = 0
for n,season in enumerate(seasons[:5]):
    print("-"*30)
    print("Season #{n}".format(n=n+1))
    rows = season.find_all('tr', class_='vevent')
    episodes = season.find_all('td', class_='description')
    for row,episode in zip(rows,episodes):
        episode_count += 1
#         print("Episode: {}".format(row.text.splitlines()[1]))
        if n==0:
            sharks = ['Daymond',"O'Leary",'Corcoran','Herjavec']
        else:
            sharks = episode.text.split('Sharks: ')[1].split('\n')[0].split(', ')
        for shark in sharks:
            if shark in shark_counts:
                shark_counts[shark] += 1

------------------------------
Season #1
------------------------------
Season #2
------------------------------
Season #3
------------------------------
Season #4
------------------------------
Season #5


In [93]:
episode_count

93

In [94]:
shark_counts

{'Mark': 73,
 'Daymond': 80,
 'Barbara': 51,
 'Robert': 76,
 'Kevin': 73,
 'Lori': 41}

In [48]:
episode.text.split('Sharks: ')[1].split('\n')[0].split(', ')

['Mark', 'Daymond', 'Kevin', 'Barbara', 'Robert']