In [1]:
import re
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [197]:
BASE_URL = "http://www.skatevideosite.com/index.php?page=skatevideos&sort=rating&p={}"
dict_keys = ['company', 'filmmaker', 'year', 'country']

In [208]:
def load_soup(base_url):
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

def get_video_urls(soup):
    skate_videos = soup.find_all('table', {"id": "skatevideos"})
    trs = skate_videos[0].find_all('td')
    tr_links = [tr.find_all('a') for tr in trs]
    videos_info = []
    for tr in tr_links:
        try:
            videos_info.append(tr[0].find_all('a', href=True))
        except: 
            continue
    video_urls = []
    for vid in videos_info:
        try:
            video_urls.append(vid[0]['href'])
        except:
            continue
    return video_urls

def get_skaters(soup):
    skater_links = soup.find_all('div', {"id": "skaterlist"})[0].find_all('a')
    return [a.text for a in skater_links]

# skater, video, company, song, length, year, 
def get_video_info(soup):
    video_dict = {k:[] for k in dict_keys}
    for a in soup.find_all('table', class_="videoinfo")[0].find_all('a'):
        if "colspan" in a.attrs:
            continue
        else:
            href = a['href']
            if 'companies' in href:
                video_dict['company'] = a.text
            elif 'filmmakers' in href:
                video_dict['filmmaker'] = a.text
            elif 'year' in href:
                video_dict['year'] = a.text
            elif 'countries' in href:
                video_dict['country'] = a.text
    video_df = pd.DataFrame(list(video_dict.items())).set_index(0).T
    return video_df

def get_title(soup):
    return soup.find_all('div', class_='twelve columns')[0].find('h1').text

def make_video_df(soup):
    lst_col = 'skater'
    skaters = get_skaters(soup)
    df = get_video_info(soup)
    video_title = get_title(soup)
    df['title'] = video_title
    df[lst_col] = [skaters]
    video_df = pd.DataFrame({
          col:np.repeat(df[col].values, df[lst_col].str.len())
          for col in df.columns.drop(lst_col)}
        ).assign(**{lst_col:np.concatenate(df[lst_col].values)})[df.columns]
    return video_df

def make_videos_info(video_urls):
    videos_info = []
    for vid in video_urls:
        print(vid)
        full_vid_url = SITE_BASE_URL.format(video=vid)
        video_soup = load_soup(full_vid_url)
        try:
            video_df = make_video_df(video_soup)
            videos_info.append(video_df)
        except:
            continue
    return videos_info

def scrape_page(url):
    soup = load_soup(url)
    video_urls = get_video_urls(soup)
    videos_info = make_videos_info(video_urls)
    all_videos_df = pd.concat(videos_info)
    return all_videos_df

def scrape_all_videos(page_start, page_end):    
    all_video_dfs = []
    for i in range(page_start, page_end):
        print('Scraping page :', i)
        print()
        current_page = BASE_URL.format(i)
        page_scraped_df = scrape_page(current_page)
        all_video_dfs.append(page_scraped_df)
    all_video_dfs = pd.concat(all_video_dfs)
    return all_video_dfs

In [209]:
# scrape info for all videos on pages 1-18 (i.e. every video)
all_videos_df = scrape_all_videos(1, 19)

Scraping page : 1

/skatevideos/destroying-babylon/soundtrack#online
/skatevideos/loony-bin/soundtrack#online
/skatevideos/atlantic-drift/soundtrack#videoparts
/skatevideos/sour-the-sour-solution-ii/soundtrack#online
/skatevideos/europe-co-autobahn/soundtrack#online
/skatevideos/the-watched/soundtrack#videoparts
/skatevideos/pyramid-country-vessel-in-passing/soundtrack#online
/skatevideos/polar-we-blew-it-at-some-point/soundtrack#online
/skatevideos/palace-palasonic/soundtrack#online
/skatevideos/polar-i-like-it-here-inside-my-mind-dont-wake-me-this-time/soundtrack#online
/skatevideos/lurknyc-strangers/soundtrack#online
/skatevideos/2bananer-og-1snickers/soundtrack#online
/skatevideos/double-dip/soundtrack#online
/skatevideos/paramount/soundtrack#online
/skatevideos/the-917-video/online
/skatevideos/lakai-fully-flared/soundtrack#online
/skatevideos/boyish/soundtrack#online
/skatevideos/cee--lo/soundtrack#online
/skatevideos/meet-the-stans/soundtrack#online
/skatevideos/gx1000/soundtrac

/skatevideos/creature-hesh-law/soundtrack#online
Scraping page : 3

/skatevideos/union-soyuz-11/soundtrack#online
/skatevideos/bronze-56k/soundtrack#online
/skatevideos/plan-b-torey-pudwills-big-bang/soundtrack#online
/skatevideos/mystery-color-theory/soundtrack#videoparts
/skatevideos/le-theatrix/soundtrack#videoparts
/skatevideos/de-honderd-video/soundtrack#online
/skatevideos/dimestore-the-dime-video/soundtrack#online
/skatevideos/newsoul-you-got-soul/soundtrack#online
/skatevideos/farsh/soundtrack#online
/skatevideos/color/soundtrack#online
/skatevideos/tha-mustard-connection/soundtrack#videoparts
/skatevideos/kfc-television/soundtrack#videoparts
/skatevideos/broke-am/soundtrack#online
/skatevideos/uso-publico-video-1/soundtrack#online
/skatevideos/creature-csfu/soundtrack#online
/skatevideos/ill-conceived/soundtrack#videoparts
/skatevideos/bath-salts/soundtrack#videoparts
/skatevideos/alltimers-no-idea/soundtrack#online
/skatevideos/bronze-its-time/soundtrack#online
/skatevideos/r

/skatevideos/sf-hill-street-blues-2/soundtrack#online
/skatevideos/world-industries-rubbish-heap/soundtrack#online
/skatevideos/nashdah/online
/skatevideos/labor-of-love/soundtrack#videoparts
/skatevideos/north-coast/soundtrack#online
/skatevideos/tildeth/soundtrack#online
/skatevideos/weird-dimension/soundtrack#online
/skatevideos/blueprint-lost-and-found/soundtrack#online
/skatevideos/flip-really-sorry/soundtrack#online
/skatevideos/pitcrew-where-im-from/soundtrack#videoparts
/skatevideos/10000-kilometers/soundtrack#online
/skatevideos/jobs-never/soundtrack#online
/skatevideos/btl-4/soundtrack#online
/skatevideos/thrasher-king-of-the-road-2006/soundtrack#online
/skatevideos/toy-machine-good-and-evil/soundtrack#online
/skatevideos/you-know-good-man-3/soundtrack#online
/skatevideos/sos-crew-save-our-souls/soundtrack#online
/skatevideos/shades-away/soundtrack#videoparts
/skatevideos/chickenbone-nowison/soundtrack#online
/skatevideos/girl-anti-hero-beauty-and-the-beast-northwest-tour/sou

/skatevideos/bolts-and-buttah/soundtrack#online
/skatevideos/meatpauls/soundtrack#online
/skatevideos/boys-of-summer-2/soundtrack#online
/skatevideos/303-let-the-good-times-roll/soundtrack#online
/skatevideos/dc-europe-where-eu-at/soundtrack#online
/skatevideos/pig-slaughterhouse/soundtrack#online
/skatevideos/transworld-feedback/soundtrack#online
/skatevideos/delight-the-kacki-video/soundtrack#online
/skatevideos/osiris-capital-tour/soundtrack#online
/skatevideos/streetmarket-youth/soundtrack#online
/skatevideos/ogres-and-milk-2-back-with-wax/soundtrack#online
/skatevideos/ride-everywhere/online
/skatevideos/jakes-junt/soundtrack#online
/skatevideos/volcom--europe-summer-tour/soundtrack#online
/skatevideos/street-cookin/soundtrack#online
/skatevideos/birdhouse-the-beginning/soundtrack#online
/skatevideos/still-rollin/soundtrack#online
/skatevideos/sketchbook/soundtrack#online
/skatevideos/toy-machine-jump-off-a-building/soundtrack#online
/skatevideos/sk8mafia-best-of-2010/online
/skat

/skatevideos/lakai-am-i-am/soundtrack#online
/skatevideos/411vm-hot-dogs-on-wheels/soundtrack#online
/skatevideos/maple-black-cat/soundtrack#videoparts
/skatevideos/the-berrics-recruits/soundtrack#online
/skatevideos/nike-sb-two-up/soundtrack#online
/skatevideos/element-bams-or-bust/soundtrack#online
/skatevideos/sabotage2/soundtrack#videoparts
/skatevideos/darkstar-forward-slash/soundtrack#online
/skatevideos/411vm-europe-2002/soundtrack#online
/skatevideos/shapes-and-shadows/soundtrack#online
/skatevideos/dc-european-collective-tour/soundtrack#online
/skatevideos/haven-hip-to-the-jive/soundtrack#videoparts
/skatevideos/exit-real-world-scope-this-too/soundtrack#videoparts
/skatevideos/luvideo-mag-nozebodynose/soundtrack#online
/skatevideos/habitat-continental-caravan/soundtrack#online
/skatevideos/katslauw/soundtrack#online
/skatevideos/supreme-a-love-supreme/soundtrack#online
/skatevideos/iou/soundtrack#online
/skatevideos/plan-b-vamdalism/soundtrack#online
/skatevideos/ekta-telecine

/skatevideos/thrasher-soty-naawwsty/soundtrack#online
/skatevideos/baker-certi-fried-pro-rowan-zorilla/soundtrack#online
/skatevideos/adidas-korean-dance/soundtrack#online
/skatevideos/vicious-cycle/soundtrack#online
/skatevideos/stussy-it-aint-where-ya-from-its-where-ya-at/soundtrack#online
/skatevideos/es-europe-tour-2003/soundtrack#videoparts
/skatevideos/closure/soundtrack#online
/skatevideos/birdhouse-tour-du-monde-2010/soundtrack#online
/skatevideos/pitcrew-feels-like-the-first-time/soundtrack#videoparts
/skatevideos/hot-wax/soundtrack#videoparts
/skatevideos/411vm-australian-vacation/soundtrack#online
/skatevideos/411vm-volume-14-issue-2/soundtrack#online
/skatevideos/the-coast/soundtrack#online
/skatevideos/east-coast/soundtrack#online
/skatevideos/get-familiar/soundtrack#online
/skatevideos/trouble-shooters/soundtrack#online
/skatevideos/cut-outs-of-florida/soundtrack#online
/skatevideos/puzzle-video-march-april-2008/online
/skatevideos/kayo-keho-canada-tour/soundtrack#online


/skatevideos/on-video-spring-2002/soundtrack#online
/skatevideos/hello21-issue-6/soundtrack#online
/skatevideos/101-wwii-report-promo/soundtrack#online
/skatevideos/red-bull-seek-and-destroy-2/soundtrack#online
/skatevideos/chocolate-se-habla-canuck/soundtrack#online
/skatevideos/plan-b-in-dominican-republic/soundtrack#videoparts
/skatevideos/dgk-saved/soundtrack#online
/skatevideos/world-industries-new-world-order/soundtrack#online
/skatevideos/sheep-life-of-leisure/soundtrack#online
/skatevideos/expedition-one-video-out-soon/soundtrack#online
/skatevideos/h-street-lick/soundtrack#online
/skatevideos/rolkafilm-kriza-3/soundtrack#online
/skatevideos/street-survival/soundtrack#online
/skatevideos/emerica-brandon-westgate-new-shoe-new-part/soundtrack#online
Scraping page : 16

/skatevideos/411vm-brazilian-vacation/soundtrack#online
/skatevideos/es-tri-x-northwest-trip/soundtrack#online
/skatevideos/puzzle-video-summer-2006/soundtrack#online
/skatevideos/16-below/soundtrack#online
/skatev

In [210]:
all_videos_df

Unnamed: 0,company,filmmaker,year,country,title,skater
0,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Adrian Mallory
1,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Alex Fatemi
2,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Alex Turan
3,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Andrew Wenckstern
4,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Antonio Devitt
5,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Ben Paulsrud
6,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Brandon Dwyer
7,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Brendan Cahill
8,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Chad Wilson
9,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Chris Colbourn


In [214]:
all_videos_df.columns

Index(['company', 'filmmaker', 'year', 'country', 'title', 'skater'], dtype='object')

In [251]:
def resolve_name(name):
    if name == []:
        return ''
    return name

all_videos_df['company'] = all_videos_df.company.apply(lambda x: resolve_name(x))
all_videos_df['year'] = all_videos_df.year.apply(lambda x: resolve_name(x))
all_videos_df['title'] = all_videos_df.title.apply(lambda x: resolve_name(x))
all_videos_df['filmmaker'] = all_videos_df.filmmaker.apply(lambda x: resolve_name(x))
all_videos_df['country'] = all_videos_df.country.apply(lambda x: resolve_name(x))

In [252]:
# sort videos descending
all_videos_df.sort_values(by = ['company', 'year', 'title'], ascending=False, inplace=True)

In [256]:
all_videos_dfs.to_csv('skaters_and_videos.csv', index=False)

In [257]:
all_video_dfs

Unnamed: 0,company,filmmaker,year,country,title,skater
0,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Adrian Mallory
1,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Alex Fatemi
2,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Alex Turan
3,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Andrew Wenckstern
4,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Antonio Devitt
5,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Ben Paulsrud
6,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Brandon Dwyer
7,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Brendan Cahill
8,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Chad Wilson
9,[],Travis Knapp-Prasek,2010,United States,Destroying Babylon,Chris Colbourn
