In [1]:
import pandas as pd
import logging
from concurrent.futures import ThreadPoolExecutor
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)
import json

### Scraping data

In [2]:
import get_wikipedia_links

link_scrape = get_wikipedia_links.get_wiki_links()
link_scrape.fit(2005,2019)

2019-05-25 16:57:21,296 Getting wikipedia links...
2019-05-25 16:57:40,163 Clean wikipedia URLs...
2019-05-25 16:57:40,172 Setting up dataframes...
2019-05-25 16:57:40,210 Complete!


In [2]:
df_wiki = pd.read_csv('data/wikipedia_links.csv')
df_wiki = df_wiki.drop_duplicates()

In [4]:
# cache a pickled file for results, run a crawler with multiple workers
def crawl_raw_data(workers, function, urls):
    with ThreadPoolExecutor(max_workers=workers) as executor:       
        return executor.map(function, urls)

In [5]:
from wikipedia import get_wiki_links, get_wiki_pageviews

wiki_links = []
wiki_pageviews = []
for url, title in zip(df_wiki.wiki_url, df_wiki.clean_wiki_url):
    wiki_links.append(get_wiki_links(url))
    wiki_pageviews.append(get_wiki_pageviews(title))

df_wiki_links = pd.DataFrame(wiki_links).set_index(['wiki_url'])
df_wiki_pageviews = pd.concat([
    pd.DataFrame.from_dict(title_dict,orient='index') 
    for title_dict in wiki_pageviews], axis=1, sort=False)

df_wiki_pageviews.to_csv('data/wiki_pageviews_data.csv', sep=',',index=True)
df_wiki_links.to_csv('data/wiki_links_data.csv', sep=',', index=True)

ERROR while fetching and parsing ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Dirty_Sanchez_%28TV_series%29%23Dirty_Sanchez%3A_The_Movie/daily/2013010100/2019052600']


Traceback (most recent call last):
  File "c:\users\kylem\appdata\local\programs\python\python37\lib\site-packages\mwviews\api\pageviews.py", line 145, in article_views
    'The pageview API returned nothing useful at: {}'.format(urls)
Exception: The pageview API returned nothing useful at: ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Dirty_Sanchez_%28TV_series%29%23Dirty_Sanchez%3A_The_Movie/daily/2013010100/2019052600']


ERROR while fetching and parsing ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Tony_n%27_Tina%27s_Wedding%23Film_adaptation/daily/2013010100/2019052600']


Traceback (most recent call last):
  File "c:\users\kylem\appdata\local\programs\python\python37\lib\site-packages\mwviews\api\pageviews.py", line 145, in article_views
    'The pageview API returned nothing useful at: {}'.format(urls)
Exception: The pageview API returned nothing useful at: ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Tony_n%27_Tina%27s_Wedding%23Film_adaptation/daily/2013010100/2019052600']


ERROR while fetching and parsing ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/No%25C3%25ABlle_%282007_film%29/daily/2013010100/2019052600']


Traceback (most recent call last):
  File "c:\users\kylem\appdata\local\programs\python\python37\lib\site-packages\mwviews\api\pageviews.py", line 145, in article_views
    'The pageview API returned nothing useful at: {}'.format(urls)
Exception: The pageview API returned nothing useful at: ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/No%25C3%25ABlle_%282007_film%29/daily/2013010100/2019052600']


ERROR while fetching and parsing ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/The_World_Made_Straight%23Film_adaptation/daily/2013010100/2019052600']


Traceback (most recent call last):
  File "c:\users\kylem\appdata\local\programs\python\python37\lib\site-packages\mwviews\api\pageviews.py", line 145, in article_views
    'The pageview API returned nothing useful at: {}'.format(urls)
Exception: The pageview API returned nothing useful at: ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/The_World_Made_Straight%23Film_adaptation/daily/2013010100/2019052600']


In [6]:
from bomojo import get_data, clean_df

df_wiki_links_dropna = df_wiki_links.dropna(subset=['bomojo_link'])
df_bomojo = pd.DataFrame(crawl_raw_data(5, get_data, df_wiki_links_dropna.bomojo_link))
df_bomojo.index = df_wiki_links_dropna.index

df_bomojo = clean_df(df_bomojo)
df_bomojo.to_csv('data/bomojo_data.csv',sep=',',index=True)

In [7]:
from rottentomatoes import get_score

df_wiki_links_dropna = df_wiki_links.dropna(subset=['rottentomatoes_link'])
df_rt = pd.DataFrame(crawl_raw_data(10, get_score, df_wiki_links_dropna.rottentomatoes_link))
df_rt.index = df_wiki_links_dropna.index

df_rt.to_csv('data/rottentomatoes_data.csv',sep=',',index=True)

In [8]:
from metacritics import get_score

df_wiki_links_dropna = df_wiki_links.dropna(subset=['metacritic_link'])
df_mc = pd.DataFrame(crawl_raw_data(10, get_score, df_wiki_links_dropna.metacritic_link))
df_mc.index = df_wiki_links_dropna.index
    
df_mc.to_csv('data/metacritic_data.csv',sep=',',index=True)

In [9]:
from themoviedb import run_tmdb, json_dump

df_wiki_links_dropna = df_wiki_links.dropna(subset=['imdb_link'])
df_wiki_links_dropna['imdb_link'] = df_wiki_links_dropna.imdb_link.apply(lambda x: x.split('title/')[1].replace('/',''))
df_tmdb = pd.DataFrame(crawl_raw_data(5, run_tmdb, df_wiki_links_dropna.imdb_link))
df_tmdb.index = df_wiki_links_dropna.index

json_dump(df_tmdb,'genres','credits','keywords','production_companies','production_countries','video_stats')
df_tmdb.to_csv('data/themoviedb_data.csv', sep=',',index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


### Comments