# Scraping My Anime List

In [1]:
import pandas as pd
import xmltodict
import json
import requests
import xml.etree.ElementTree as ET
import urllib.parse
from xml.sax.saxutils import unescape
from time import sleep
import re

In [391]:
CONFIG = {}

with open('mal_config.txt', "r") as in_file:
    for line in in_file:
        line = line.split(":")
        parameter = line[0].strip()
        value = line[1].strip()
        CONFIG[parameter] = value

In [392]:
sesh = requests.Session()
sesh.auth = (CONFIG['username'], CONFIG['password'])

In [393]:
def html_decode(s):
    """
    Returns the ASCII decoded version of the given HTML string. This does
    NOT remove normal HTML tags like <p>.
    """
    htmlCodes = (
            ("'", '&#39;'),
            ("'", '&#039;'),
            ("'", '&rsquo;'),
            ("—", '&mdash;'),
            ("é", '&eacute;'),
            ('"', '&quot;'),
            ('>', '&gt;'),
            ('<', '&lt;'),
            ('&', '&amp;')
        )
    for code in htmlCodes:
        s = s.replace(code[1], code[0], 1000000)
    return s

In [394]:
def readPage(session, search_page_url):
    response = session.get(search_page_url)
    #print(response.text)
    page = response.text
    try:
        x = xmltodict.parse(page)
    except:
        return None
    anime_entries = x['anime']['entry']
    num_entries_list = list(range(len(anime_entries)))
    j = json.dumps(anime_entries) 
    
    #return j
    try:
        df = pd.read_json(j, orient='columns')

    except ValueError:
        df = pd.read_json(j, typ='series', orient='columns')
        df = pd.DataFrame(df).transpose()
    try:
        df.synopsis = df.synopsis.apply(lambda x: html_decode(x))
    except:
        pass
    
    return df

In [395]:
test_url = 'http://myanimelist.net/api/anime/search.xml?q=bleach'

In [396]:
#readPage(sesh, test_url)

In [397]:
prescraped_names = pd.read_csv('shows_with_ids.csv')
prescraped_names.set_index('id', inplace=True)

In [398]:
prescraped_names.head()

Unnamed: 0_level_0,name,rating,rank,popularity,members,favorites
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5114,Fullmetal Alchemist: Brotherhood,9.24,1,6,384629,39327
9253,Steins;Gate,9.18,2,16,304291,33112
4181,Clannad: After Story,9.15,3,31,258088,26773
11061,Hunter x Hunter (2011),9.14,4,127,134742,12585
9969,Gintama,9.14,5,362,71698,2306


In [399]:
url_head = 'http://myanimelist.net/api/anime/search.xml?q='
full_df = pd.DataFrame()

for show in list(prescraped_names.name)[:]:
    formatted_title = urllib.parse.quote(show)
    page_data = readPage(sesh,url_head + formatted_title)
    try:
        page_data = page_data.set_index('id')
    except:
        continue
        
    full_df = pd.concat([full_df, page_data])
        
    sleep(.1)

In [414]:
full_df.reset_index(inplace=True)

In [415]:
full_df.drop_duplicates(subset='id', inplace=True)

In [426]:
full_df.head(3)

Unnamed: 0_level_0,index,end_date,english,episodes,image,score,start_date,status,synonyms,synopsis,title,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,1999-04-24,Cowboy Bebop,26,http://cdn.myanimelist.net/images/anime/4/1964...,8.83,1998-04-03,Finished Airing,,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,TV
5,1,2001-09-01,Cowboy Bebop: Knockin' on Heaven's Door,1,http://cdn.myanimelist.net/images/anime/6/1433...,8.41,2001-09-01,Finished Airing,Cowboy Bebop Movie,"As the Cowboy Bebop crew travels the stars, th...",Cowboy Bebop: Tengoku no Tobira,Movie
6,3,1998-09-30,Trigun,26,http://cdn.myanimelist.net/images/anime/7/2031...,8.32,1998-04-01,Finished Airing,,Vash the Stampede is a wanted man with a habit...,Trigun,TV


In [417]:
full_df.reset_index(inplace=True)
prescraped_names.reset_index(inplace=True)
full_df.id = full_df.id.astype('int')
full_df = full_df.set_index('id')
prescraped_names = prescraped_names.set_index('id')
full_df.sort_index(inplace=True)
prescraped_names.sort_index(inplace=True)
#full_df.drop('index', axis=1, inplace=True)

In [419]:
merged_data = pd.concat([full_df, prescraped_names], axis = 1, join='inner')

In [420]:
merged_data = merged_data.dropna(subset=['image'])

In [421]:
merged_data.reset_index(inplace=True)

In [422]:
merged_data.head(3)

Unnamed: 0,id,index,end_date,english,episodes,image,score,start_date,status,synonyms,synopsis,title,type,name,rating,rank,popularity,members,favorites
0,1,0,1999-04-24,Cowboy Bebop,26,http://cdn.myanimelist.net/images/anime/4/1964...,8.83,1998-04-03,Finished Airing,,"In the year 2071, humanity has colonized sever...",Cowboy Bebop,TV,Cowboy Bebop,8.83,20,27,266713,17715
1,5,1,2001-09-01,Cowboy Bebop: Knockin' on Heaven's Door,1,http://cdn.myanimelist.net/images/anime/6/1433...,8.41,2001-09-01,Finished Airing,Cowboy Bebop Movie,"As the Cowboy Bebop crew travels the stars, th...",Cowboy Bebop: Tengoku no Tobira,Movie,Cowboy Bebop: Tengoku no Tobira,8.41,145,257,91771,374
2,6,3,1998-09-30,Trigun,26,http://cdn.myanimelist.net/images/anime/7/2031...,8.32,1998-04-01,Finished Airing,,Vash the Stampede is a wanted man with a habit...,Trigun,TV,Trigun,8.34,191,72,175287,5575


In [444]:
wanted_data = merged_data[['title',
                           'id',
                           'episodes',
                           'score',
                           'rank',
                           'popularity',
                           'members',
                           'favorites',
                           'start_date',
                           'end_date',
                           'status',
                           'type',
                           'synopsis']]

In [475]:
wanted_data.synopsis = wanted_data.synopsis.astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [476]:
wanted_data.head(3)

Unnamed: 0,title,id,episodes,score,rank,popularity,members,favorites,start_date,end_date,status,type,synopsis
0,Cowboy Bebop,1,26,8.83,20,27,266713,17715,1998-04-03,1999-04-24,Finished Airing,TV,"In the year 2071, humanity has colonized sever..."
1,Cowboy Bebop: Tengoku no Tobira,5,1,8.41,145,257,91771,374,2001-09-01,2001-09-01,Finished Airing,Movie,"As the Cowboy Bebop crew travels the stars, th..."
2,Trigun,6,26,8.32,191,72,175287,5575,1998-04-01,1998-09-30,Finished Airing,TV,Vash the Stampede is a wanted man with a habit...


In [477]:
wanted_data.to_csv('full_anime_data_set', index=False)

## Cleaning up wanted data

In [478]:
synops = list(wanted_data.synopsis)

In [501]:
#synops

In [493]:
for i, item in enumerate(synops):
    if item == None:
        continue
    else:
        if type(item) is float:
            print(item)
        item = re.sub('\<.*?\>', '', item)
        item = re.sub('\[.*?\]', '', item)
        item = re.sub('\(.*?\)', '', item)
        item = re.sub('\{.*?\}', '', item)
        item = ' '.join(item.split())
        item = item.replace('\\', '', 100)
        synops[i] = item
    


In [509]:
wanted_data.synopsis = synops

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [510]:
wanted_data.to_csv('full_anime_data_set.csv', index=False)