In [130]:
!pip install selenium

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import numpy as np
import time

# get a reference to the download menu. This will run before the page has 
# finished loading, so we stick it in a while loop and just keep looping
# until we're successful.
def find_by_id(element_id):
    while True:
        try:
            target = driver.find_element_by_id(element_id)
        except NoSuchElementException:
            time.sleep(0.2)
            continue
        else:
            break
    return target

def find_by_class(element_class):
    while True:
        try:
            target = driver.find_element_by_class_name(element_class)
        except NoSuchElementException:
            time.sleep(0.2)
            continue
        else:
            break
    return target

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless') -- using headless mode currently leads to NoSuchElementException 
driver = webdriver.Chrome('./chromedriver', options=options)



In [131]:
def page_scrape(df, link, net_df):
    
    
    malrow = {}
    
    ### MAIN PAGE ###

    
    #driver.get(link)  
    
    #0. Title
    wrapper_region = find_by_class("wrapper")
    title_region = wrapper_region.find_element_by_tag_name("h1")
    malrow["Title"] = title_region.text
    
    # Hyperlink
    malrow['Hyperlink'] = '=HYPERLINK("%s", "%s")' % (link.format(malrow["Title"]), malrow["Title"])

    # 1. Information Left Side
    wrapper_region = find_by_class("wrapper")
    content_region = wrapper_region.find_element_by_id("content")
    bord_region = content_region.find_element_by_class_name("borderClass")
    sty = bord_region.find_element_by_tag_name('div')
    rows = sty.find_elements_by_tag_name('div')

    def duration(st):
        minutes = 0
        if "sec" in st:
            minutes = int(st.split(" sec")[0]) / 60
            return minutes
        elif ("hr" in st) and ("min" in st):
            minutes = int(st.split(" hr")[0]) * 60
            rest = st.split(" hr")[1][2:]
            return int(rest.split(" min")[0]) + minutes
        elif "hr" in st:
            return int(st.split(" hr")[0]) * 60
        elif st == "Unknown":
            return None
        elif "min" in st:
            return int(st.split(" min")[0]) + minutes

    
    def raw2int(index, cutoff):
        if "A" in (rows[-(index)].text[cutoff:].replace(',', '')):
            return None
        return int(rows[-(index)].text[cutoff:].replace(',', ''))
    
    while (rows[-1].text[:10] != 'Favorites:'):
        rows.remove(rows[-1])

    malrow["Source"] = rows[-11].text[8:]
    malrow["Genres"] = rows[-10].text.split(": ")[1:][0].split(", ")
    malrow["Duration"] = duration(rows[-9].text[10:])
    
    if ("A" in rows[-7].text[7:]):
        malrow["ScoredCount"] = None
        malrow["Score"] = None
    else:
        malrow["ScoredCount"] = int(rows[-7].text[7:].split(" (")[1].split("by ")[1].split(" users")[0].replace(',', ''))
        malrow["Score"] = float(rows[-7].text[7:].split(" (")[0])
    
    malrow["Ranked"] = raw2int(5, 9)
    malrow["Popularity"] = raw2int(3, 13)
    malrow["Members"] = raw2int(2, 9)
    malrow["Favorites"] = raw2int(1, 10)
    
    for row in rows:
        tag = row.text.split(":", 1)[0]
        try:
            content = row.text.split(": ", 1)[1]
        except:
            content = ""

        if tag == "Episodes":
            malrow["Episodes"] = content
            if "Unknown" not in malrow["Episodes"]:
                malrow["Episodes"] = int(malrow["Episodes"])
        elif tag == "Type":
            malrow["Type"] = content
        elif tag == "Premiered":
            premier_split = content.split(" ")
            malrow["Premiered"] = content
            try:
                malrow["Premiered Year"] = premier_split[1]    
                malrow["Premiered Season"] = premier_split[0]
            except:
                pass
            
        elif tag == "Aired":
            aired_split = content.split(" to ")
            malrow["Aired"] = content
            malrow["Start Year"] = aired_split[0][-4:]
            try:
                malrow["End Year"] = aired_split[1][-4:]
            except:
                malrow["End Year"] = malrow["Start Year"]
        elif tag == "Broadcast":
            malrow["Broadcast"] = content
        elif tag == "Studios":
            malrow["Studios"] = content
        elif tag == "English":
            malrow["English Alternative Names"] = content
        elif tag == "Japanese":
            malrow["Japanese Alternative Names"] = content
        elif tag == "Synonyms":
            malrow["Synonyms"] = content

    # 2A. Information Right Side - Synopsis
    wrapper_region = find_by_class("wrapper")
    content_region = wrapper_region.find_element_by_id("content")
    bord_region = content_region.find_element_by_class_name("js-scrollfix-bottom-rel")
    synopsis_region = bord_region.find_element_by_tag_name('p')
    malrow["Synopsis"] = synopsis_region.text
    
    '''
    # 2B. Information Right Side - Staff
    big_region = bord_region.find_elements_by_class_name('pb24')[-1]
    char_region = big_region.find_elements_by_tag_name("div")[4]
    va_region = char_region.find_elements_by_class_name('borderClass')

    k = 0
    va_list = []
    for entry in va_region:
        if (k % 3 == 2):
            va_list.append(entry.text.split("\n")[0])
        k += 1

    malrow["Voice Actors"] = va_list
    '''
    
    # 2C. Information Right Side - Prequel/Sequel
    big_region = bord_region.find_elements_by_class_name('pb24')[-1]
    malrow["First"] = True
    
    preq_include = False
    seq_include = False
    try:
        char_region = big_region.find_element_by_class_name("anime_detail_related_anime")
        related_region = char_region.find_elements_by_tag_name('tr')

        malrow["Prequel"] = None
        malrow["Sequel"] = None
        for i in related_region:
            if ("Prequel:" in i.text):
                malrow["Prequel"] = i.text[9:]
                malrow["First"] = False
                preq_include = True
            if ("Sequel: " in i.text):
                malrow["Sequel"] = i.text[8:]
                seq_include = True
    except:
        malrow["Prequel"] = None
        malrow["Sequel"] = None
    
    
    time.sleep(np.random.rand() * 3 + 1)
    
    ### STATS ###
    
    # 3. Stats
    driver.get(link + "/stats")

    wrapper_region = find_by_class("wrapper")
    content_region = wrapper_region.find_element_by_id("content")
    bord_region = content_region.find_element_by_class_name("js-scrollfix-bottom-rel")
    stat_region = bord_region.find_elements_by_class_name('spaceit_pad')


    def stat2int(index, cutoff):
        return int(stat_region[index].text[cutoff:].replace(',', ''))

    def stat2score(index):
        return int(stat_region[index].text.replace('v', '(').split(' (')[1])

    malrow["Watching"] = stat2int(0, 9)
    malrow["Completed"] = stat2int(1, 11)
    malrow["On-Hold"] = stat2int(2, 9)
    malrow["Dropped"] = stat2int(3, 9)
    malrow["Plan to Watch"] = stat2int(4, 15)
    malrow["Total"] = stat2int(5, 7)
    
    
    if (len(stat_region) == 16):
        sum = 0
        for i in range(1, 11):
            malrow[str(i)] = stat2score(16 - i)
            sum += stat2score(16 - i)
        malrow["ScoredCount"] = sum
        for i in range(1, 11):
            malrow[str(i)] = malrow[str(i)] / sum * 100
    else:
        for i in range(1, 11):
            malrow[str(i)] = None
        
    
    '''
    # 4. Episodes
    driver.get(link + "/episode")
    wrapper_region = find_by_class("wrapper")
    content_region = wrapper_region.find_element_by_id("content")
    bord_region = content_region.find_element_by_class_name("js-scrollfix-bottom-rel") ##
    stat_region = bord_region.find_element_by_tag_name('td')
    bs_region = stat_region.find_element_by_class_name("border_solid")
    ep_region = bs_region.find_elements_by_tag_name("span")[-1]
    malrow["Episodes"] = int(ep_region.text.replace(")", "/").split("/")[1])
    '''

    time.sleep(np.random.rand() * 3 + 2)
    
    # 5. Recommendations (net_df exclusive)
    driver.get(link + "/userrecs")
    wrapper_region = find_by_class("wrapper")
    content_region = wrapper_region.find_element_by_id("content")
    bord_region = content_region.find_element_by_class_name("js-scrollfix-bottom-rel")
    rec_region = bord_region.find_elements_by_css_selector("[class='borderClass']")

    sources = []
    targets = []
    weights = []
    weight_total_filler = []
    weight_total = 0

    for rec in rec_region:

        sources.append(malrow["Title"])
        targets.append(rec.find_elements_by_tag_name("div")[3].text.split(" add")[0])

        try:
            weights.append(int(rec.find_element_by_class_name("spaceit").text.split(" more")[0].split("by ")[1]))
        except:
            weights.append(1)
        weight_total += weights[-1]
        weight_total_filler.append(1)

    temp_df = pd.DataFrame(list(zip(sources, targets, weights, weights, weight_total_filler)), 
                           columns=["Source", "Target", "Weight", "Raw", "Weight Total"])
    temp_df["Weight"] = temp_df["Weight"] / weight_total
    temp_df["Weight Total"] = temp_df["Weight Total"] * weight_total

    if preq_include:
        net_df = net_df.append({"Source":malrow["Prequel"], "Target":malrow["Title"], "Weight":1, "Raw":-1, "Weight Total":weight_total}, ignore_index=True)
    if seq_include:
        net_df = net_df.append({"Source":malrow["Title"], "Target":malrow["Sequel"], "Weight":1, "Raw":-1, "Weight Total":weight_total}, ignore_index=True)
    
    net_df = net_df.append(temp_df, ignore_index = True)

    time.sleep(np.random.rand() * 3 + 2)
    
    # Final steps
    df = df.append(malrow, ignore_index=True)
    return df, net_df

In [132]:
def scrape_50(curr_value, net_df):
    link = "https://myanimelist.net/anime.php?cat=anime&q=&type=3&score=0&status=2&p=0&r=0&sm=0&sd=0&sy=0&em=0&ed=0&ey=0&c%5B0%5D=a&c%5B1%5D=b&c%5B2%5D=c&c%5B3%5D=f&gx=0&show="

    driver.get(link + str(curr_value)) #x shows down


    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(1)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    wrapper_region = find_by_class("wrapper")
    content_region = wrapper_region.find_element_by_id("content")
    bord_region = content_region.find_elements_by_tag_name("table")[2]
    show_region = bord_region.find_elements_by_xpath("//*[@class='hoverinfo_trigger fw-b fl-l']")

    stopper = 3
    
    ps_df = pd.DataFrame(columns= ['Title', 'Type', 'English Alternative Names', 'Synonyms', 'Japanese Alternative Names', 
       'Episodes', 'Score', 'Studios', 'Start Year', 'Premiered', 
       'Hyperlink', 'Synopsis', 'Source', 'Genres',
       'Duration', 'Ranked', 'Popularity', 'Members', 'Favorites', 'Watching',
       'Completed', 'On-Hold', 'Dropped', 'Plan to Watch', 'Total', '1', '2',
       '3', '4', '5', '6', '7', '8', '9', '10', 'ScoredCount', 'Aired',
       'Broadcast', 'End Year', 
       'Premiered Season',
       'Premiered Year', 'First', 'Prequel', 'Sequel'
    ])

    for show in show_region:
        if(stopper <= 0):
            break
        show.send_keys(Keys.CONTROL + Keys.ENTER)
        assert len(driver.window_handles) > 1
        driver.switch_to.window(driver.window_handles[1])

        ps_df, net_df = page_scrape(ps_df, driver.current_url, net_df)

        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(np.random.rand() * 3 + 2)

        #stopper -= 1
    
    return ps_df, net_df

In [139]:
curr_value = 1500
max_value = 3000

net_df = pd.DataFrame()
all_df = pd.DataFrame()

while (curr_value < max_value):
    
    ps_df, net_df = scrape_50(curr_value, net_df)
    
    all_df = pd.concat([all_df, ps_df])
    
    curr_value += 50


In [140]:
all_df = all_df.reset_index(drop = True)


In [141]:
all_df

Unnamed: 0,Title,Type,English Alternative Names,Synonyms,Japanese Alternative Names,Episodes,Score,Studios,Start Year,Premiered,...,10,ScoredCount,Aired,Broadcast,End Year,Premiered Season,Premiered Year,First,Prequel,Sequel
0,Kyojin no Hoshi: Shukumei no Taiketsu,Movie,,,巨人の星 宿命の対決,1,,"None found, add some",1970,,...,10.5263,38,"Aug 1, 1970",,1970,,,True,,
1,Kyoudai Koguma,Movie,The Bears Brothers,,兄弟こぐま,1,5.471,"None found, add some",1932,,...,2.06897,145,1932,,1932,,,True,,
2,Kyoukai no Kanata Movie 1: I'll Be Here - Kako...,Movie,Beyond the Boundary: I'll Be Here - Past,"Beyond the Boundary Movie, Kyokai no Kanata Movie",劇場版 境界の彼方 I'LL BE HERE 過去篇,1,7.721,Kyoto Animation,2015,,...,12.9752,57371,"Mar 14, 2015",,2015,,,True,,Kyoukai no Kanata Movie 2: I'll Be Here - Mira...
3,Kyoukai no Kanata Movie 2: I'll Be Here - Mira...,Movie,Beyond the Boundary: I'll Be Here - Future,"Beyond the Boundary Movie, Kyokai no Kanata Movie",劇場版 境界の彼方 I'LL BE HERE 未来篇,1,8.191,Kyoto Animation,2015,,...,17.3444,103342,"Apr 25, 2015",,2015,,,False,"Kyoukai no Kanata, Kyoukai no Kanata Movie 1: ...",
4,Kyouryoku Boukuusen,Movie,,,協力防空戦,1,5.081,"None found, add some",1942,,...,5.6,125,1942,,1942,,,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1488,Zounds,Movie,Zounds,,ZOUNDS,1,,"None found, add some",2003,,...,,,2003,,2003,,,True,,
1489,Zouressha ga Yatte Kita,Movie,,,象列車がやってきた,1,,Mushi Production,1992,,...,12.1951,41,"Jul 4, 1992",,1992,,,True,,
1490,Zunda Horizon,Movie,,"Wakate Animator Ikusei Project, 2017 Young Ani...",ずんだホライずん,1,5.621,"WAO World, Studio Live",2017,,...,6.34921,189,"Mar 11, 2017",,2017,,,True,,
1491,Zuori Qing Kong,Movie,Crystal Sky of Yesterday,Zuo Ri Qing Kong,昨日青空,1,6.761,Guton Animation Studio,2018,,...,4.99276,1382,"Oct 26, 2018",,2018,,,True,,


In [142]:
net_df

Unnamed: 0,Source,Target,Weight,Raw,Weight Total
0,Kyoukai no Kanata Movie 1: I'll Be Here - Kako...,Kyoukai no Kanata Movie 2: I'll Be Here - Mira...,1,-1,4
1,Kyoukai no Kanata Movie 1: I'll Be Here - Kako...,Subete ga F ni Naru,0.25,1,4
2,Kyoukai no Kanata Movie 1: I'll Be Here - Kako...,Kekkai Sensen,0.25,1,4
3,Kyoukai no Kanata Movie 1: I'll Be Here - Kako...,Naruto: Shippuuden,0.25,1,4
4,Kyoukai no Kanata Movie 1: I'll Be Here - Kako...,Naruto,0.25,1,4
...,...,...,...,...,...
3160,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou ...,Ao Haru Ride,0.0714286,1,14
3161,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou ...,Konbini Kareshi,0.0714286,1,14
3162,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou ...,Tamako Love Story,0.0714286,1,14
3163,Zutto Mae kara Suki deshita.: Kokuhaku Jikkou ...,Araburu Kisetsu no Otome-domo yo.,0.0714286,1,14


In [143]:
all_df.to_csv('all_df.csv')

In [144]:
net_df.to_csv('net_df.csv')