# Introduction

...

In [2]:
# libraries to install
# !pip install selenium

In [3]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm # useful for progress bars
from selenium import webdriver
import pandas as pd
import numpy as np
import time
import re
import os 

## 1. Data Collection

...

### 1.1. Get the list of animes

Here we will extract the *urls* and the *names* of the animes in the list (cfr. https://myanimelist.net/topanime.php). At first we can have an idea of the necessary steps to extract the informations we want by working on a single anime in the list and then proceed by iteration. 

After inspecting the HTML code of the site, we saw that the all the informations we need from a single anime are stored in  `tr` blocks inside a single `table` that contains the list of all the top animes in the site. To get the  name of an anime in the list we should work on `a` tags, whereas to get the url we need to work on `td` tags (leveraging the property `href`). 

Knowing these HTML details we can use the `selenium` library to do the web-scrapping.

In [4]:
from selenium.webdriver.chrome.service import Service

In [5]:
s = Service('/Users/dany/Desktop/adm-hw3/chromedriver')

In [6]:
# selenium with Chrome
driver = webdriver.Chrome(service=s)

In [7]:
# create a dataframe with links of each anime
df = pd.DataFrame(columns = ['Href'])

The following code was inspired by looking at the work that was done last year about https://www.goodreads.com, for example by https://github.com/GiorgiaSalvatori/ADM-HW3/blob/main/main.ipynb. Also the following post was useful https://towardsdatascience.com/how-to-use-selenium-to-web-scrape-with-example-80f9b23a843a.

In [9]:
# go page by page and and store links in a list
anime_list = []

for page in tqdm(range(0, 400)):
    url = 'https://myanimelist.net/topanime.php?limit=' + str(page * 50)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup.find_all('tr'):
        links = tag.find_all('a')
        for link in links:        
            if type(link.get('id')) == str and len(link.contents[0]) > 1:
                anime_list.append((link.contents[0], link.get('href')) )

100%|██████████| 400/400 [06:54<00:00,  1.04s/it]


In [10]:
# total number of animes
print(len(anime_list))

19124


In [11]:
# assign list to dataframe
df['Href'] = anime_list

In [12]:
# check for duplicates: ok no duplicates
df['Href'].nunique()

19123

In [13]:
# save dataframe into a csv file without header and comma separator. 
# this is equivalent to a txt file, but with also the names of the animes
# that can be of help in some data processing stages. 
df.to_csv('urls.csv',sep = ' ', header=False)

In [14]:
df['Href'].head()

0    (Fullmetal Alchemist: Brotherhood, https://mya...
1    (Gintama°, https://myanimelist.net/anime/28977...
2    (Shingeki no Kyojin Season 3 Part 2, https://m...
3    (Steins;Gate, https://myanimelist.net/anime/92...
4    (Fruits Basket: The Final, https://myanimelist...
Name: Href, dtype: object

We could also create a dictionary, this is useful in some circumnstances.

In [15]:
#keys
name = []   
#values
url = []    

for item in anime_list:
    name.append(item[0])
    url.append(item[1])
    
D = dict(zip(name, url))
print(list(D.values())[0:5])

['https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood', 'https://myanimelist.net/anime/28977/Gintama°', 'https://myanimelist.net/anime/38524/Shingeki_no_Kyojin_Season_3_Part_2', 'https://myanimelist.net/anime/9253/Steins_Gate', 'https://myanimelist.net/anime/42938/Fruits_Basket__The_Final']


## 1.2. Crawl animes

We procede to:
- download the html corresponding to each of the collected urls;
- save its html in a file;
- organize the entire set of downloaded html pages into folders. Each folder will contain the htmls of the animes in page 1, page 2, ... of the list of animes.

To do so we extensively use the `os` library to create directories, changing paths, etc...

In [17]:
# remark: the execution can take quite some time, for this reason I will stop at the first 5 animes.
# remark : there is an issue with high frequency site-connections 

# returns current working directory
base = os.getcwd()  
t = 0
# we use the previously created dictionary D to get the urls we need
scrapped_urls = list(D.values())[0:5]
for i in range(len(scrapped_urls)):
    if(i%50==0):
        # create a new folder
        # remark: the pages will start from 0
        page_identifier = i-(49*t)
        # subdirectory
        directory = f"page_{page_identifier}.html"
        # parent directories
        parent_dir = base
        # path
        path = os.path.join(parent_dir, directory)
        # make directory
        os.makedirs(path)
        # checkpoint
        #print("Directory '%s' created" %directory)
        # change directory 
        os.chdir(path)
        t += 1
        
    # get urls
    URL = scrapped_urls[i]
    page = requests.get(URL)
    
    # parsing
    soup_data = BeautifulSoup(page.content, "html.parser")
    
    # saving
    with open(f"article_{i}.html", "w") as file:
        file.write(str(soup_data))
        
    # checkpoint
    #print(f"Article {i} successfully written!")