# Data Analysis
The purpose of this notebook is to query the MyAnimeList (MAL) API to gather anime data. The data will then be saved into a `.csv` to be used for the DSC106 - Final Project Website.<br>
By: Jonathan Lo<br>
Date: 5/2/23

## Imports and Setup

In [10]:
# Imports
import json
import pandas as pd
import requests as r

from time import sleep
from tqdm import tqdm
from datetime import timedelta

In [2]:
# Pull the MAL client id out of the local secrets file so the credential
# never appears in the notebook itself.
with open("secrets.json", 'r') as fh:
    secrets = json.loads(fh.read())

mal_client_id = secrets['MAL-CID']

## Requests

In [3]:
# Header sent with every request; it carries the client id read from the secrets file.
headers = {"X-MAL-CLIENT-ID": mal_client_id}
# Base URL of the anime resource; the numeric anime id is appended per request.
endpoint = "https://api.myanimelist.net/v2/anime/"
# Fields requested for every anime record.
fields = [
    "start_date",
    "end_date",
    "mean",
    "rank",
    "popularity",
    "num_list_users",
    "num_scoring_users",
    "media_type",
    "status",
    "genres",
    "start_season",
    "source",
    "rating",
    "recommendations",
    "studios"
]
# Sent as a single comma-separated `fields` query parameter (no spaces).
# Joining directly avoids stringifying the list and stripping characters,
# which would also mangle any field name containing a quote, bracket or space.
payload = {"fields": ",".join(fields)}

def _fetchAnime(session, anime_id):
    """ Requests a single anime record through the given session.

    Returns a `(status_code, record)` tuple. `record` is the parsed JSON body
    when the status is 200 and `None` otherwise. A `status_code` of `None`
    means the request itself failed (connection error, etc.), so the caller
    can treat it like any other retryable failure.
    """
    try:
        res = session.get(
            endpoint + str(anime_id),
            params=payload,
            headers=headers
        )
    except r.exceptions.RequestException:
        return None, None
    if res.status_code == 200:
        return res.status_code, res.json()
    return res.status_code, None


def queryMALRange(top_range, timeout=0.75):
    """ Queries MyAnimeList's database for a certain range.

    Requests every anime id in `range(1, top_range)` (so `top_range` itself is
    excluded) and collects the JSON body of each successful response.

    Parameters
    ----------
    top_range : int
        Exclusive upper bound of the anime ids to request.
    timeout : float
        Seconds to sleep after each request to pace the crawl. Ids answering
        404 are skipped immediately without this pause.

    Returns
    -------
    list
        Parsed JSON records, first-pass successes followed by any ids that
        only succeeded on the retry pass.

    Notes
    -----
    Any response other than 200/404, and any transport-level failure, queues
    the id for one retry after the first pass and pauses for 20 seconds.
    Ids that fail again on the retry pass are dropped silently.
    """
    # Metadata
    # 0.125s is the assumed per-request latency on top of the pacing sleep;
    # with the default timeout this gives the original 0.875s per id.
    print(f"The expected time to pull all data is: {timedelta(seconds=top_range * (timeout + 0.125))}")

    # Init
    error_timeout = 20
    anime_records = []
    requery_ids = []

    # One session for the whole crawl so connections are reused and the
    # session is closed when the crawl finishes.
    with r.Session() as session:
        # First Loop
        for i in tqdm(range(1, top_range)):
            status, record = _fetchAnime(session, i)
            if status == 200:
                anime_records.append(record)
            elif status == 404:
                continue
            else:
                requery_ids.append(i)
                sleep(error_timeout)
            sleep(timeout)

        # Requery if any traffic errors
        for i in tqdm(requery_ids):
            status, record = _fetchAnime(session, i)
            if status == 200:
                anime_records.append(record)
            sleep(timeout)

    return anime_records

In [4]:
# Crawl anime ids 1..49,999 (the upper bound is exclusive); this takes many hours.
data = queryMALRange(top_range=50_000)

The expected time to pull all data is: 12:09:10


100%|█████████████████████████████████████████████████████████████████████████| 49999/49999 [11:13:26<00:00,  1.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 768/768 [11:56<00:00,  1.07it/s]


In [5]:
# Persist the crawled records so later sessions can skip the crawl
with open("myanimelist-data.json", "w") as fh:
    fh.write(json.dumps(data))

## Analysis

In [41]:
# Build the DataFrame from the in-memory crawl when this kernel has it;
# otherwise fall back to the records saved on disk.
try:
    data
except NameError:
    with open("myanimelist-data.json", "r") as fh:
        malData = json.load(fh)
    df = pd.DataFrame(malData)
else:
    df = pd.DataFrame(data)

In [47]:
# Finding total number of nodes and edges
# Every row is a node, including anime with no recommendations. The previous
# expression built a "has recommendations" mask but only read its length,
# which is the row count.
print("Number of nodes:", len(df))
# Each entry in a row's recommendations list counts as one edge.
print("Number of edges:", df["recommendations"].apply(len).sum())

Number of nodes: 21577
Number of edges: 37327
