In [95]:
import json
import pandas as pd
import requests
import time
import numpy as np  # linear algebra
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from collections import Counter
from urllib.request import urlopen, Request
from PIL import Image
import requests
from PIL import Image
import requests
from io import BytesIO
from matplotlib import cm

# Get Categories

In [3]:
"""
Gets all anime categories from the Kitsu
API, returns it, and saves it as a csv file.
"""
def get_cats_df():
  categories = pd.DataFrame()
  # first page
  basepath = 'https://kitsu.io/api/edge/categories?fields[categories]=title&page[limit]=10'
  url = basepath

  # get all of the categories from the kitsu api (218 categories)
  for page in range(22):
    # each page in the api has only 10 categories
    if page > 0:
      url = basepath + '&page[offset]=' + str(page*10)
    # get the json from the page, add to dataframe
    d = requests.get(url)
    j = json.loads(d.content)
    df = pd.json_normalize(j, 'data')
    categories = categories.append(df)
    
  # drop all categories except category name
  categories = categories.drop(columns=['links.self','type','id'])
  categories.rename(columns={'attributes.title':'category'}, inplace=True)
  # reset indices
  categories.reset_index(inplace = True, drop = True)
  # save as csv
  categories.to_csv('categories.csv', index = False)
  
  return categories

Get the categories

In [4]:
cats = get_cats_df()
print(cats.shape)
cats.head()

(218, 1)


Unnamed: 0,category
0,Middle School
1,Cycling
2,Elementary School
3,Mermaid
4,Android


Get the anime names and posters

In [70]:

def get_animes_df(category, num_pages=1):
  """Returns information for the top animes of a given category

  Args:
      category (string): the given category
      num_pages (int): pages of animes to return (each page has 10 animes)

  Returns:
      dataframe: information for the top animes of the category
  """
  
  basepath = 'https://kitsu.io/api/edge/anime'
  animes = pd.DataFrame()

  # get the animes for the category
  for page in range(num_pages):
    # each page in the api has only 10 animes
    url = basepath + '?page[limit]=10&page[offset]=' + str(page*10) + '&filter[categories]=' + category + "&sort=popularityRank"
    print("URL = " + url)
    d = requests.get(url)
    # get the json content from the url, add to dataframe
    j = json.loads(d.content)
    df = pd.json_normalize(j, 'data')
    animes = animes.append(df)

  animes.reset_index(drop=True, inplace=True)
  
  # get the relevant columns only
  to_keep = ['attributes.updatedAt','attributes.canonicalTitle','attributes.startDate',
             'attributes.endDate','attributes.posterImage.large']
  return animes[to_keep]

def get_image(url):
  """Returns the image from the url as an Image object

  Args:
      url (string): the url that leads to the image

  Returns:
      Image: the image at the url
  """
  response = requests.get(url)
  img = Image.open(BytesIO(response.content))
  # img.show()
  return img



def combine_images(cat_animes, cat_name):
  """Combine multiple anime poster images for a given category

  Args:
      cat_animes (dataframe): information for the top animes of the category
      cat_name (string): the name of the given category

  Returns:
      Image: an image that combines the anime posters horizontally
  """

  images = []

  for url in cat_animes['attributes.posterImage.large'].to_numpy():
    images.append(get_image(url))

  # specify width and height for new image
  widths, heights = zip(*(i.size for i in images))
  total_width = sum(widths)
  max_height = max(heights)
  # new empty image
  combine_im = Image.new('RGB', (total_width, max_height))
  # put the images together in combine_im
  x_offset = 0
  for im in images:
      combine_im.paste(im, (x_offset, 0))
      x_offset += im.size[0]

  # save the image and return it
  name_str = 'combined_images_2/' + cat_name + '.jpg'
  combine_im.save(name_str)
  return combine_im

In [104]:
num_colors = 30

# not including Middle School - add later
not_include = ['Middle School', 'Cycling', 'Elementary School', 'Mermaid','Tentacle','Loli','Super Deformed', 'Sudden Girlfriend Appearance', 'The Arts', 'Voyeurism', 'Ahegao', 'Netorare', 'Ecchi', 'Slavery', 'Dark Skinned Girl', 'Female Teacher', 'Female Student', 'Content Indicators', 'Dynamic', 'Elements', 'Setting', 'Themes', 'Yaoi', 'Target Demographics', 'Anime Influenced', 'Housewives']


# for i in range(len(cats)):
for i in range(85,len(cats)):
  cat = cats['category'][i]
  print(i, cat)
  cat = "%20".join(cat.split())

  if not cats['category'][i] in not_include:
    cat_animes = get_animes_df(cat, 1)
    cat_animes.dropna(subset=['attributes.canonicalTitle'], inplace=True)
    # print(cat_animes.head(5))
    # get_palette.get_colors_mult_img(get_palette.get_url_list(cat_animes), num_colors, cat)

    # only getting the combined imgs
    # urls = get_url_list(cat_animes)
    combine_images(cat_animes, cat)
  else:
    print('bad category:', cat)
    
  if i % 10 == 0 and i != 0:
    print(i, "waiting a bit so we aren't rate limited")
    time.sleep(60)

78 Korea
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=Korea&sort=popularityRank
79 China
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=China&sort=popularityRank
80 Russia
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=Russia&sort=popularityRank
80 waiting a bit so we aren't rate limited
81 Tokyo
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=Tokyo&sort=popularityRank
82 Kyoto
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=Kyoto&sort=popularityRank
83 New York
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=New%20York&sort=popularityRank
84 Heian Period
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page[offset]=0&filter[categories]=Heian%20Period&sort=popularityRank
85 Victorian Period
URL = https://kitsu.io/api/edge/anime?page[limit]=10&page

ChunkedEncodingError: ("Connection broken: ConnectionResetError(54, 'Connection reset by peer')", ConnectionResetError(54, 'Connection reset by peer'))