In [1]:
import pandas as pd

In [2]:
# Load the pre-filtered user and anime tables from the local ./myanimelist folder.
users = pd.read_csv(filepath_or_buffer='./myanimelist/users_filtered.csv')
animes = pd.read_csv(filepath_or_buffer='./myanimelist/anime_filtered.csv')

## Filtering UserAnime List proportionally
Keep only the user–anime relations in which the user has watched a large proportion of the anime's episodes (`my_watched_episodes` divided by the anime's total number of episodes).

In [3]:
def anime_is_eligible(user_anime, watched_proportion, anime_table=None):
    """Tell whether a user-anime relation passes the watched-proportion filter.

    Parameters
    ----------
    user_anime
        One relation; must expose ``anime_id`` and ``my_watched_episodes``
        as attributes (e.g. a row yielded by ``DataFrame.iterrows``).
    watched_proportion
        Minimum ratio ``my_watched_episodes / episodes`` required to keep
        the relation.
    anime_table
        Data frame with ``anime_id`` and ``episodes`` columns. Defaults to
        the notebook-level ``animes`` frame.

    Returns
    -------
    bool
        ``True`` when the ratio reaches ``watched_proportion``, or when the
        anime's episode count is not positive (no ratio can be computed).
        ``False`` when the ratio is too low, or when ``anime_id`` is not in
        ``anime_table`` (this used to raise an ``IndexError``).
    """
    if anime_table is None:
        anime_table = animes

    # Look the episode count up once; the first matching row wins.
    episode_counts = anime_table.loc[anime_table.anime_id == user_anime.anime_id, "episodes"]
    if episode_counts.empty:
        # The relation points at an anime missing from the table: drop it.
        return False

    total_episodes = episode_counts.iloc[0]
    if total_episodes > 0:
        return bool(user_anime.my_watched_episodes / total_episodes >= watched_proportion)
    return True

In [4]:
# Initializing user_anime Data Frame
# (eligible rows are collected per chunk and concatenated once at the end;
# growing a Data Frame row by row is quadratic and DataFrame.append no longer
# exists in pandas >= 2.0)
users_animes_prop = pd.DataFrame()
eligible_chunks = []

# The size of each file chunk
chunksize = 10 ** 6

# If filtered, the minimum proportion of watched episodes per total number of episodes
proportion = 0.8

# The original progress bar assumed the file spans 32 chunks of 10 ** 6 rows;
# derive the expected chunk count from that so chunksize can be changed safely.
approx_total_rows = 32 * 10 ** 6
expected_chunks = max(1, -(-approx_total_rows // chunksize))

# Total episodes per anime_id (first row wins if an id is repeated), so each
# chunk can be filtered with one vectorized lookup instead of one per row.
episodes_by_anime = animes.drop_duplicates(subset="anime_id").set_index("anime_id")["episodes"]

count = 0
current_size = 0
print("Filtering UserAnime List proportionally, hold on...")
for chunk in pd.read_csv('./myanimelist/animelists_cleaned.csv', chunksize=chunksize):
    total_episodes = chunk["anime_id"].map(episodes_by_anime)

    # Relations pointing at an anime missing from `animes` are dropped.
    known_anime = chunk["anime_id"].isin(episodes_by_anime.index)
    # A ratio only exists when the anime has a positive episode count.
    has_episode_count = total_episodes > 0
    watched_enough = (
        chunk["my_watched_episodes"] / total_episodes.where(has_episode_count)
    ) >= proportion

    # Same rule as anime_is_eligible: keep when the ratio is high enough, or
    # when no ratio can be computed because the episode count is not positive.
    eligible = known_anime & (watched_enough | ~has_episode_count)
    eligible_chunks.append(chunk[eligible])
    current_size += int(eligible.sum())
    print("Current Data Frame qtd of rows: %d" % current_size, end="\r")

    count += 1
    progress = min(count / expected_chunks, 1) * 100
    print()
    print("%.2f%% read..." % progress, end="\r")

if eligible_chunks:
    users_animes_prop = pd.concat(eligible_chunks)
print()
print("Done!")

Filtering UserAnime List proportionally, hold on...


KeyboardInterrupt: 

## Randomly selecting samples of the UserAnime List
Choose a random subset of the UserAnime List.

In [4]:
# Initializing user_anime Data Frame
# (sampled chunks are collected in a list and concatenated once at the end;
# DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
# frame on every iteration)
users_animes_rand = pd.DataFrame()
sampled_chunks = []

# The size of each file chunk
chunksize = 10 ** 6

# Number of random samples per chunk
random_size = 100000

# Seed for the per-chunk sampling, so that re-running the cell gives the same subset
random_seed = 42

# The original progress bar assumed the file spans 32 chunks of 10 ** 6 rows;
# derive the expected chunk count from that so chunksize can be changed safely.
approx_total_rows = 32 * 10 ** 6
expected_chunks = max(1, -(-approx_total_rows // chunksize))

count = 0
size = 0
print("Filtering UserAnime List randomily, hold on...")
for chunk in pd.read_csv('./myanimelist/animelists_cleaned.csv', chunksize=chunksize):
    # Sample without replacement so the result is a true subset (no duplicated
    # rows); cap the sample size for a short final chunk.
    sample_size = min(random_size, len(chunk))
    # A different seed per chunk avoids picking the same positions in every chunk.
    sampled_chunks.append(chunk.sample(n=sample_size, random_state=random_seed + count))

    count += 1
    progress = min(count / expected_chunks, 1) * 100
    size += sample_size
    print("Current Data Frame row size: %d | %.2f%% completed..." % (size, progress), end="\r")

if sampled_chunks:
    users_animes_rand = pd.concat(sampled_chunks)
print()
print("Done! Run next cell to save Data Frame into a CSV file.")

Filtering UserAnime List randomily, hold on...
Current Data Frame row size: 3200000 | 100.00% completed...
Done! Run next cell to save Data Frame into a CSV file.


In [7]:
# Persist the randomly sampled user-anime relations as a CSV file
# (the Data Frame index is written as the first, unnamed column).
users_animes_rand.to_csv(path_or_buf="./manipulated_data/UserAnimeRandom.csv")