<h1 align=center>Anime Recommendations</h1>

### Repository: https://github.com/jose-alvarado-guzman/anime_recommendations

## Loading Python Packages

In [None]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from collections import ChainMap
from graphdatascience import GraphDataScience
import pandas as pd
import pathlib
from fileload.loadyaml import load_yaml_file
from database.neo4jdbms import Neo4jInstance

## Data Ingestion

### Loading files into Pandas DataFrames

In [None]:
def read_file_from_url(url:str):
    """Load a CSV file, optionally zip-compressed, from a URL into a DataFrame.

    Parameters
    ----------
    url : str
        Location of the file. The path component must end in ``.csv`` or
        ``.zip`` (case-insensitive). Only the final suffix is considered and
        any query string or fragment is ignored when detecting the format.

    Returns
    -------
    pandas.DataFrame
        The parsed file. For ``.zip`` archives the last member listed in the
        archive is read as CSV.

    Raises
    ------
    ValueError
        If the URL does not end in a supported extension, or the archive
        contains no members.
    """
    from urllib.parse import urlparse

    # Use the final suffix of the URL *path* so names such as
    # 'data.v1.zip' or 'data.csv?versionId=1' are classified correctly.
    file_format = pathlib.PurePosixPath(urlparse(url).path).suffix.lower()

    if file_format == '.csv':
        # pandas fetches the URL itself; no separate download is needed.
        return pd.read_csv(url)

    if file_format != '.zip':
        raise ValueError(
            f"Unsupported file format {file_format!r} for URL {url!r}; "
            "expected '.zip' or '.csv'"
        )

    # Download the archive and keep it in memory; close the connection promptly.
    with urlopen(url) as response:
        payload = BytesIO(response.read())

    # Decompress the archive and load its last member into a DataFrame.
    with ZipFile(payload) as compressed_file:
        members = compressed_file.namelist()
        if not members:
            raise ValueError(f"Zip archive at {url!r} contains no files")
        with compressed_file.open(members[-1]) as file:
            return pd.read_csv(file)

In [None]:
# Base URL of the bucket folder that holds the raw data files.
s3_bucket = 'https://s3.amazonaws.com/neo4j-ps-ds-bootcamp/data/'

# Logical dataset name -> file name under `s3_bucket`.
data_files = dict(
    anime='anime.zip',
    anime_with_synopsis='anime_with_synopsis.zip',
    animelist='animelist.zip',
    watching_status='watching_status.csv',
)

# Filled by the download step: logical dataset name -> loaded DataFrame.
data_frames = {}

In [None]:
# Download every configured file and store the resulting DataFrame
# under its logical name.
data_frames.update(
    (name, read_file_from_url(s3_bucket + filename))
    for name, filename in data_files.items()
)

In [None]:
# Summarise the 'anime' frame (columns, non-null counts, dtypes).
data_frames['anime'].info()

In [None]:
# Summarise the 'anime_with_synopsis' frame (columns, non-null counts, dtypes).
data_frames['anime_with_synopsis'].info()

In [None]:
# Summarise the 'animelist' frame (columns, non-null counts, dtypes).
data_frames['animelist'].info()

In [None]:
# Summarise the 'watching_status' frame (columns, non-null counts, dtypes).
# This cell previously repeated the 'anime_with_synopsis' summary, leaving
# 'watching_status' as the only loaded frame that was never inspected.
data_frames['watching_status'].info()

In [None]:
# Attach the synopsis text to every anime row; the left join keeps anime
# that have no synopsis entry.
# NOTE(review): 'sypnopsis' is kept exactly as spelled here — presumably it
# matches the dataset's column header; verify before "correcting" it.
synopsis_columns = data_frames['anime_with_synopsis'][['MAL_ID','sypnopsis']]
anime = data_frames['anime'].merge(synopsis_columns, on='MAL_ID', how='left')

# Join each list entry with its watching-status row, then drop the
# right-hand join key 'status'.
anime_ratings = (
    data_frames['animelist']
    .merge(data_frames['watching_status'],
           left_on='watching_status', right_on='status')
    .drop(columns='status')
)

In [None]:
# Normalise column names: spaces and hyphens become underscores, then lower-case.
anime.columns = [
    column.replace(' ', '_').replace('-', '_').lower()
    for column in anime.columns
]

In [None]:
# Display the current column names of the `anime` frame.
anime.columns

In [None]:
# Frequency of each distinct value in the 'genres' column.
anime['genres'].value_counts()

In [None]:
# Frequency of each distinct value in the 'type' column.
anime['type'].value_counts()

In [None]:
# Frequency of each distinct value in the 'source' column.
anime['source'].value_counts()

### Loading the DataFrames into Neo4j

In [None]:
# Load the ingestion configuration. Later cells read its 'dbms_info',
# 'pre_ingest' and 'queries' entries.
yaml_file = load_yaml_file('ingest.yaml')

In [None]:
# Open the Neo4j connection using the URI and credentials from the config file.
dbms_info = yaml_file['dbms_info']
graph = Neo4jInstance(
    dbms_info['uri'],
    dbms_info['user_name'],
    dbms_info['password'],
)

In [None]:
# Run the config's 'pre_ingest' statements against the configured database
# before any data is loaded.
# NOTE(review): presumably schema setup (constraints/indexes) — confirm in ingest.yaml.
graph.execute_write_queries(yaml_file['pre_ingest'],yaml_file['dbms_info']['database'])

In [None]:
%%time
# Load the `anime` frame with the 'films' entry of the config's queries.
# NOTE(review): this calls `execute_write_queries_with_data` (plural) while the
# other load cells call `execute_write_query_with_data` — confirm both methods
# exist on Neo4jInstance and that the plural form is intended here.
graph.execute_write_queries_with_data(yaml_file['queries']['films'],anime,
                                     yaml_file['dbms_info']['database'])

In [None]:
# One row per distinct user id, in order of first appearance in the ratings.
users = pd.Series(anime_ratings['user_id'].unique(), name='user_id').to_frame()

In [None]:
# Send the distinct user ids to the configured database using the 'users'
# query from the config.
graph.execute_write_query_with_data(yaml_file['queries']['users'],users,
                                    yaml_file['dbms_info']['database'])

In [None]:
# Remove every space from the ratings column names.
anime_ratings.columns = [name.replace(' ', '') for name in anime_ratings.columns]

In [None]:
# Turn each watching-status description into an upper-snake-case label
# (spaces -> underscores, then upper-cased). These are the values the
# per-status filters compare against, e.g. 'PLAN_TO_WATCH'.
# Built directly as one dict instead of merging single-entry dicts via ChainMap.
mappings = {
    description: description.replace(' ', '_').upper()
    for description in anime_ratings['description'].unique()
}
anime_ratings['description'] = anime_ratings['description'].map(mappings)

In [None]:
# Display the current column names of the `anime_ratings` frame.
anime_ratings.columns

In [None]:
# Config query key -> description label selecting the rows for that status.
status_labels = {
    'currently_watching': 'CURRENTLY_WATCHING',
    'completed': 'COMPLETED',
    'on_hold': 'ON_HOLD',
    'dropped': 'DROPPED',
    'plan_to_watch': 'PLAN_TO_WATCH',
}

# Split the ratings into one frame per watching status.
ratings = {
    key: anime_ratings[anime_ratings['description'] == label]
    for key, label in status_labels.items()
}

In [None]:
# Load each per-status frame with the config query of the same name,
# passing batch_size=100000 to the loader.
target_database = yaml_file['dbms_info']['database']
for status, status_ratings in ratings.items():
    graph.execute_write_query_with_data(
        yaml_file['queries'][status],
        status_ratings,
        target_database,
        batch_size=100000,
    )

![Data Model](data_model.png)

## GDS Pipeline

In [None]:
# Connect the Graph Data Science client with the same URI and credentials
# used for ingestion.
gds = GraphDataScience(yaml_file['dbms_info']['uri'],auth=(yaml_file['dbms_info']['user_name'],
                     yaml_file['dbms_info']['password']))
# Target the database the data was ingested into, rather than a hard-coded
# name, so the GDS steps and the ingestion steps cannot drift apart.
gds.set_database(yaml_file['dbms_info']['database'])

In [None]:
# Display the system/debug information reported by the GDS installation.
gds.debug.sysInfo()

### GDS Pipeline for similar films recommendations

In [None]:
# Project an in-memory graph named 'film' containing films and their
# descriptive nodes (studio, source, genre, licensor, producer) together with
# the relationships linking them.
# NOTE(review): 'rating' is requested as a relationship property, but the
# FastRP step on this projection passes no relationshipWeightProperty —
# confirm the property is needed (or even present) on these relationship types.
film_projection, film_stats = gds.graph.project(
    'film',
    ['Film','Studio','Source','Genre','Licensor','Producer'],
    ['HAS_STUDIO','HAS_SOURCE','HAS_GENRE','HAS_LICENSOR','HAS_PRODUCER'],
    relationshipProperties=['rating']
)

In [None]:
# Display the statistics returned by the film projection call.
film_stats

In [None]:
# Compute FastRP node embeddings on the film projection and store them in the
# in-memory graph under the 'embedding' node property (mutate mode).
# NOTE(review): embeddingDimension=1054 is an unusual size — confirm it is
# intentional. No randomSeed is passed, so embeddings may differ between runs.
gds.fastRP.mutate(
    film_projection,
    iterationWeights= [1.0],
    embeddingDimension = 1054,
    mutateProperty = 'embedding'
)

In [None]:
# Run K-nearest-neighbours over the 'embedding' property, restricted to Film
# nodes. Results are added to the in-memory graph as HAS_SIMILAR_FILM
# relationships carrying a 'similarityScore' property (up to topK per node,
# subject to similarityCutoff).
gds.knn.mutate(
    film_projection,
    topK = 20,
    sampleRate = 1.0,
    similarityCutoff = 0.1,
    nodeProperties = ['embedding'],
    nodeLabels = ['Film'],
    mutateProperty = 'similarityScore',
    mutateRelationshipType = 'HAS_SIMILAR_FILM'
)

In [None]:
# Write the 'embedding' property of Film nodes from the projection back to
# the database.
# NOTE(review): newer GDS client releases expose this under a different
# name — confirm against the installed graphdatascience version.
gds.graph.writeNodeProperties(
    film_projection,
    ['embedding'],
    ['Film']
)

In [None]:
# Write the HAS_SIMILAR_FILM relationships, with their 'similarityScore'
# property, from the projection back to the database.
gds.graph.writeRelationship(
    film_projection,
    'HAS_SIMILAR_FILM',
    'similarityScore'
)

In [None]:
# Drop the film projection now that its results have been written back.
film_projection.drop()

### GDS Pipeline for similar users recommendations

In [None]:
# Project an in-memory graph named 'user_projection' containing User and Film
# nodes and the five watching-status relationship types, loading each
# relationship's 'rating' property (used as the FastRP weight in this section).
user_projection, user_projection_stats = gds.graph.project(
    'user_projection',
    ['User','Film'],
    ['PLAN_TO_WATCH','DROPPED','COMPLETED','CURRENTLY_WATCHING','ON_HOLD'],
    relationshipProperties = ['rating']
)

In [None]:
# Show only the system-info rows whose key mentions 'heap', to check memory
# after building the user projection.
sys_info = gds.debug.sysInfo()
heap_rows = sys_info['key'].str.contains('heap')
sys_info[heap_rows]

In [None]:
# Compute FastRP node embeddings on the user projection, weighting
# relationships by 'rating', and store them in the in-memory graph under the
# 'embedding' node property (mutate mode).
# NOTE(review): embeddingDimension=1054 is an unusual size — confirm it is
# intentional. No randomSeed is passed, so embeddings may differ between runs.
gds.fastRP.mutate(
    user_projection,
    iterationWeights= [1.0],
    embeddingDimension = 1054,
    mutateProperty = 'embedding',
    relationshipWeightProperty = 'rating'
)

In [None]:
# Run K-nearest-neighbours over the 'embedding' property, restricted to User
# nodes. Results are added to the in-memory graph as HAS_SIMILAR_USER
# relationships carrying a 'similarityScore' property.
# NOTE(review): sampleRate is 0.6 here (1.0 in the film pipeline) and no
# randomSeed is set, so the neighbours found may vary between runs.
gds.knn.mutate(
    user_projection,
    topK = 10,
    sampleRate = 0.6,
    similarityCutoff = 0.2,
    nodeProperties = ['embedding'],
    nodeLabels = ['User'],
    mutateProperty = 'similarityScore',
    mutateRelationshipType = 'HAS_SIMILAR_USER'
)

In [None]:
# Write the 'embedding' property of User nodes from the projection back to
# the database.
# NOTE(review): newer GDS client releases expose this under a different
# name — confirm against the installed graphdatascience version.
gds.graph.writeNodeProperties(
    user_projection,
    ['embedding'],
    ['User']
)

In [None]:
# Write the HAS_SIMILAR_USER relationships, with their 'similarityScore'
# property, from the projection back to the database.
gds.graph.writeRelationship(
    user_projection,
    'HAS_SIMILAR_USER',
    'similarityScore'
)

In [None]:
# Drop the user projection now that its results have been written back.
user_projection.drop()

### Making similar films recommendations

In [None]:
# Run the config's 'similar_films_recommendations' Cypher query for the user
# with id 0 and keep the result.
films_recommendations = gds.run_cypher(yaml_file['queries']['similar_films_recommendations'],
                                     params={'user_id':0})

In [None]:
# Display the similar-films recommendations.
films_recommendations

### Making similar users recommendations

In [None]:
# Run the config's 'similar_users_recommendations' Cypher query for the user
# with id 0 and keep the result.
users_recommendations = gds.run_cypher(yaml_file['queries']['similar_users_recommendations'],
                                     params={'user_id':0})

In [None]:
# Display the similar-users recommendations.
users_recommendations

In [None]:
# Release both database connections: the ingestion helper and the GDS client.
graph.close()
gds.close()