In [1]:
'''
CRISP-ML(Q) process model describes six phases:
# - Business and Data Understanding
# - Data Preparation
# - Model Building
# - Model Evaluation and Hyperparameter Tuning
# - Model Deployment
# - Monitoring and Maintenance

# Business Problem: One of the e-commerce customer is facing this issue: 
    Number of products sold because of cross selling and up selling has reduced significantly

# Objective(s): Maximize Product Sales
# Constraint(s): Minimize the Coupon Fatigue

Success Criteria:
    a. Business: Increase the sales by 18% to 20%
    b. ML: 
    c. Economic: Additional revenue of $250K to $300K
    
    Data Collection: 
        Dimension: 12294 rows and 7 columns
        1. anime_id
        2. name
        3. genre
        4. type
        5. episodes
        6. rating
        7. members   
'''




'\nCRISP-ML(Q) process model describes six phases:\n# - Business and Data Understanding\n# - Data Preparation\n# - Model Building\n# - Model Evaluation and Hyperparameter Tuning\n# - Model Deployment\n# - Monitoring and Maintenance\n\n# Business Problem: One of the e-commerce customer is facing this issue: \n    Number of products sold because of cross selling and up selling has reduced significantly\n\n# Objective(s): Maximize Product Sales\n# Constraint(s): Minimize the Coupon Fatigue\n\nSuccess Criteria:\n    a. Business: Increase the sales by 18% to 20%\n    b. ML: \n    c. Economic: Additional revenue of $250K to $300K\n    \n    Data Collection: \n        Dimension: 12294 rows and 7 columns\n        1. anime_id\n        2. name\n        3. genre\n        4. type\n        5. episodes\n        6. rating\n        7. members   \n'

In [2]:
# Importing all required libraries, modules
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
# term frequency - inverse document frequency is a numerical statistic that is intended to reflect how important 
# a word is to document in a collecion or corpus
# from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import joblib




In [3]:
# import Dataset 
anime = pd.read_csv(r"dataset\anime.csv", encoding = 'utf8')


In [4]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Toy Story (1995),"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Jumanji (1995),"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Grumpier Old Men (1995),"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Waiting to Exhale (1995),"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Father of the Bride Part II (1995),"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [5]:
# Database Connection
from sqlalchemy import create_engine, text

user = 'root'
password = 'mysql'
db = 'recommenddb'

engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}".format(user = user, 
                                                                           pw = password, 
                                                                           db = db))



In [6]:

# create database
engine.execute(f"CREATE DATABASE IF NOT EXISTS {db}")


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1dc35644b20>

In [7]:

# Upload the Table into Database
anime.to_sql('anime', con = engine, if_exists = 'replace', chunksize = 1000, index = False)



12294

In [8]:
# Read the Table (data) from MySQL database
sql = 'select * from anime'
anime = pd.read_sql_query(sql, con = engine)



In [9]:
# Check for Missing values
anime["genre"].isnull().sum() 



62

In [10]:
# Impute the Missing values in 'genre' column for a movie with 'General' category
anime["genre"] = anime["genre"].fillna("General")



In [11]:
# Create a Tfidf Vectorizer to remove all stop words

tfidf = TfidfVectorizer(stop_words = "english")   # taking stop words from tfidf vectorizer 



In [12]:
# Transform a count matrix to a normalized tf-idf representation
tfidf_matrix = tfidf.fit(anime.genre)   



In [13]:
# Save the Pipeline for tfidf matrix
joblib.dump(tfidf_matrix, 'matrix')

os.getcwd()



'C:\\git_repositories\\movie_recommendation'

In [14]:
# Load the saved model for processing
mat = joblib.load("matrix")

tfidf_matrix = mat.transform(anime.genre)

tfidf_matrix.shape 



(12294, 47)

In [15]:
pd.DataFrame(tfidf_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.440247,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.544416,0.000000,0.0,0.0,0.0
1,0.294649,0.317607,0.0,0.0,0.0,0.000000,0.0,0.0,0.335834,0.0,...,0.350987,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.250631,0.000000,0.0,0.0,0.0,0.200766,0.0,0.0,0.000000,0.0,...,0.298553,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.833769,0.0,0.0,0.0
4,0.250631,0.000000,0.0,0.0,0.0,0.200766,0.0,0.0,0.000000,0.0,...,0.298553,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
12290,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
12291,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
12292,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


In [16]:
# cosine(x, y)= (x.y) / (||x||.||y||)
# Computing the cosine similarity on Tfidf matrix

cosine_sim_matrix = cosine_similarity(tfidf_matrix)



In [17]:
cosine_sim_matrix

array([[1.        , 0.14784981, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14784981, 1.        , 0.1786367 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.1786367 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [18]:
# Create a mapping of anime name to index number
anime_index = pd.Series(anime.index, index = anime['name']).drop_duplicates()



In [19]:
# Example
anime_id = anime_index["Assassins (1995)"]
anime_id

topN = 5



In [20]:
anime_id

22

In [21]:
cosine_scores = list(enumerate(cosine_sim_matrix[anime_id]))
cosine_scores = sorted(cosine_scores, key = lambda x:x[1], reverse = True)
cosine_scores_N = cosine_scores[0: topN + 1]
cosine_scores_N

[(22, 1.0),
 (1465, 1.0),
 (1073, 0.9672093516893668),
 (1398, 0.9672093516893668),
 (2333, 0.9672093516893668),
 (2735, 0.9672093516893668)]

In [22]:
cosine_sim_matrix[22]

array([0.1590967 , 0.32333354, 0.33305531, ..., 0.        , 0.        ,
       0.        ])

In [23]:
# Custom function to find the TopN movies to be recommended

def get_recommendations(Name, topN):    
    # topN = 10
    # Getting the movie index using its title 
    anime_id = anime_index[Name]
    
    # Getting the pair wise similarity score for all the anime's with that anime
    cosine_scores = list(enumerate(cosine_sim_matrix[anime_id]))
    
    # Sorting the cosine_similarity scores based on scores 
    cosine_scores = sorted(cosine_scores, key = lambda x:x[1], reverse = True)
    
    # Get the scores of top N most similar movies 
    cosine_scores_N = cosine_scores[0: topN + 1]
    
    # Getting the movie index 
    anime_idx  =  [i[0] for i in cosine_scores_N]
    anime_scores =  [i[1] for i in cosine_scores_N]
    
    # Similar movies and scores
    anime_similar_show = pd.DataFrame(columns = ["name", "Score"])
    anime_similar_show["name"] = anime.loc[anime_idx, "name"]
    anime_similar_show["Score"] = anime_scores
    anime_similar_show.reset_index(inplace = True)  
    # anime_similar_show.drop(["index"], axis=1, inplace=True)
    return(anime_similar_show.iloc[1:, ])



In [24]:
# Call the custom function to make recommendations
rec = get_recommendations("No Game No Life Movie", topN = 10)
rec


Unnamed: 0,index,name,Score
1,10919,No Game No Life Movie,1.0
2,10436,Super Real Mahjong: Mahjong Battle Scramble - ...,0.859206
3,4290,Raising Victor Vargas (2002),0.827579
4,5882,xXx: State of the Union (2005),0.800258
5,5968,Pusher II: With Blood on My Hands (2004),0.800258
6,6116,Revolution (1985),0.800258
7,6677,World on a Wire (Welt am Draht) (1973),0.800258
8,10435,Super Real Mahjong: Kasumi Miki Shouko no Haji...,0.800258
9,4628,"Italian Job, The (1969)",0.787476
10,6812,"Midnight Meat Train, The (2008)",0.739464


In [25]:
anime[anime["anime_id"]==1465]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
3205,1465,"Night, The (Notte, La) (1960)","Action, Mecha, School, Sci-Fi, Seinen",TV,10,7.05,3300
