In [1]:
import argparse
import difflib
import gc
import os
import time

# data science imports
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Load the loan table from a CSV path that is relative to the notebook's
# working directory.
df_loans=pd.read_csv('Data/df_created_.csv')

# print shape of dataset as (rows, columns)
print(df_loans.shape)

# per-column non-null counts and dtypes
df_loans.info()

(17079, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17079 entries, 0 to 17078
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Min_IT_Loan_ID__c           17079 non-null  float64
 1   Opp_Number__c               17079 non-null  float64
 2   Id                          17079 non-null  object 
 3   AccountID                   17079 non-null  object 
 4   Number_Of_Loans_Granted__c  17079 non-null  float64
 5   Num_Of_Loans_Paid__c        17079 non-null  float64
 6   Purpose_of_Loan__c          17079 non-null  object 
 7   Total_Repayments__c         17079 non-null  float64
 8   Amount                      17079 non-null  float64
 9   Term_in_Weeks__c            17079 non-null  float64
 10  Payment_Frequency__c        17079 non-null  object 
 11  StageName                   17079 non-null  object 
 12  Applicant Age               17079 non-null  float64
 13  userId             

In [4]:
# Preview the first rows of the raw loan table.
df_loans.head()

Unnamed: 0,Min_IT_Loan_ID__c,Opp_Number__c,Id,AccountID,Number_Of_Loans_Granted__c,Num_Of_Loans_Paid__c,Purpose_of_Loan__c,Total_Repayments__c,Amount,Term_in_Weeks__c,Payment_Frequency__c,StageName,Applicant Age,userId,loanId,count,LoanIdFormat
0,1014484.0,6327397.0,0062x00000Ec3OhAAJ,0012x00000hYeJ7AAK,6.0,3.0,Living Expenses,4.0,1250.0,16.0,Monthly,Loan Paid,39.0,10132,12931,0,1
1,945258.0,5519027.0,0062x00000E3omCAAR,0010K00001cQWDjQAO,47.0,46.0,Travel Expenses,5.0,2050.0,20.143,Monthly,Loan Paid,43.0,37,10736,5,1
2,919931.0,5180314.0,0062x00000DrdLCAAZ,0010K00001cQWDjQAO,47.0,45.0,Travel Expenses,4.0,2050.0,16.429,Monthly,Loan Paid,43.0,37,2729,5,1
3,939812.0,5431995.0,0062x00000DVGTYAA5,0010K00001hTzwHQAS,46.0,44.0,Living Expenses,5.0,2050.0,19.857,Monthly,Loan Paid,36.0,146,2281,5,1
4,911555.0,5098387.0,0062x00000DZWmGAAX,0010K00001cQWDjQAO,47.0,44.0,Home Maintenance & Repairs,2.0,2000.0,5.0,Monthly,Loan Paid,43.0,37,2608,5,1


In [5]:
# Column names of the raw table (the next cell drops most of them).
df_loans.columns

Index(['Min_IT_Loan_ID__c', 'Opp_Number__c', 'Id', 'AccountID',
       'Number_Of_Loans_Granted__c', 'Num_Of_Loans_Paid__c',
       'Purpose_of_Loan__c', 'Total_Repayments__c', 'Amount',
       'Term_in_Weeks__c', 'Payment_Frequency__c', 'StageName',
       'Applicant Age', 'userId', 'loanId', 'count', 'LoanIdFormat'],
      dtype='object')

In [6]:
# Discard the descriptive loan / applicant attributes so that only the
# remaining columns (userId, loanId, count, LoanIdFormat) are carried forward.
df_loans_filter = df_loans.drop(
    columns=[
        'Min_IT_Loan_ID__c',
        'Opp_Number__c',
        'Id',
        'AccountID',
        'Number_Of_Loans_Granted__c',
        'Num_Of_Loans_Paid__c',
        'Purpose_of_Loan__c',
        'Total_Repayments__c',
        'Amount',
        'Term_in_Weeks__c',
        'Payment_Frequency__c',
        'StageName',
        'Applicant Age',
    ]
)

# (rows, columns) of the trimmed frame, followed by its schema
print(df_loans_filter.shape)
df_loans_filter.info()

(17079, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17079 entries, 0 to 17078
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   userId        17079 non-null  int64
 1   loanId        17079 non-null  int64
 2   count         17079 non-null  int64
 3   LoanIdFormat  17079 non-null  int64
dtypes: int64(4)
memory usage: 533.8 KB


In [7]:
# Preview the first rows of the trimmed frame.
df_loans_filter.head()

Unnamed: 0,userId,loanId,count,LoanIdFormat
0,10132,12931,0,1
1,37,10736,5,1
2,37,2729,5,1
3,146,2281,5,1
4,37,2608,5,1


In [18]:
# Number of rows per userId, most frequent first.
df_loans_filter.userId.value_counts() 

userId
2870     22
1571     22
1614     20
10138    20
513      18
         ..
4178      1
4864      1
4260      1
4622      1
6711      1
Name: count, Length: 8179, dtype: int64

In [19]:
# Number of rows per loanId, most frequent first.
df_loans_filter.loanId.value_counts() 

loanId
4371     10
13539    10
13438    10
13721    10
13512    10
         ..
6988      1
7046      1
7112      1
7076      1
8806      1
Name: count, Length: 10576, dtype: int64

In [12]:
# Loans: count the rows that reference each loanId, keep the ids seen at
# least 10 times, and build a row-aligned boolean mask over df_loans_filter.
df_loans_cnt = df_loans_filter.groupby('loanId').size().to_frame('count')
popular_loans = list(set(df_loans_cnt[df_loans_cnt['count'] >= 10].index))
loans_filter = df_loans_filter['loanId'].isin(popular_loans).to_numpy()

# Users: same procedure keyed on userId, with a threshold of 22 rows.
df_users_cnt = df_loans_filter.groupby('userId').size().to_frame('count')
active_users = list(set(df_users_cnt[df_users_cnt['count'] >= 22].index))
users_filter = df_loans_filter['userId'].isin(active_users).to_numpy()

In [13]:
# Boolean mask: True for rows of df_loans_filter whose loanId is in popular_loans.
loans_filter

array([False, False, False, ..., False, False, False])

In [14]:
# Boolean mask: True for rows of df_loans_filter whose userId is in active_users.
users_filter

array([False, False, False, ..., False, False, False])

In [15]:
# Keep only rows that pass BOTH masks (popular loan AND active user).
# NOTE(review): the head() shown right after this cell has no rows, so with
# the current thresholds the intersection appears to be empty - confirm the
# thresholds before relying on this frame.
df_ratings_filtered = df_loans_filter[loans_filter & users_filter]

In [16]:
# Preview the rows that survived both filters.
df_ratings_filtered.head()

Unnamed: 0,userId,loanId,count,LoanIdFormat


In [20]:
# One row per userId: the first occurrence is kept (pandas default) and a
# new frame is returned, leaving df_loans_filter untouched.
ddff = df_loans_filter.drop_duplicates(subset='userId')

In [21]:
# Preview the de-duplicated frame (original row labels are preserved).
ddff.head()

Unnamed: 0,userId,loanId,count,LoanIdFormat
0,10132,12931,0,1
1,37,10736,5,1
3,146,2281,5,1
5,539,3141,5,1
6,1237,2772,4,1


In [22]:
# (rows, columns) after keeping one row per userId.
ddff.shape

(8179, 4)

In [24]:
# pivot into a loan-user matrix: rows are loanId, columns are userId, cells
# hold the `count` column; (loan, user) pairs absent from ddff become 0
loans_user_mat = ddff.pivot(
    index='loanId', columns='userId', values='count').fillna(0)

In [25]:
# Display the loan-user matrix (pandas truncates the wide/long frame).
loans_user_mat

userId,1,2,3,4,5,6,7,8,9,10,...,10134,10135,10136,10137,10138,10139,10140,10141,10142,10143
loanId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# create mapper from movie title to its row position in movie_user_mat
# NOTE(review): neither df_movies nor movie_user_mat is defined in any cell
# visible in this notebook (the loan cells build loans_user_mat instead), so
# this cell appears to rely on kernel state from elsewhere - confirm it still
# runs after Restart & Run All.
hashmap = {
        movie: i for i, movie in
            enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title)) # noqa
}

In [43]:
# Display the title -> row-position mapping.
hashmap

{'Toy Story (1995)': 0,
 'Jumanji (1995)': 1,
 'Grumpier Old Men (1995)': 2,
 'Heat (1995)': 3,
 'Sabrina (1995)': 4,
 'GoldenEye (1995)': 5,
 'American President, The (1995)': 6,
 'Casino (1995)': 7,
 'Sense and Sensibility (1995)': 8,
 'Ace Ventura: When Nature Calls (1995)': 9,
 'Get Shorty (1995)': 10,
 'Leaving Las Vegas (1995)': 11,
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)': 12,
 'Babe (1995)': 13,
 'Dead Man Walking (1995)': 14,
 'Clueless (1995)': 15,
 'Seven (a.k.a. Se7en) (1995)': 16,
 'Pocahontas (1995)': 17,
 'Usual Suspects, The (1995)': 18,
 "Mr. Holland's Opus (1995)": 19,
 'From Dusk Till Dawn (1996)': 20,
 'Broken Arrow (1996)': 21,
 'Happy Gilmore (1996)': 22,
 'Braveheart (1995)': 23,
 'Taxi Driver (1976)': 24,
 'Birdcage, The (1996)': 25,
 'Bad Boys (1995)': 26,
 'Apollo 13 (1995)': 27,
 'Batman Forever (1995)': 28,
 'Casper (1995)': 29,
 'Congo (1995)': 30,
 'Crimson Tide (1995)': 31,
 'Desperado (1995)': 32,
 'Die Hard: With a Vengeance (1995)': 33,
 'First Kni

In [44]:
# transform the dense pivot table to a scipy CSR sparse matrix
# NOTE(review): movie_user_mat is not defined in any cell visible here -
# confirm where it comes from.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [45]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
movie_user_mat_sparse

<450x385 sparse matrix of type '<class 'numpy.float32'>'
	with 37117 stored elements in Compressed Sparse Row format>

In [12]:
# pivot ratings into movie features: one row per movieId, one column per
# userId, cell = rating, with missing (movie, user) pairs filled with 0
# NOTE(review): df_ratings is not defined in any cell visible here - confirm
# where it is loaded.
df_movie_features = df_ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
# convert dataframe of movie features to scipy sparse matrix
mat_movie_features = csr_matrix(df_movie_features.values)

In [13]:
# Preview the first rows of the movie-by-user rating matrix.
df_movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
class KnnRecommender:
    """
    Item-based collaborative filtering recommender with KNN implemented
    by sklearn's NearestNeighbors.

    Typical use: construct with the two CSV paths, optionally call
    set_filter_params / set_model_params, then call make_recommendations.
    """

    # minimum similarity score (0-100) for a title to count as a candidate
    _MIN_MATCH_RATIO = 60

    def __init__(self, path_movies, path_ratings):
        """
        Recommender requires path to data: movies data and ratings data

        Parameters
        ----------
        path_movies: str, movies data file path

        path_ratings: str, ratings data file path
        """
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        # thresholds of 0 keep every movie and every user
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        set rating frequency threshold to filter less-known movies and
        less active users

        Parameters
        ----------
        movie_rating_thres: int, minimum number of ratings received by users

        user_rating_thres: int, minimum number of ratings a user gives
        """
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors,
        e.g. (20, 'brute', 'cosine', -1)

        Parameters
        ----------
        n_neighbors: int, optional (default = 5)

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

        n_jobs: int or None, optional (default=None)
        """
        # when more than one worker is requested (or -1), point joblib's
        # temp folder at /tmp through the environment
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(**{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs})

    def _prep_data(self):
        """
        prepare data for recommender

        Return
        ------
        1. movie-user scipy sparse matrix
        2. hashmap of movie title to row index in movie-user scipy sparse matrix
        """
        # read data
        df_movies = pd.read_csv(
            self.path_movies,
            usecols=['movieId', 'title'],
            dtype={'movieId': 'int32', 'title': 'str'})
        df_ratings = pd.read_csv(
            self.path_ratings,
            usecols=['userId', 'movieId', 'rating'],
            dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
        # filter data: keep movies with enough ratings ...
        df_movies_cnt = pd.DataFrame(
            df_ratings.groupby('movieId').size(),
            columns=['count'])
        popular_movies = list(set(df_movies_cnt.query('count >= @self.movie_rating_thres').index))  # noqa
        movies_filter = df_ratings.movieId.isin(popular_movies).values

        # ... and users who rated enough movies
        df_users_cnt = pd.DataFrame(
            df_ratings.groupby('userId').size(),
            columns=['count'])
        active_users = list(set(df_users_cnt.query('count >= @self.user_rating_thres').index))  # noqa
        users_filter = df_ratings.userId.isin(active_users).values

        df_ratings_filtered = df_ratings[movies_filter & users_filter]

        # pivot and create movie-user matrix (unrated pairs become 0)
        movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)
        # create mapper from movie title to row index.
        # NOTE: titles are dict keys, so a title that occurs more than once
        # keeps only its last row index.
        hashmap = {
            movie: i for i, movie in
            enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title)) # noqa
        }
        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up the large intermediates before returning
        del df_movies, df_movies_cnt, df_users_cnt
        del df_ratings, df_ratings_filtered, movie_user_mat
        gc.collect()
        return movie_user_mat_sparse, hashmap

    @staticmethod
    def _resolve_scorer():
        """
        return a callable (str, str) -> score in 0..100 used for title matching

        Uses `fuzz.ratio` when a `fuzz` module is already available in the
        module namespace; otherwise falls back to the standard library's
        difflib.SequenceMatcher so matching works without extra packages.
        """
        try:
            return fuzz.ratio  # noqa: F821 - optional, provided by the notebook
        except NameError:
            def _difflib_ratio(left, right):
                matcher = difflib.SequenceMatcher(None, left, right)
                return int(round(100 * matcher.ratio()))
            return _difflib_ratio

    def _fuzzy_matching(self, hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None

        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        Return
        ------
        index of the closest match, or None when no title scores at
        least 60
        """
        scorer = self._resolve_scorer()
        wanted = fav_movie.lower()
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = scorer(title.lower(), wanted)
            if ratio >= self._MIN_MATCH_RATIO:
                match_tuple.append((title, idx, ratio))
        # sort ascending by score, then reverse so the best match is first
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
            return None
        print('Found possible matches in our database: '
              '{0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

    def _inference(self, model, data, hashmap, fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie

        Parameters
        ----------
        model: sklearn model, knn model

        data: movie-user matrix

        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations

        Return
        ------
        list of (row index, distance) tuples ordered from the most distant
        to the closest neighbour; the single nearest hit (presumably the
        query movie itself) is dropped. Empty list when the input title
        cannot be matched.
        """
        # fit
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        if idx is None:
            # nothing to look up; indexing the matrix with None would fail
            return []
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        # never ask for more neighbours than there are rows in the matrix
        n_neighbors = min(n_recommendations + 1, data.shape[0])
        distances, indices = model.kneighbors(
            data[idx],
            n_neighbors=n_neighbors)
        # ravel (not squeeze) so a single neighbour still yields a list
        ranked = sorted(
            zip(indices.ravel().tolist(), distances.ravel().tolist()),
            key=lambda x: x[1]
        )
        # reverse the ascending order and drop the nearest hit
        raw_recommends = ranked[:0:-1]
        print('It took my system {:.2f}s to make inference \n              '
              .format(time.time() - t0))
        # return recommendation (row index, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n movie recommendations and print them

        Parameters
        ----------
        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(
            self.model, movie_user_mat_sparse, hashmap,
            fav_movie, n_recommendations)
        if not raw_recommends:
            # no match (already reported) or nothing besides the query itself
            return
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i+1, reverse_hashmap[idx], dist))


In [20]:
if __name__ == '__main__':
    # build the recommender from the MovieLens movie and rating files
    recommender = KnnRecommender(
        path_movies='MovieLens/movies.csv',
        path_ratings='MovieLens/ratings.csv',
    )
    # keep movies with >= 50 ratings and users with >= 50 ratings
    recommender.set_filter_params(
        movie_rating_thres=50,
        user_rating_thres=50,
    )
    # brute-force cosine KNN, 20 neighbours, n_jobs=-1
    recommender.set_model_params(
        n_neighbors=20,
        algorithm='brute',
        metric='cosine',
        n_jobs=-1,
    )
    # print 10 recommendations for the given title
    recommender.make_recommendations(fav_movie="Iron Man", n_recommendations=10)

You have input movie: Iron Man
Found possible matches in our database: ['Iron Man (2008)']

Recommendation system start to make inference
......

It took my system 0.05s to make inference 
              
Recommendations for Iron Man:
1: Kung Fu Panda (2008), with distance of 0.37368708848953247
2: Inception (2010), with distance of 0.3691744804382324
3: Up (2009), with distance of 0.3688569664955139
4: Guardians of the Galaxy (2014), with distance of 0.36875778436660767
5: Star Trek (2009), with distance of 0.36602938175201416
6: Batman Begins (2005), with distance of 0.36275893449783325
7: Avatar (2009), with distance of 0.3108932375907898
8: WALL·E (2008), with distance of 0.2981378436088562
9: Dark Knight, The (2008), with distance of 0.287839412689209
10: Avengers, The (2012), with distance of 0.2853195071220398
