# __Importing libraries__ 

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer    
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing
from scipy.sparse import hstack
import pandas_profiling

# __Data preprocessing__

In [3]:
# Load the CSV and discard its first column by position
df = pd.read_csv("MoviesOnStreamingPlatforms.csv").iloc[:, 1:]

In [4]:
# Per-column count of missing values, plus the same figure as a percentage of all rows
miss = df.isnull().sum().to_frame(name="miss_count")
miss["miss_%"] = (miss["miss_count"] / len(df)) * 100
miss

Unnamed: 0,miss_count,miss_%
ID,0,0.0
Title,0,0.0
Year,0,0.0
Age,9390,56.07979
IMDb,571,3.410177
Rotten Tomatoes,11586,69.194935
Netflix,0,0.0
Hulu,0,0.0
Prime Video,0,0.0
Disney+,0,0.0


In [5]:
# Cleaning pipeline, written as one chained expression instead of in-place mutations:
#   1. drop the two columns whose missing percentage exceeds 50%
#   2. drop rows with a missing value in any column used later on
#   3. renumber rows so labels and positions coincide
#   4. cast ID and Year to object so they are not picked up as numeric features
df = (
    df.drop(columns=['Rotten Tomatoes', 'Age'])
      .dropna(subset=['IMDb', 'Directors', 'Genres', 'Country', 'Language', 'Runtime'])
      .reset_index(drop=True)
      .astype({'ID': 'object', 'Year': 'object'})
)

# __Numerical-variables recommender__

### Step-1: Select the numerical variables

In [6]:
# Keep only the float64/int64 columns; object-typed columns are left out
ndf = df.select_dtypes(include=['float64', "int64"])

### Step-2: Scale the numerical variables with a min-max scaler so that every feature lies in [0, 1] and contributes comparably to the similarity score

In [7]:
# Min-max scaler mapping each column onto [0, 1]
# (`preprocessing` is the sklearn module already imported in the first cell)
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# fit_transform returns an unlabelled array, so re-attach the column names
ndfmx = pd.DataFrame(scaler.fit_transform(ndf), columns=ndf.columns)

# Preview the first 5 scaled records
ndfmx.head()

Unnamed: 0,IMDb,Netflix,Hulu,Prime Video,Disney+,Type,Runtime
0,0.946237,1.0,0.0,0.0,0.0,0.0,0.449541
1,0.935484,1.0,0.0,0.0,0.0,0.0,0.412844
2,0.913978,1.0,0.0,0.0,0.0,0.0,0.452599
3,0.913978,1.0,0.0,0.0,0.0,0.0,0.351682
4,0.946237,1.0,0.0,1.0,0.0,0.0,0.489297


### Step-3: Compute similarity score using cosine similarity

In [8]:
# Import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the scaled numeric features
sig = cosine_similarity(ndfmx, ndfmx)

# Reverse mapping: movie title -> row position in df.
# Series.drop_duplicates() compares the *values* (row positions, which are all
# unique after reset_index), so it never removed repeated titles. De-duplicate
# on the index instead, keeping the first occurrence, so that a title lookup
# always yields a single integer.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [9]:
# Peek at the first few title -> row-position pairs
indices.head()

Title
Inception                         0
The Matrix                        1
Avengers: Infinity War            2
Back to the Future                3
The Good, the Bad and the Ugly    4
dtype: int64

### Step-4: Writing a function to get recommendations based on the similarity score 

In [10]:
def give_rec(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    sig : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` is read as the scores of the
        movie at position ``i`` of ``df`` against every other movie.
        Defaults to the object bound to ``sig`` when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles taken from ``df['Title']``, ordered from the highest to the
        lowest score, never including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Position of the queried movie; a repeated title yields a Series of
    # positions, in which case the first occurrence is used.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of the queried movie against every movie
    scores = np.asarray(sig[idx], dtype=float).ravel()

    # Stable descending sort: equal scores keep ascending row order
    ranked = np.argsort(-scores, kind="stable")

    # Drop the queried movie by position instead of assuming it ranks first:
    # another movie with an equal score and a smaller row number would
    # otherwise be discarded in its place.
    ranked = ranked[ranked != idx][:top_n]

    # Titles of the most similar movies
    return df['Title'].iloc[ranked]

In [12]:
# Call give_rec() to get recommendations from the numeric-feature similarity matrix
give_rec("Inception", sig = sig)

2        Avengers: Infinity War
1528             Love Ni Bhavai
306     Springsteen On Broadway
1                    The Matrix
1269     Eh Janam Tumhare Lekhe
2388                     Kaakan
1122                Punjab 1984
9          Inglourious Basterds
1534            Sillu Karupatti
747           Khosla Ka Ghosla!
Name: Title, dtype: object

# __All-variables recommender__

In [14]:
# List the columns available for the all-variables recommender
df.columns

Index(['ID', 'Title', 'Year', 'IMDb', 'Netflix', 'Hulu', 'Prime Video',
       'Disney+', 'Type', 'Directors', 'Genres', 'Country', 'Language',
       'Runtime'],
      dtype='object')

### Step-1: Features preparation

In [15]:
# This function performs all the important preprocessing steps
def preprocess(df, numeric_cols=("IMDb", "Netflix", "Hulu", "Prime Video", "Disney+", "Runtime")):
    """Build a sparse feature matrix from the text and numeric columns of ``df``.

    The object-typed columns (except ``Title`` and ``ID``) are joined into one
    comma-separated string per row and vectorised with TF-IDF. The columns in
    ``numeric_cols`` are min-max scaled to [0, 1] and appended to the right of
    the TF-IDF columns, in the order given.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Side effect: the column ``all_text`` is added to (or
        overwritten on) this frame with the joined text of each row.
    numeric_cols : sequence of str, optional
        Names of the numeric columns to append. Defaults to the six columns
        this function has always used.

    Returns
    -------
    scipy sparse matrix
        One row per row of ``df``; TF-IDF columns followed by one column per
        entry of ``numeric_cols``.

    Raises
    ------
    KeyError
        If a name in ``numeric_cols`` is not a column of ``df``.
    """
    ##### Combining all text columns
    # Every object-typed column except the identifiers feeds the text blob.
    # 'all_text' is excluded as well: it is written below, so on a second call
    # with the same frame it would otherwise be folded back into itself.
    text_cols = [
        col for col in df.select_dtypes(include=['object']).columns
        if col not in ("Title", "ID", "all_text")
    ]

    # Join the text columns of each row with commas, skipping missing values
    df['all_text'] = df[text_cols].apply(
        lambda row: ','.join(row.dropna().astype(str)), axis=1
    )

    # Tokenizer that keeps alphabetic runs only (drops symbols and digits)
    token = RegexpTokenizer(r'[a-zA-Z]+')

    # TF-IDF over single words, lower-cased, English stop words removed
    cv = TfidfVectorizer(lowercase=True,
                         stop_words='english',
                         ngram_range=(1, 1),
                         tokenizer=token.tokenize)
    text_counts = cv.fit_transform(df['all_text'])

    ##### Numerical variables
    # Min-max scaling is applied column by column, so scaling only the
    # selected columns gives the same values as scaling every numeric column.
    numeric_cols = list(numeric_cols)
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(df[numeric_cols])

    # Append all numeric columns to the TF-IDF matrix in a single stack
    return hstack((text_counts, scaled))

### Step-2: Applying the function to our data and creating a sparse matrix

In [16]:
# Build the combined feature matrix (TF-IDF text columns followed by scaled numeric columns).
# Note: preprocess() also writes an 'all_text' column onto df.
mat = preprocess(df)
mat.shape

(15233, 11997)

### Step-3: Applying Cosine Similarity to compute the similarity score

In [17]:
# using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute the pairwise cosine similarity on the combined text + numeric features
sig2 = cosine_similarity(mat, mat)

# Reverse mapping: movie title -> row position in df.
# Series.drop_duplicates() compares the values (unique row positions), so it
# cannot remove repeated titles; de-duplicate on the index instead so that a
# title lookup always yields a single integer.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

### Step-4: Getting a recommendation from the improved system

In [19]:
# Same query as before, now ranked with the text + numeric similarity matrix
give_rec("Inception", sig = sig2)

922                   Get Santa
132     Solo: A Star Wars Story
517                   Studio 54
3            Back to the Future
976                    Godzilla
404        Terminator Salvation
25                   Ex Machina
242     What Happened to Monday
776    Resident Evil: Afterlife
47                        Senna
Name: Title, dtype: object