In [1]:
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import linear_kernel
import numpy as np



In [2]:
data = pd.read_csv("./clean_df/clean_df.csv")

In [3]:
data = data.drop(['Unnamed: 0'],axis = 1)

In [4]:
data = data.dropna(subset=['overview','director','runtime','year']).reset_index(drop=True)

In [5]:
data = data.fillna('')

In [6]:
text_data = ['title','director','actor','overview','genres_list','key','country']
data[text_data] = data[text_data].astype(str)

In [7]:
data["key"] = data["key"].str.encode('ascii', 'ignore').str.decode('ascii')

In [8]:
def to_dummy(col,num = None):
    li = set()
    for i in range(len(data[col])):
        if num is None:
            try:
                num = len(data[col][i].split(','))
            except:
                print(data[col][i])
        for act in data[col][i].split(',')[:num]:
            li.add(act)
    li = list(li)
    for element in li:
        data[element] = data[col].astype(str).str.contains(element, case=False).astype(int)

In [9]:
dum = ['country','director','actor','genres_list','key']
for d in dum:
    if d == 'actor':
        to_dummy(d,num = 4)
    else:
        to_dummy(d)
    print(d)

country
director
actor
genres_list
key


### sklearn

In [10]:
X = data.drop(['id','title','director','actor', 'overview','genres_list','key','country'], axis=1)

In [11]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [12]:
X.shape

(4770, 19229)

In [13]:
tfidf = TfidfVectorizer(stop_words = 'english')  # initialising the TF-IDF Vector object
tfidf_matrix = tfidf.fit_transform(data['overview'])  # Constructing the TF-IDF Matrix (no. of movies x every word in vocabulary)
tfidf_matrix.shape

(4770, 20876)

In [14]:
X = hstack([X,tfidf_matrix]).toarray()

In [15]:
cosine_sim = linear_kernel(X, X)  # Constructing the Cosine Similarity Matrix (no. of movies x no. of movies)
cosine_sim.shape

(4770, 4770)

In [16]:
cosine_sim

array([[ 1.97855178e+04, -4.13424622e+00, -8.65318132e+00, ...,
        -4.06511417e+01, -4.08878824e+01, -3.98424776e+01],
       [-4.13424622e+00,  1.81518036e+04,  3.78242191e+02, ...,
        -1.27782841e+01, -3.17877205e+01, -4.16209502e+01],
       [-8.65318132e+00,  3.78242191e+02,  1.48416500e+04, ...,
        -3.44993543e+01, -1.86323256e+01, -2.83943588e+01],
       ...,
       [-4.06511417e+01, -1.27782841e+01, -3.44993543e+01, ...,
         2.07646290e+04, -2.27336759e+00, -1.12869813e+01],
       [-4.08878824e+01, -3.17877205e+01, -1.86323256e+01, ...,
        -2.27336759e+00,  8.04637577e+03,  7.23073814e+00],
       [-3.98424776e+01, -4.16209502e+01, -2.83943588e+01, ...,
        -1.12869813e+01,  7.23073814e+00,  9.97874978e+03]])

In [17]:
with open('cosine_sim.npy', 'wb') as f:
    np.save(f, cosine_sim)

In [18]:
with open('cosine_sim.npy', 'rb') as f:
    cosine_sim = np.load(f)

In [19]:
indices = pd.Series(data.index, index = data['title']).drop_duplicates()

In [20]:
# Function that inputs movie titles and outputs top 10 movies similar to it

def get_recommendations(title, cosine_sim = cosine_sim):
  idx = indices[title]
  
  sim_scores = list(enumerate(cosine_sim[idx]))  # Get the similarity scores of all movies wrt input movie
  sim_scores = sorted(sim_scores, key = lambda x : x[1], reverse = True)
  sim_scores = sim_scores[1:11]
  
  movie_indices = [i[0] for i in sim_scores]
  
  return data['title'].iloc[movie_indices]

In [21]:
get_recommendations('The Dark Knight Rises')

65                     The Dark Knight
119                      Batman Begins
2207                         12 Rounds
186                        Bad Boys II
303                           Catwoman
95                        Interstellar
708     Maze Runner: The Scorch Trials
2721                 Seven Psychopaths
933                   Shanghai Knights
96                           Inception
Name: title, dtype: object

### Use pyspark

In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
data_spark=spark.createDataFrame(data)

In [None]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split
from pyspark.sql.functions import lower, col
data_spark = data_spark.withColumn("overview_splitted", split(lower(col("overview")), " "))

In [None]:
word2Vec = Word2Vec(vectorSize=100, minCount=0, maxIter=100, inputCol="overview_splitted", outputCol="features")
model = word2Vec.fit(data_spark)

In [None]:
result = model.transform(data_spark)

In [None]:
result

In [None]:
from pyspark.ml.feature import VectorAssembler
assemble=VectorAssembler(inputCols=[
 'popularity',
 'vote_average',
 'year'], outputCol='feature')
assembled_data=assemble.transform(data_spark)