In [1]:
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import linear_kernel
import numpy as np



In [2]:
data = pd.read_csv("/workspaces/IDS706-Final-Project/clean_df/clean_df.csv")

In [3]:
data = data.drop(['Unnamed: 0'],axis = 1)

In [4]:
data = data.dropna(subset=['overview','director','runtime','year']).reset_index(drop=True)

In [5]:
data = data.fillna('')

In [6]:
text_data = ['title','director','actor','overview','genres_list','key','country']
data[text_data] = data[text_data].astype(str)

In [7]:
data["key"] = data["key"].str.encode('ascii', 'ignore').str.decode('ascii')

In [8]:
def to_dummy(col,num = None):
    li = set()
    for i in range(len(data[col])):
        if num is None:
            try:
                num = len(data[col][i].split(','))
            except:
                print(data[col][i])
        for act in data[col][i].split(',')[:num]:
            li.add(act)
    li = list(li)
    for element in li:
        data[element] = data[col].astype(str).str.contains(element, case=False).astype(int)

In [10]:
dum = ['country','director','actor','genres_list','key']
for d in dum:
    if d == 'actor':
        to_dummy(d,num = 4)
    else:
        to_dummy(d)
    print(d)

director
actor
genres_list
key


### sklearn

In [11]:
X = data.drop(['id','title','director','actor', 'overview','genres_list','key','country'], axis=1)

In [12]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
X.shape

In [13]:
tfidf = TfidfVectorizer(stop_words = 'english')  # initialising the TF-IDF Vector object
tfidf_matrix = tfidf.fit_transform(data['overview'])  # Constructing the TF-IDF Matrix (no. of movies x every word in vocabulary)
tfidf_matrix.shape

(4770, 20876)

In [14]:
X = hstack([X,tfidf_matrix]).toarray()

In [15]:
cosine_sim = linear_kernel(X, X)  # Constructing the Cosine Similarity Matrix (no. of movies x no. of movies)
cosine_sim.shape

(4770, 4770)

In [16]:
cosine_sim

array([[ 1.98721887e+04, -5.87808517e-01, -8.34044232e+00, ...,
        -3.93128269e+01, -4.15661109e+01, -3.94919473e+01],
       [-5.87808517e-01,  1.68925386e+04,  3.81770050e+02, ...,
        -8.22484983e+00, -2.92508296e+01, -3.80553006e+01],
       [-8.34044232e+00,  3.81770050e+02,  1.46030091e+04, ...,
        -3.31796187e+01, -1.93291333e+01, -2.80624078e+01],
       ...,
       [-3.93128269e+01, -8.22484983e+00, -3.31796187e+01, ...,
         2.03497122e+04, -1.94459951e+00, -9.92945442e+00],
       [-4.15661109e+01, -2.92508296e+01, -1.93291333e+01, ...,
        -1.94459951e+00,  8.04468799e+03,  6.57172166e+00],
       [-3.94919473e+01, -3.80553006e+01, -2.80624078e+01, ...,
        -9.92945442e+00,  6.57172166e+00,  1.22101203e+04]])

In [17]:
with open('cosine_sim.npy', 'wb') as f:
    np.save(f, cosine_sim)

In [18]:
with open('cosine_sim.npy', 'rb') as f:
    cosine_sim = np.load(f)

In [19]:
indices = pd.Series(data.index, index = data['title']).drop_duplicates()

In [20]:
# Function that inputs movie titles and outputs top 10 movies similar to it

def get_recommendations(title, cosine_sim = cosine_sim):
  idx = indices[title]
  
  sim_scores = list(enumerate(cosine_sim[idx]))  # Get the similarity scores of all movies wrt input movie
  sim_scores = sorted(sim_scores, key = lambda x : x[1], reverse = True)
  sim_scores = sim_scores[1:11]
  
  movie_indices = [i[0] for i in sim_scores]
  
  return data['title'].iloc[movie_indices]

In [21]:
get_recommendations('The Dark Knight Rises')

65                     The Dark Knight
119                      Batman Begins
2207                         12 Rounds
186                        Bad Boys II
303                           Catwoman
95                        Interstellar
708     Maze Runner: The Scorch Trials
933                   Shanghai Knights
2721                 Seven Psychopaths
96                           Inception
Name: title, dtype: object

Bad pipe message: %s [b"\x809\xc8Q:\xe4,\xe8\xbfD\x0f\x0cxc\x12A\xf8\xb5 '\xb4\xfe\xd0\xe4:\xe4e\xe8\x83\ng\xa3\xff\xa3\x1f\xc4B\xc2\xdd\xc1\xd1r\xd7\xb5\xd1\xb7n\x00\xeaU\xde\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xc4'[\xf6D;J\x8c6\xca\xf7\xe2j&-\x17\xefx\xa5\xc4f\x9a\xa3\x07\x8ax\x00"]
Bad pipe message: %s [b"Vr\xa9\x99rr\xc9C\xd5\x10/\x8f\xe0Q8\xd41\xc6\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\

### Use pyspark

In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
data_spark=spark.createDataFrame(data)

In [None]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split
from pyspark.sql.functions import lower, col
data_spark = data_spark.withColumn("overview_splitted", split(lower(col("overview")), " "))

In [None]:
word2Vec = Word2Vec(vectorSize=100, minCount=0, maxIter=100, inputCol="overview_splitted", outputCol="features")
model = word2Vec.fit(data_spark)

In [None]:
result = model.transform(data_spark)

In [None]:
result

In [None]:
from pyspark.ml.feature import VectorAssembler
assemble=VectorAssembler(inputCols=[
 'popularity',
 'vote_average',
 'year'], outputCol='feature')
assembled_data=assemble.transform(data_spark)