In [1]:
import numpy as np
import pandas as pd
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNWithMeans, KNNBasic, KNNBaseline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

In [2]:
spark = SparkSession.builder.master('local').getOrCreate() 

In [3]:
ratings = spark.read.json('data/ratings.json')#, schema=schema)

In [4]:
ratings.persist()
ratings.dtypes

[('movie_id', 'bigint'),
 ('rating', 'bigint'),
 ('timestamp', 'double'),
 ('user_id', 'bigint')]

In [5]:
# als = ALS(maxIter=10,
#           rank=10,
#           userCol="user_id", 
#           itemCol="movie_id", 
#           ratingCol="rating")

#model = als.fit(ratings)

In [6]:
# predictions = model.transform(ratings)
# predictions.persist()
# evaluator = RegressionEvaluator(metricName='rmse',
#                                labelCol='rating',
#                                predictionCol='prediction')
# rmse = evaluator.evaluate(predictions)
# print("Root-mean-square error = " + str(rmse))

In [7]:
# predictions.persist()
# predictions.show(5)

In [41]:
requests = spark.read.json('data/requests.json')
requests.dtypes

[('movie_id', 'bigint'),
 ('rating', 'double'),
 ('timestamp', 'double'),
 ('user_id', 'bigint')]

In [9]:
#requests.show(5)

In [10]:
#req_predict = model.transform(requests)

In [11]:
#req_predict.show(5)

In [12]:
# final_requests = req_predict.drop('rating')
# final_requests = final_requests.withColumnRenamed('prediction','rating')
# final_requests.show(5)

In [6]:
movie_titles = spark.read.csv('data/movies.dat', sep=":")
movie_titles = movie_titles.drop('_c1','_c3')
movie_titles.persist()
#movie_titles.head(5)

DataFrame[_c0: string, _c2: string, _c4: string]

In [7]:
users = spark.read.csv('data/users.dat', sep=':')
users = users.drop('_c1', '_c3', '_c5', '_c7')
users.persist()
#users.show(5)

DataFrame[_c0: string, _c2: string, _c4: string, _c6: string, _c8: string]

In [8]:
old_cols = ['_c0','_c2','_c4', '_c6', '_c8']
new_cols = ['id','gender','age_group','occupation','zipcode']

def rename_cols(new_cols,old_cols,data):
    for i in range(len(old_cols)):
        data = data.withColumnRenamed(old_cols[i],new_cols[i])
    return data

users = rename_cols(new_cols,old_cols,users)
users.show(5)

+---+------+---------+----------+-------+
| id|gender|age_group|occupation|zipcode|
+---+------+---------+----------+-------+
|  1|     F|        1|        10|  48067|
|  2|     M|       56|        16|  70072|
|  3|     M|       25|        15|  55117|
|  4|     M|       45|         7|  02460|
|  5|     M|       25|        20|  55455|
+---+------+---------+----------+-------+
only showing top 5 rows



In [9]:
new = ['id','title','genre']
old = ['_c0', '_c2','_c4']

movie_titles = rename_cols(new,old,movie_titles)
movie_titles.show(5)

+---+--------------------+--------------------+
| id|               title|               genre|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Animation|Childre...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|        Comedy|Drama|
|  5|Father of the Bri...|              Comedy|
+---+--------------------+--------------------+
only showing top 5 rows



In [10]:
fulldata = ratings.join(movie_titles, ratings.movie_id == movie_titles.id)
fulldata = fulldata.join(users, fulldata.user_id == users.id)
fulldata = fulldata.drop('id')
fulldata = fulldata.dropna()
fulldata.persist()
#fulldata.show(5)

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint, title: string, genre: string, gender: string, age_group: string, occupation: string, zipcode: string]

In [11]:
newdata = fulldata[['rating','timestamp','genre','gender','age_group','occupation']]
#newdata.show(5)

In [34]:
onehot = OneHotEncoder()
cols = ['genre','gender','age_group','occupation']
to_encode = newdata[cols].toPandas()

In [35]:
onehot.fit(to_encode)
new_cols = onehot.get_feature_names(cols)
ohe_data = onehot.transform(to_encode).toarray()

In [36]:
col_names = ['timestamp'] + list(new_cols)
ohe_data = np.concatenate((newdata[['timestamp']].toPandas(),ohe_data),axis=1)
X = pd.DataFrame(ohe_data,columns=col_names)
X.head()

Unnamed: 0,timestamp,genre_ Miami Beach (1988),genre_Action,genre_Action|Adventure,genre_Action|Adventure|Animation,genre_Action|Adventure|Animation|Children's|Fantasy,genre_Action|Adventure|Animation|Horror|Sci-Fi,genre_Action|Adventure|Children's|Comedy,genre_Action|Adventure|Children's|Fantasy,genre_Action|Adventure|Children's|Sci-Fi,...,occupation_19,occupation_2,occupation_20,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9
0,956678732.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,956678754.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,956678777.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,956678856.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,956678856.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
y = newdata[['rating']].toPandas()
y = y['rating']
y.shape

(673236,)

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=123)

# ss = StandardScaler()
# ss.fit(X_train)
# X_train_s = ss.transform(X_train)
# X_test_s = ss.transform(X_test)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
test_preds = clf.predict(X_test)

print(test_preds)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score


def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds, average=None)))
    print("Recall Score: {}".format(recall_score(labels, preds, average=None)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds, average=None)))

print_metrics(y_test, test_preds) 

[4 3 4 ... 4 5 2]
Precision Score: [0.23663266 0.19942104 0.32411791 0.40355271 0.38828072]
Recall Score: [0.23434725 0.17556776 0.37956604 0.44011671 0.28198729]
Accuracy Score: 0.34906630067316663
F1 Score: [0.23548441 0.18673574 0.3496574  0.42104238 0.32670582]


In [42]:
fullreqs = requests.join(movie_titles, requests.movie_id == movie_titles.id)
fullreqs = fullreqs.join(users, fullreqs.user_id == users.id)
fullreqs = fullreqs.drop('id')
#fullreqs = fullreqs.dropna()
df_fullreqs = fullreqs.toPandas()
df_fullreqs = df_fullreqs[~df_fullreqs['genre'].isna()]
#fullreqs.persist()
df_fullreqs.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,title,genre,gender,age_group,occupation,zipcode
0,2019,,956678777.0,6040,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama,M,25,6,11106
2,2858,,956679275.0,6040,American Beauty (1999),Comedy|Drama,M,25,6,11106
3,246,,956679413.0,6040,Hoop Dreams (1994),Documentary,M,25,6,11106
4,1617,,956679473.0,6040,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller,M,25,6,11106
5,2324,,956679629.0,6040,Life Is Beautiful (La Vita � bella) (1997),Comedy|Drama,M,25,6,11106


In [37]:
enc_reqs = df_fullreqs[cols]

ohe_data = onehot.transform(enc_reqs).toarray()
new_cols = onehot.get_feature_names(cols)
col_names = ['timestamp'] + list(new_cols)
ohe_data = np.concatenate((df_fullreqs[['timestamp']],ohe_data),axis=1)
X = pd.DataFrame(ohe_data,columns=col_names)
X.head()

Unnamed: 0,timestamp,genre_ Miami Beach (1988),genre_Action,genre_Action|Adventure,genre_Action|Adventure|Animation,genre_Action|Adventure|Animation|Children's|Fantasy,genre_Action|Adventure|Animation|Horror|Sci-Fi,genre_Action|Adventure|Children's|Comedy,genre_Action|Adventure|Children's|Fantasy,genre_Action|Adventure|Children's|Sci-Fi,...,occupation_19,occupation_2,occupation_20,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9
0,956678777.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,956679275.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,956679413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,956679473.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,956679629.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [43]:
#X_scaled = ss.transform(X)
preds = clf.predict(X)
requests = requests.toPandas()
requests = requests[~fullreqs.toPandas()['genre'].isna()]
requests['rating'] = preds
requests.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,4,956678777.0,6040
2,2858,4,956679275.0,6040
3,246,4,956679413.0,6040
4,1617,4,956679473.0,6040
5,2324,2,956679629.0,6040


In [39]:
requests.w

(263295,)