In [1]:
import numpy as np
import pandas as pd
from src.make_ratings_matrix import ratings_mat
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from src.tune_ALS import tune_ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
specials = pd.read_pickle('data/working_corpus.pkl')
reviews = pd.read_pickle('data/working_reviews.pkl')

In [4]:
ratings_matrix = ratings_mat(specials.special, reviews)
ratings_matrix.head()

Unnamed: 0,user,special,rating
0,0,1,8
1,0,9,7
2,0,11,8
3,0,23,7
4,1,1,10


In [5]:
spark_df = spark.createDataFrame(ratings_matrix) 

In [6]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=40)

In [36]:
als_model = ALS(
    itemCol='special',
    userCol='user',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.01,
    rank=30)

In [37]:
recommender = als_model.fit(train)

In [240]:
recommender.itemFactors.show(n=5, truncate=False)

+---+-------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                 |
+---+-------------------------------------------------------------------------------------------------------------------------+
|20 |[0.34923708, 1.6664461, 0.074719876, 1.4723135, 0.22537419, 1.2435533, 2.9566746, 0.43188784, 0.8410027, 0.92549103]     |
|30 |[1.0550938, 0.0034411687, 3.1768355, 1.1541634, 2.1262522, 2.0369828, 1.5203419, 1.2501311, 0.64875495, 0.85094464]      |
|40 |[0.0, 0.042460997, 1.6783029, 0.0, 0.9817157, 0.0, 2.093574, 0.46535543, 0.45538154, 0.6615584]                          |
|50 |[0.40841138, 0.18559955, 0.010831924, 1.8808203, 0.20633054, 0.32492447, 0.27824232, 0.017448524, 0.9464973, 0.004169744]|
|90 |[0.037490472, 2.5626118, 0.015014985, 1.029662, 0.031145398, 0.3210286, 0.6271866, 0.74002796, 0.01

In [241]:
sorted(recommender.userFactors.collect())[:5]

[Row(id=0, features=[0.7320649027824402, 0.809018611907959, 0.14970767498016357, 1.9326555728912354, 0.0, 0.09880394488573074, 0.0, 0.43030181527137756, 0.0, 0.3539911210536957]),
 Row(id=1, features=[0.0, 0.1937619298696518, 1.2614012956619263, 0.576866626739502, 0.3101431727409363, 0.005964495241641998, 0.22311478853225708, 0.71251380443573, 0.3658820390701294, 1.681159257888794]),
 Row(id=3, features=[0.0, 0.0, 2.3275201320648193, 0.6012542843818665, 0.0, 0.0, 0.0, 1.3192421197891235, 0.0, 0.8234591484069824]),
 Row(id=4, features=[0.0, 0.1937619298696518, 1.2614012956619263, 0.576866626739502, 0.3101431727409363, 0.005964495241641998, 0.22311478853225708, 0.71251380443573, 0.3658820390701294, 1.681159257888794]),
 Row(id=5, features=[0.0, 0.17438572645187378, 1.1352611780166626, 0.5191799402236938, 0.2791288495063782, 0.005368045996874571, 0.20080330967903137, 0.641262412071228, 0.3292938470840454, 1.5130434036254883])]

In [242]:
recommender.userFactors.orderBy('id').select('features').show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------+
|features                                                                                                       |
+---------------------------------------------------------------------------------------------------------------+
|[0.7320649, 0.8090186, 0.14970767, 1.9326556, 0.0, 0.098803945, 0.0, 0.43030182, 0.0, 0.35399112]              |
|[0.0, 0.19376193, 1.2614013, 0.5768666, 0.31014317, 0.0059644952, 0.22311479, 0.7125138, 0.36588204, 1.6811593]|
|[0.0, 0.0, 2.3275201, 0.6012543, 0.0, 0.0, 0.0, 1.3192421, 0.0, 0.82345915]                                    |
|[0.0, 0.19376193, 1.2614013, 0.5768666, 0.31014317, 0.0059644952, 0.22311479, 0.7125138, 0.36588204, 1.6811593]|
|[0.0, 0.17438573, 1.1352612, 0.51917994, 0.27912885, 0.005368046, 0.20080331, 0.6412624, 0.32929385, 1.5130434]|
+---------------------------------------------------------------------------------------

In [72]:
predicted_ratings = recommender.transform(train)

In [92]:
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predicted_ratings)
rmse

3.7443359358977157

In [94]:
predicted_ratings.where(predicted_ratings.user==2052).show()

+----+-------+------+----------+
|user|special|rating|prediction|
+----+-------+------+----------+
|2052|    148|    10|       NaN|
+----+-------+------+----------+



In [52]:
def get_top_n_recs(user, n=10):
    user_subset = spark_df.where(spark_df.user == user)
    print (type(user_subset.select()))
    user_subset_recs = recommender.recommendForUserSubset(user_subset, 5)
    top = user_subset_recs.select("recommendations.special", "recommendations.rating").first()
    recs = top[0]
    score = top[1]
    return recs, score

In [60]:
#pd.DataFrame(data={'user':[3300], 'special':[7], 'rating':[9]})
spark_df.unionAll(input.select($"*", lit(""), current_timestamp.cast("long")))


Unnamed: 0,user,special,rating
0,3300,7,9


In [71]:
test_subset = spark.createDataFrame(pd.DataFrame(data={'user':[300], 'special':[50], 'rating':[9]})) 
test_recs = recommender.recommendForUserSubset(test_subset,5)
top = test_recs.select("recommendations.special", "recommendations.rating").first()
top[0]

[259, 260, 179, 15, 297]

In [53]:
rec_1, score_1 = get_top_n_recs(1)

<class 'pyspark.sql.dataframe.DataFrame'>


In [48]:
specials.loc[reviews.iloc[1][reviews.iloc[1] > 0].index]

Unnamed: 0,special,text,lem_tokens,length,nmf_primary,nmf_secondary,kmn_cluster,MyGroups
1,Hannah Gadsby Douglas,The following is the transcript of Hannah Gadb...,"[explores, autism, affect, thinking, take, lit...",57454,7,4,2,English


In [49]:
specials.loc[rec_1]

Unnamed: 0,special,text,lem_tokens,length,nmf_primary,nmf_secondary,kmn_cluster,MyGroups
1,Hannah Gadsby Douglas,The following is the transcript of Hannah Gadb...,"[explores, autism, affect, thinking, take, lit...",57454,7,4,2,English
64,Eddie Murphy Raw,After achieving fame with Saturday Night Live ...,"[released, film, version, one, live, performan...",73209,6,4,4,Black American
306,Eddie Murphy Delirious,"Filmed on August 17, 1983 at DAR Constitution ...","[thank, much, two, time, far, two, time, befor...",41268,4,6,4,Black American
304,Bill Burr Let It Go 2010,[Quirky rock music] All right. Thank you. Than...,"[very, much, thank, coming, out, here, okay, o...",39929,7,8,3,Underdogs
212,Chris Rock Bigger Blacker 1999,Ladies and gentlemen... live from the world-fa...,"[chris, rock, whats, up, new, york, there, bro...",41112,6,4,4,Black American
298,Richard Pryor Live Sunset Strip 1982,Recorded at the Circle Star Theater in San Car...,"[california, lady, live, sunset, strip, richar...",48456,5,7,4,Black American
264,Bill Hicks Revelations 1993,Recorded at the London Dominion Theatre in Nov...,"[moon, fx, howling, wolf, cut, black, slab, lo...",44231,2,7,2,English
297,Richard Pryor Live Concert 1979,"Filmed in Long Beach, California on December 1...","[motor, squeaking, brake, car, door, open, car...",51845,7,3,4,Black American
57,Adam Devine Best Time Of Our Lives,"[rock music playing] [indistinct chatter] Hey,...","[man, lady, adam, devine, loud, hey, hows, eve...",46985,8,4,3,Morbid
283,Louis C K Shameless 2007,Please welcome Louis C.K.! Thank you. Thank yo...,"[nice, hello, everybody, ya, woo, good, thanks...",47405,8,7,3,Morbid


In [248]:
score_1

[11.169456481933594,
 10.043525695800781,
 9.943931579589844,
 9.885990142822266,
 9.87508773803711,
 8.96638298034668,
 8.952198028564453,
 8.79509162902832,
 8.579977035522461,
 8.243208885192871]

In [264]:
u = 400
rec_400, score_400 = get_top_n_recs(u)

In [305]:
specials.loc[reviews.iloc[u][reviews.iloc[u] > 0].index]

Unnamed: 0,special,text,lem_tokens,length
61,Wanda Sykes Not Normal,Ladies and gentlemen... Wanda Sykes! Yes. Yes....,"[goodness, start, saying, voted, trump, came, ...",34694


In [266]:
specials.loc[rec_400]

Unnamed: 0,special,text,lem_tokens,length
244,Jim Jefferies Alcoholocaust 2010,[Horn honks] [Indistinct conversations] [scatt...,"[speaking, indistinctly, laugh, laugh, cue, st...",42062
99,Mo Amer The Vagabond,A NETFLIX ORIGINAL COMEDY SPECIAL [audience ch...,"[mo, mo, mo, mo, mo, mo, mo, chanting, speed, ...",46487
30,Dan Soder Son Of A Gary,"Announcer: Ladies and gentlemen, Dan Soder! (c...","[wha, ah, right, right, continues, thank, righ...",50394
73,Nate Bargatze The Tennessee Kid,"Please welcome my daddy, Nate Bargatze. Were d...","[guy, coming, out, appreciate, uh, im, very, e...",50699
281,Jim Jefferies I Swear God 2009,"Ladies and gentlemen, Mr. Jim Jefferies. Hey. ...","[im, australia, ive, living, uk, past, 7, 8, y...",38651
274,Tom Segura Completely Normal 2014,"Ladies and gentlemen, Tom Segura! [Cheers and ...","[hope, live, up, expectation, realized, today,...",60252
161,Brian Regan Standing Up 2007,[Rock] [crowd cheering] [man] lets give a big...,"[regan, crowd, roar, right, thank, thank, than...",25739
303,Tom Segura Mostly Stories 2016,[soft piano music plays] [Tom] I love being a ...,"[job, world, love, being, l.a., comic, bam, al...",54043
150,Stewart Lee 90s Comedian 2006,"Recorded on 10 March 2006 at Chapter Arts, Can...","[mile, davis, kind, blue, voice, please, welco...",54219
226,Jen Kirkman Just Keep Livin 2017,"Hello. I should use a microphone, really. Hi. ...","[perfect, amount, didnt, stand, here, awkwardl...",63188


In [320]:
score_400

[19.650373458862305,
 18.51471710205078,
 18.08068084716797,
 17.754409790039062,
 17.488510131835938,
 17.422710418701172,
 17.04161262512207,
 16.441139221191406,
 16.422990798950195,
 15.99198055267334]

In [316]:
u = 1780
specials.loc[reviews.iloc[u][reviews.iloc[u] > 0].index]

Unnamed: 0,special,text,lem_tokens,length
120,Iliza Shlesinger Confirmed Kills,"Chicago, are you ready? Party goblins, are you...","[thank, thank, having, id, discus, something, ...",59055


In [317]:
rec_1780, score_1780 = get_top_n_recs(u)

In [318]:
specials.loc[rec_1780]

Unnamed: 0,special,text,lem_tokens,length
112,Daniel Sloss Jigsaw,[audience cheering] [announcer] Ladies and gen...,"[up, show, not, joke, question, id, answer, ho...",50170
267,Doug Stanhope Deadbeat Hero 2004,Liberty\n1a.The condition of being free from r...,"[act, believe, express, oneself, manner, one, ...",57711
119,Rowan Atkinson Live 1992,"Filmed in Boston, Massachusetts, at the Huntin...","[rowan, atkinson, performing, series, comedy, ...",29935
115,Bert Kreischer Secret Time,[applause] [male presenter] Ladies and gentlem...,"[applauding, crowd, louder, oh, yes, crowd, ye...",44408
297,Richard Pryor Live Concert 1979,"Filmed in Long Beach, California on December 1...","[distant, car, motor, squeaking, brake, car, d...",51845
284,Bill Burr Im Sorry Feel Way 2014,"[cheers and applause] All right, thank you! Th...","[going, thank, pleasure, here, greater, atlant...",63558
209,Pablo Francisco Ouch Live San Jose 2006,Are you ready? Brokeba...homies. Thats good. I...,"[right, here, go, thats, three, alright, hey, ...",34026
95,Jeff Foxworthy Totally Committed,"Ladies and gentlemen, please welcome Jeff Foxw...","[cincinnati, special, city, special, night, sp...",40068
150,Stewart Lee 90s Comedian 2006,"Recorded on 10 March 2006 at Chapter Arts, Can...","[mile, davis, kind, blue, voice, please, welco...",54219
154,Chris Rock Bring Pain,Live from the Takoma Theatre in Washington D.C...,"[give, mr., chris, rock, washington, d.c., cho...",41563


In [319]:
score_1780

[3.927140951156616,
 3.460350275039673,
 3.1687426567077637,
 3.143021821975708,
 3.0785598754882812,
 3.051854372024536,
 3.0238850116729736,
 2.9829797744750977,
 2.9790704250335693,
 2.9406094551086426]

In [323]:
u = 2100
specials.loc[reviews.iloc[u][reviews.iloc[u] > 0].index]

Unnamed: 0,special,text,lem_tokens,length
152,Dave Attell Road Work,[Cheers and applause] you guys ready to meet ...,"[people, little, noise, here, cheer, headliner...",32799


In [324]:
rec_2100, score_2100 = get_top_n_recs(u)
specials.loc[rec_2100]

Unnamed: 0,special,text,lem_tokens,length
264,Bill Hicks Revelations 1993,Recorded at the London Dominion Theatre in Nov...,"[cut, moon, fx, howling, wolf, cut, black, sla...",44231
27,Stewart Lee Carpet Remnant World,(70s GERMAN ROCK MUSIC PLAYING) ANNOUNCER: Lad...,"[stewart, lee, applauding, bit, heavy, metal, ...",83784
274,Tom Segura Completely Normal 2014,"Ladies and gentlemen, Tom Segura! [Cheers and ...","[hope, live, up, expectation, realized, today,...",60252
266,Doug Stanhope Turning Gun 2012,Before Turning the Gun on Himself is the eight...,"[released, november, 6, roadrunner, record, re...",47382
115,Bert Kreischer Secret Time,[applause] [male presenter] Ladies and gentlem...,"[applauding, crowd, louder, oh, yes, crowd, ye...",44408
303,Tom Segura Mostly Stories 2016,[soft piano music plays] [Tom] I love being a ...,"[job, world, love, being, l.a., comic, bam, al...",54043
119,Rowan Atkinson Live 1992,"Filmed in Boston, Massachusetts, at the Huntin...","[rowan, atkinson, performing, series, comedy, ...",29935
169,Stewart Lee Standup Comedian,"Recorded on 10 March 2005 at The Stand, Glasgo...","[evan, parker, stewart, appears, music, change...",59705
112,Daniel Sloss Jigsaw,[audience cheering] [announcer] Ladies and gen...,"[up, show, not, joke, question, id, answer, ho...",50170
204,Patton Oswalt Annihilation 2017,Comedian Patton Oswalt returns to Netflix with...,"[powerful, stand-up, special, dive, last, year...",48151


In [330]:
score_2100

[2.5717837810516357,
 2.5296874046325684,
 2.3819077014923096,
 2.2651827335357666,
 2.153329849243164,
 2.14890193939209,
 2.0979442596435547,
 2.094550132751465,
 2.0814552307128906,
 2.0731136798858643]

In [331]:
ratings_matrix[ratings_matrix.user==2100]

Unnamed: 0,user,special,rating
2759,2100,152,2
