In [1]:
from surprise.dataset import Reader
from surprise import SVD
from surprise import Dataset
from surprise import dump

from collections import defaultdict
import pandas as pd

In [2]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# DONT RUN THESE AGAIN 

In [3]:
#this reads the train set
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
filepath = 'train_ratings.csv'
data = Dataset.load_from_file(filepath, reader=reader)
trainset = data.build_full_trainset()

In [4]:
# First train an SVD algorithm on the full dataset.
algo = SVD()
algo.train(trainset)




In [15]:


dump.dump('CF_full_fit.pkl', algo=algo)

# Start Here

In [5]:
#reads in the testset
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
filepath_test = 'collab_test_ratings.csv'
data = Dataset.load_from_file(filepath_test, reader=reader)



In [6]:
#temperary step to build the testset
_ = data.build_full_trainset()

In [7]:
#this is the actual test set
testset = _.build_anti_testset()

In [8]:
__, loaded_algo = dump.load('CF_full_fit.pkl')



In [10]:
# We now ensure that the algo is still the same by checking the predictions.
predictions = loaded_algo.test(testset)

In [11]:
# filters predictions to the top 10
top_n = get_top_n(predictions, n=10)

In [12]:
df2 = pd.DataFrame().from_dict(top_n, orient='index')


In [16]:
df2.to_csv('top10_users.csv')

In [11]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
60,"(82, 4.76144546839)","(1216, 4.75925738403)","(54190, 4.75363635523)","(1172, 4.74032341364)","(7669, 4.72762369289)","(59387, 4.71401893956)","(87308, 4.70888736644)","(4973, 4.67009398763)","(2686, 4.66541648749)","(64034, 4.64303660831)"
198,"(5767, 4.68374754903)","(1080, 4.6186488884)","(66934, 4.61086350955)","(318, 4.60622884494)","(665, 4.59285637343)","(104069, 4.56650356715)","(1202, 4.56557128667)","(858, 4.54589867984)","(1136, 4.51519807534)","(31359, 4.51332941718)"
211,"(2324, 4.63888324971)","(116797, 4.61706171986)","(92259, 4.60663971857)","(27523, 4.55160732108)","(91529, 4.53849521653)","(27815, 4.53539217627)","(73881, 4.52514617194)","(72998, 4.52028458813)","(1797, 4.46864322521)","(88810, 4.46775340424)"
325,"(4993, 4.90320428575)","(7153, 4.86141132778)","(5952, 4.83191611631)","(318, 4.74999105717)","(79132, 4.66511341392)","(527, 4.65065235766)","(3578, 4.64052217904)","(109487, 4.63843238315)","(2028, 4.61248773355)","(2571, 4.58689616013)"
411,"(1, 4.58232073813)","(91233, 4.58051815827)","(527, 4.48260228142)","(3114, 4.47769567084)","(8827, 4.47414513186)","(318, 4.45758930736)","(1148, 4.44653278145)","(4896, 4.44525753549)","(150, 4.44236570367)","(78499, 4.41820764851)"


In [13]:
df2.shape

(1385, 10)

In [17]:
df_read = pd.read_csv('top10_users.csv')

In [18]:
df_read

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,60,"('82', 4.7614454683856051)","('1216', 4.7592573840297758)","('54190', 4.7536363552273153)","('1172', 4.740323413640203)","('7669', 4.7276236928911892)","('59387', 4.7140189395605114)","('87308', 4.7088873664355653)","('4973', 4.6700939876344592)","('2686', 4.6654164874925375)","('64034', 4.6430366083137624)"
1,198,"('5767', 4.6837475490285696)","('1080', 4.6186488884024772)","('66934', 4.6108635095522805)","('318', 4.6062288449357025)","('665', 4.5928563734261001)","('104069', 4.5665035671490193)","('1202', 4.5655712866651328)","('858', 4.5458986798378396)","('1136', 4.5151980753425516)","('31359', 4.5133294171805947)"
2,211,"('2324', 4.6388832497053087)","('116797', 4.6170617198591088)","('92259', 4.6066397185668482)","('27523', 4.5516073210824999)","('91529', 4.5384952165332137)","('27815', 4.5353921762695464)","('73881', 4.5251461719390305)","('72998', 4.5202845881334719)","('1797', 4.4686432252102346)","('88810', 4.4677534042440614)"
3,325,"('4993', 4.9032042857481111)","('7153', 4.8614113277826059)","('5952', 4.8319161163059894)","('318', 4.74999105716687)","('79132', 4.6651134139202854)","('527', 4.6506523576605296)","('3578', 4.6405221790388369)","('109487', 4.6384323831491114)","('2028', 4.6124877335532277)","('2571', 4.5868961601344438)"
4,411,"('1', 4.5823207381265822)","('91233', 4.58051815826673)","('527', 4.4826022814171953)","('3114', 4.4776956708373579)","('8827', 4.4741451318646712)","('318', 4.4575893073595125)","('1148', 4.4465327814477584)","('4896', 4.4452575354865189)","('150', 4.4423657036678215)","('78499', 4.4182076485139552)"
5,527,"('1900', 4.6228528014505725)","('214', 4.6064703745041236)","('7327', 4.5847123374038983)","('326', 4.579732874230273)","('97', 4.5765724241504788)","('5368', 4.5442653661634766)","('1178', 4.5400207078675416)","('6669', 4.5265956810199643)","('7767', 4.5225997827596247)","('541', 4.5205509454222401)"
6,595,"('946', 4.7791213730638189)","('1198', 4.760666852096648)","('1196', 4.6937553918012789)","('955', 4.6656705389017379)","('26903', 4.6537643688328014)","('7215', 4.6447645002170646)","('1270', 4.6395152211918216)","('1291', 4.6343506495897406)","('26133', 4.6207702987678934)","('1281', 4.5834263924803045)"
7,647,"('92535', 4.431162618296228)","('1221', 4.3978661216242863)","('26082', 4.2763023958422011)","('750', 4.2739325672366233)","('2019', 4.2717857266721122)","('86345', 4.2705049483835884)","('1201', 4.2678753804151981)","('6016', 4.2578805931574246)","('1213', 4.2383082448850109)","('1089', 4.2365437870029643)"
8,712,"('27611', 4.1953332617592931)","('4993', 4.1388636585933849)","('1196', 4.1169251991666158)","('7153', 4.1055737402951715)","('5952', 4.0967459895244351)","('318', 4.0891537259560078)","('260', 4.08624036745986)","('8638', 4.0671018109162835)","('87960', 4.0307413627374036)","('2905', 4.0293854103355757)"
9,926,"('1148', 4.9157940092790628)","('720', 4.8875478148762816)","('745', 4.8566202811839139)","('1223', 4.754781544806244)","('2019', 4.7180047239842233)","('50742', 4.7073343453208265)","('2920', 4.6988673714951048)","('3338', 4.6851041375136031)","('668', 4.6806324822206751)","('25750', 4.6768234055997286)"


# Now lets do that 10x

In [4]:
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)

In [5]:
__, loaded_algo = dump.load('CF_full_fit.pkl')

In [6]:


for iteration in range(100):
    #get the data
    print(iteration, ": Loading block")
    filename = 'Batches/' + 'block' + str(iteration) + '.csv'
    data = Dataset.load_from_file(filename, reader=reader)
    
    #temperary step to build the testset
    print(iteration, ": processing testset")
    _ = data.build_full_trainset()
    #this is the actual test set
    testset = _.build_anti_testset()
    
    # We now ensure that the algo is still the same by checking the predictions.
    print(iteration, ": Predictions happening")
    predictions = loaded_algo.test(testset)
    
    # filters predictions to the top 10
    print(iteration, ": Getting top 10")
    top_n = get_top_n(predictions, n=10)
    
    df2 = pd.DataFrame().from_dict(top_n, orient='index')
    
    outputname = 'Batches/' +'output_' + 'block' + str(iteration) + '.csv'
    df2.to_csv(outputname)

    print(iteration, ": Finished!")

0 : Loading block
0 : processing testset
0 : Predictions happening
0 : Getting top 10
0 : Finished!
1 : Loading block
1 : processing testset
1 : Predictions happening
1 : Getting top 10
1 : Finished!
2 : Loading block
2 : processing testset
2 : Predictions happening
2 : Getting top 10
2 : Finished!
3 : Loading block
3 : processing testset
3 : Predictions happening
3 : Getting top 10
3 : Finished!
4 : Loading block
4 : processing testset
4 : Predictions happening
4 : Getting top 10
4 : Finished!
5 : Loading block
5 : processing testset
5 : Predictions happening
5 : Getting top 10
5 : Finished!
6 : Loading block
6 : processing testset
6 : Predictions happening
6 : Getting top 10
6 : Finished!
7 : Loading block
7 : processing testset
7 : Predictions happening
7 : Getting top 10
7 : Finished!
8 : Loading block
8 : processing testset
8 : Predictions happening
8 : Getting top 10
8 : Finished!
9 : Loading block
9 : processing testset
9 : Predictions happening
9 : Getting top 10
9 : Finished!


OSError: [Errno 28] No space left on device

In [9]:
# ERROR AT INDEX 30

In [6]:
# start at 30 now

for iteration in range(91, 100):
    #get the data
    print(iteration, ": Loading block")
    filename = 'Batches/' + 'block' + str(iteration) + '.csv'
    data = Dataset.load_from_file(filename, reader=reader)
    
    #temperary step to build the testset
    print(iteration, ": processing testset")
    _ = data.build_full_trainset()
    #this is the actual test set
    testset = _.build_anti_testset()
    
    # We now ensure that the algo is still the same by checking the predictions.
    print(iteration, ": Predictions happening")
    predictions = loaded_algo.test(testset)
    
    # filters predictions to the top 10
    print(iteration, ": Getting top 10")
    top_n = get_top_n(predictions, n=10)
    
    df2 = pd.DataFrame().from_dict(top_n, orient='index')
    
    outputname = 'Batches/' +'output_' + 'block' + str(iteration) + '.csv'
    df2.to_csv(outputname)

    print(iteration, ": Finished!")

91 : Loading block
91 : processing testset
91 : Predictions happening
91 : Getting top 10
91 : Finished!
92 : Loading block
92 : processing testset
92 : Predictions happening
92 : Getting top 10
92 : Finished!
93 : Loading block
93 : processing testset
93 : Predictions happening
93 : Getting top 10
93 : Finished!
94 : Loading block
94 : processing testset
94 : Predictions happening
94 : Getting top 10
94 : Finished!
95 : Loading block
95 : processing testset
95 : Predictions happening
95 : Getting top 10
95 : Finished!
96 : Loading block
96 : processing testset
96 : Predictions happening
96 : Getting top 10
96 : Finished!
97 : Loading block
97 : processing testset
97 : Predictions happening
97 : Getting top 10
97 : Finished!
98 : Loading block
98 : processing testset
98 : Predictions happening
98 : Getting top 10
98 : Finished!
99 : Loading block
99 : processing testset
99 : Predictions happening
99 : Getting top 10
99 : Finished!


In [7]:
# so block0 is missing somehow

iteration = 0

#get the data
print(iteration, ": Loading block")
filename = 'Batches/' + 'block' + str(iteration) + '.csv'
data = Dataset.load_from_file(filename, reader=reader)

#temperary step to build the testset
print(iteration, ": processing testset")
_ = data.build_full_trainset()
#this is the actual test set
testset = _.build_anti_testset()

# We now ensure that the algo is still the same by checking the predictions.
print(iteration, ": Predictions happening")
predictions = loaded_algo.test(testset)

# filters predictions to the top 10
print(iteration, ": Getting top 10")
top_n = get_top_n(predictions, n=10)

df2 = pd.DataFrame().from_dict(top_n, orient='index')

outputname = 'Batches/' +'output_' + 'block' + str(iteration) + '.csv'
df2.to_csv(outputname)

print(iteration, ": Finished!")


0 : Loading block
0 : processing testset
0 : Predictions happening
0 : Getting top 10
0 : Finished!


In [None]:
# another sidd note, this code ran from 0-90 on an EC2 t2.XLarge instance. it crashed at 91.

In [12]:
for x in range(3, 10):
    print(x)

3
4
5
6
7
8
9
