In [20]:
# basics
import argparse
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

# our code
import linear_model
import utils

url_amazon = "https://www.amazon.com/dp/%s"

def load_dataset(filename):
    with open(os.path.join('..','data',filename), 'rb') as f:
        return pickle.load(f)
    
filename = "ratings_Patio_Lawn_and_Garden.csv"

with open(os.path.join("..", "data", filename), "rb") as f:
            ratings = pd.read_csv(f,names=("user","item","rating","timestamp"))
print("Number of ratings:", len(ratings))
print("The average rating:", np.mean(ratings["rating"]))

n = len(set(ratings["user"]))
d = len(set(ratings["item"]))
print("Number of users:", n)
print("Number of items:", d)
print("Fraction nonzero:", len(ratings)/(n*d))

X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind = utils.create_user_item_matrix(ratings)
print(type(X))
print("Dimensions of X:", X.shape)

Number of ratings: 993490
The average rating: 4.006400668350965
Number of users: 714791
Number of items: 105984
Fraction nonzero: 1.3114269915944552e-05
<class 'scipy.sparse.csr.csr_matrix'>
Dimensions of X: (714791, 105984)


In [2]:
user_mapper

{'A0001528BGUBOEVR6T5U': 0,
 'A00066383LNA9T6M0S9ZW': 1,
 'A00070823AX8NX792V4RL': 2,
 'A0009478CBXKUCALUC7U': 3,
 'A001211614UB7M3DVLGSE': 4,
 'A001269818K3KT1NIR3O6': 5,
 'A001573229XK5T8PI0OKA': 6,
 'A00181843LH73EH2Y9CV2': 7,
 'A00188842369XRWY4HKOV': 8,
 'A00217381F34RIAAUFPCH': 9,
 'A002552825SX31CD1Q186': 10,
 'A00259242VSCRZPGIWP0M': 11,
 'A002727434Z0KOOT6LR9Z': 12,
 'A002759211C416DRR5RKE': 13,
 'A002764429D4LF2SJIT18': 14,
 'A0029274J35Q1MYNKUWO': 15,
 'A00311601FOXCO2AIRNIF': 16,
 'A003267413RXJJ2LHCLHA': 17,
 'A00328401T70RFN4P1IT6': 18,
 'A003322213RZ15CXJBET2': 19,
 'A00339243ILJT7ZBM764R': 20,
 'A003658417580D9H97XL6': 21,
 'A0038640S18JE5Y497U6': 22,
 'A00400301PNPWZTJFR6RA': 23,
 'A00406841NIK2FFZHXPC8': 24,
 'A004114635BWU4ERDHCS': 25,
 'A004176027NXFTHNBG3MW': 26,
 'A00430543UQNYMFBF08BJ': 27,
 'A00436183LI7CN7VLFC18': 28,
 'A0044682K778KE5L6NB1': 29,
 'A00464861ZAB9YZAL3KXS': 30,
 'A00473363TJ8YSZ3YAGG9': 31,
 'A00477922HSN48OM8838J': 32,
 'A00495601IFV8DHJ1KD78': 

In [3]:
item_mapper

{'0981850006': 0,
 '144072007X': 1,
 '1554701503': 2,
 '1579822932': 3,
 '1754164498': 4,
 '1880241064': 5,
 '1885010753': 6,
 '1938146824': 7,
 '398501938X': 8,
 '6035000029': 9,
 '6035000037': 10,
 '6035000045': 11,
 '6303146775': 12,
 '6304429150': 13,
 '8386244453': 14,
 '8393166616': 15,
 '8802000026': 16,
 '8802000034': 17,
 '8805001058': 18,
 '8805002577': 19,
 '8805002585': 20,
 '8805002593': 21,
 '8805002607': 22,
 '8805002615': 23,
 '8805002631': 24,
 '880500264X': 25,
 '8805002666': 26,
 '8805002674': 27,
 '8805002682': 28,
 '8805002690': 29,
 '8805002704': 30,
 '8805003255': 31,
 '8805003263': 32,
 '8805003298': 33,
 '8805003549': 34,
 '8805003573': 35,
 '8805003603': 36,
 '9800474382': 37,
 '9822221274': 38,
 'B0000001QY': 39,
 'B000000923': 40,
 'B000001ULX': 41,
 'B000001YZU': 42,
 'B000003GJ2': 43,
 'B000005R78': 44,
 'B000005XFE': 45,
 'B0000071CF': 46,
 'B00000GC69': 47,
 'B00000I2XI': 48,
 'B00000JAJ8': 49,
 'B00000K2MB': 50,
 'B0000223JC': 51,
 'B00002240Y': 52,
 'B

In [4]:
ratings

Unnamed: 0,user,item,rating,timestamp
0,A2VNYWOPJ13AFP,0981850006,5.0,1259798400
1,A20DWVV8HML3AW,0981850006,5.0,1371081600
2,A3RVP3YBYYOPRH,0981850006,5.0,1257984000
3,A28XY55TP3Q90O,0981850006,5.0,1314144000
4,A3VZW1BGUQO0V3,0981850006,5.0,1308268800
5,A2R9T5D7UVQZB0,0981850006,5.0,1253577600
6,A2MH49GAEWEI95,0981850006,5.0,1395532800
7,AR5DPX4ZU3D4Z,144072007X,1.0,1360886400
8,A3OGMCRM3NL8OM,144072007X,5.0,1360108800
9,A7E1GKNNT96QQ,144072007X,5.0,1206057600


In [5]:
user_inverse_mapper

{0: 'A0001528BGUBOEVR6T5U',
 1: 'A00066383LNA9T6M0S9ZW',
 2: 'A00070823AX8NX792V4RL',
 3: 'A0009478CBXKUCALUC7U',
 4: 'A001211614UB7M3DVLGSE',
 5: 'A001269818K3KT1NIR3O6',
 6: 'A001573229XK5T8PI0OKA',
 7: 'A00181843LH73EH2Y9CV2',
 8: 'A00188842369XRWY4HKOV',
 9: 'A00217381F34RIAAUFPCH',
 10: 'A002552825SX31CD1Q186',
 11: 'A00259242VSCRZPGIWP0M',
 12: 'A002727434Z0KOOT6LR9Z',
 13: 'A002759211C416DRR5RKE',
 14: 'A002764429D4LF2SJIT18',
 15: 'A0029274J35Q1MYNKUWO',
 16: 'A00311601FOXCO2AIRNIF',
 17: 'A003267413RXJJ2LHCLHA',
 18: 'A00328401T70RFN4P1IT6',
 19: 'A003322213RZ15CXJBET2',
 20: 'A00339243ILJT7ZBM764R',
 21: 'A003658417580D9H97XL6',
 22: 'A0038640S18JE5Y497U6',
 23: 'A00400301PNPWZTJFR6RA',
 24: 'A00406841NIK2FFZHXPC8',
 25: 'A004114635BWU4ERDHCS',
 26: 'A004176027NXFTHNBG3MW',
 27: 'A00430543UQNYMFBF08BJ',
 28: 'A00436183LI7CN7VLFC18',
 29: 'A0044682K778KE5L6NB1',
 30: 'A00464861ZAB9YZAL3KXS',
 31: 'A00473363TJ8YSZ3YAGG9',
 32: 'A00477922HSN48OM8838J',
 33: 'A00495601IFV8DHJ1KD7

In [6]:
item_inverse_mapper

{0: '0981850006',
 1: '144072007X',
 2: '1554701503',
 3: '1579822932',
 4: '1754164498',
 5: '1880241064',
 6: '1885010753',
 7: '1938146824',
 8: '398501938X',
 9: '6035000029',
 10: '6035000037',
 11: '6035000045',
 12: '6303146775',
 13: '6304429150',
 14: '8386244453',
 15: '8393166616',
 16: '8802000026',
 17: '8802000034',
 18: '8805001058',
 19: '8805002577',
 20: '8805002585',
 21: '8805002593',
 22: '8805002607',
 23: '8805002615',
 24: '8805002631',
 25: '880500264X',
 26: '8805002666',
 27: '8805002674',
 28: '8805002682',
 29: '8805002690',
 30: '8805002704',
 31: '8805003255',
 32: '8805003263',
 33: '8805003298',
 34: '8805003549',
 35: '8805003573',
 36: '8805003603',
 37: '9800474382',
 38: '9822221274',
 39: 'B0000001QY',
 40: 'B000000923',
 41: 'B000001ULX',
 42: 'B000001YZU',
 43: 'B000003GJ2',
 44: 'B000005R78',
 45: 'B000005XFE',
 46: 'B0000071CF',
 47: 'B00000GC69',
 48: 'B00000I2XI',
 49: 'B00000JAJ8',
 50: 'B00000K2MB',
 51: 'B0000223JC',
 52: 'B00002240Y',
 53

In [7]:
X[1,1]

0.0

In [8]:
user_mapper['A2VNYWOPJ13AFP']

355005

In [9]:
item_mapper['0981850006']

0

In [10]:
X[355005,0]

5.0

In [11]:
X.dtype

dtype('float64')

In [12]:
X

<714791x105984 sparse matrix of type '<class 'numpy.float64'>'
	with 993490 stored elements in Compressed Sparse Row format>

In [13]:
X.getA()

AttributeError: getA not found

In [16]:
a

matrix([[35., 11., 10., ..., 30., 25., 25.]])

In [17]:
a.max()

14454.0

In [18]:
a.argmax()

10959

In [19]:
a[0,10959]

14454.0

In [20]:
a[0]

matrix([[35., 11., 10., ..., 30., 25., 25.]])

In [21]:
star_record = np.sum(X, axis = 0) 
print("the item with the most total stars:", star_record.argmax())
print("he total stars of the item:",star_record.max())

the item with the most total stars: 10959
he total stars of the item: 14454.0


In [22]:
item_inverse_mapper[star_record.argmax()]

'B000HCLLMM'

In [14]:
X[X > 0] = 1
user_record = np.sum(X, axis = 1)

In [15]:
user_record = user_record.T

In [16]:
user_record


matrix([[1., 1., 2., ..., 1., 1., 2.]])

In [19]:
user_record[0,1]

1.0

In [None]:
plt.hist(user_record[0],bins = 1)

In [28]:
user_record[0,659]

161.0

In [None]:
plt.hist(user_record, bins=1)

In [None]:
plt.hist(user_record, normed=True, bins=4)

In [82]:
user_record

matrix([[1.],
        [1.],
        [2.],
        ...,
        [1.],
        [1.],
        [2.]])

In [83]:
test = np.random.normal(size = 1000)

In [84]:
test

array([ 7.72040560e-01, -1.81846109e+00,  7.20445086e-01, -5.23353961e-01,
        6.77293321e-01,  1.08620069e-01, -1.75619331e+00, -6.81341427e-02,
        6.77056619e-01,  5.00470584e-01, -2.67606963e-01,  8.73277282e-01,
       -2.07660708e-02, -2.01527476e-01,  3.39969927e-01,  1.27483300e-01,
        9.32330790e-01,  5.60211143e-01, -1.17168130e+00, -1.23682438e+00,
        6.37965561e-02,  1.99972680e+00,  2.79416672e-01, -6.13188765e-01,
        1.18183219e+00,  1.29951889e+00,  7.42265506e-01, -9.40165752e-01,
       -8.63351984e-01,  1.42311969e+00, -2.66115834e-01, -1.09876015e+00,
        1.56505607e+00, -6.19349213e-01,  1.70034244e+00, -3.62839229e-01,
       -5.04377249e-01,  7.39290574e-01, -1.71959222e+00, -1.28362646e+00,
       -1.25492945e+00, -6.66705957e-01, -3.40990034e-01, -5.35182777e-01,
        1.76544209e-01,  1.02226827e-01, -9.94961523e-01, -3.72629812e-02,
       -2.01189038e-01,  3.11187016e-01,  1.71624122e+00, -7.84557091e-01,
       -1.12059650e+00,  

In [86]:
user_mapper['A100WO06OQR8BQ']

659

In [88]:
user_record(659,0)

TypeError: 'matrix' object is not callable

In [89]:
user_record[659,0]

161.0

In [90]:
user_record = user_record.getA()

In [91]:
user_record

array([[1.],
       [1.],
       [2.],
       ...,
       [1.],
       [1.],
       [2.]])

In [92]:
user_record.T

array([[1., 1., 2., ..., 1., 1., 2.]])

In [3]:
np.log([4, 5, 6, 7, 8])

array([1.38629436, 1.60943791, 1.79175947, 1.94591015, 2.07944154])

In [4]:
np.exp([3, 5, 6, 7])/np.exp([5, 6, 7, 8])

array([0.13533528, 0.36787944, 0.36787944, 0.36787944])

In [5]:
bias = np.ones((1,10))

In [12]:
bias

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [17]:
for i in range(1, 10):
    print(i)

1
2
3
4
5
6
7
8
9


In [22]:
data = load_dataset("basisData.pkl")
X = data['X']
y = data['y']
Xtest = data['Xtest']
ytest = data['ytest']


In [24]:
X

array([[-9.94077278e+00],
       [-9.93261613e+00],
       [-9.91105128e+00],
       [-9.45262505e+00],
       [-9.23975484e+00],
       [-9.18385609e+00],
       [-9.10374844e+00],
       [-9.00601393e+00],
       [-8.91278900e+00],
       [-8.86633857e+00],
       [-8.71882158e+00],
       [-8.67018701e+00],
       [-8.64114829e+00],
       [-8.51059333e+00],
       [-8.31429568e+00],
       [-8.27279024e+00],
       [-8.24289698e+00],
       [-8.20452732e+00],
       [-8.16787474e+00],
       [-8.16601023e+00],
       [-8.10799907e+00],
       [-8.07951041e+00],
       [-8.04271154e+00],
       [-7.82847222e+00],
       [-7.80005610e+00],
       [-7.79105386e+00],
       [-7.77848228e+00],
       [-7.77438858e+00],
       [-7.76477059e+00],
       [-7.67418753e+00],
       [-7.54447974e+00],
       [-7.45397389e+00],
       [-7.33936923e+00],
       [-7.22691757e+00],
       [-6.93687486e+00],
       [-6.78946032e+00],
       [-6.50730569e+00],
       [-6.46609111e+00],
       [-6.2

In [26]:
X**np.arange(3)

array([[ 1.00000000e+00, -9.94077278e+00,  9.88189636e+01],
       [ 1.00000000e+00, -9.93261613e+00,  9.86568632e+01],
       [ 1.00000000e+00, -9.91105128e+00,  9.82289374e+01],
       [ 1.00000000e+00, -9.45262505e+00,  8.93521203e+01],
       [ 1.00000000e+00, -9.23975484e+00,  8.53730695e+01],
       [ 1.00000000e+00, -9.18385609e+00,  8.43432128e+01],
       [ 1.00000000e+00, -9.10374844e+00,  8.28782357e+01],
       [ 1.00000000e+00, -9.00601393e+00,  8.11082869e+01],
       [ 1.00000000e+00, -8.91278900e+00,  7.94378078e+01],
       [ 1.00000000e+00, -8.86633857e+00,  7.86119595e+01],
       [ 1.00000000e+00, -8.71882158e+00,  7.60178498e+01],
       [ 1.00000000e+00, -8.67018701e+00,  7.51721427e+01],
       [ 1.00000000e+00, -8.64114829e+00,  7.46694438e+01],
       [ 1.00000000e+00, -8.51059333e+00,  7.24301988e+01],
       [ 1.00000000e+00, -8.31429568e+00,  6.91275127e+01],
       [ 1.00000000e+00, -8.27279024e+00,  6.84390583e+01],
       [ 1.00000000e+00, -8.24289698e+00

In [28]:
index = [1,2,4]
X[index]

array([[-9.93261613],
       [-9.91105128],
       [-9.23975484]])

In [30]:
import scipy

array([32, 73])