In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from collections import Counter, defaultdict
import matplotlib.pyplot as plt

In [3]:
users = pd.read_csv("data/users.csv")
users = users.drop('yelping_since', axis = 1)
restaurants = pd.read_csv("data/restaurants.csv")
reviews = pd.read_csv("data/reviews.csv")
tips = pd.read_csv("data/tips.csv")

In [4]:
reviews_sub = reviews[(reviews['business_id'].isin(restaurants['business_id'])) & (reviews['user_id'].isin(users['user_id']))].copy()
user_encoder = LabelEncoder().fit(users['user_id'])
restaurant_encoder = LabelEncoder().fit(restaurants['business_id'])
reviews_sub['user_id'] = user_encoder.transform(reviews_sub['user_id'])
reviews_sub['business_id'] = restaurant_encoder.transform(reviews_sub['business_id'])

### Prepare data

In [5]:
users_train, user_test = train_test_split(users, test_size = 0.2, random_state = 48)
user_train, user_val = train_test_split(users_train, test_size = 0.3, random_state = 48)

In [6]:
print("Number of users in the Training:  ", users_train.shape[0])
print("Number of users in the Validation:  ", user_val.shape[0])
print("Number of users in the Testing:  ", user_test.shape[0])

Number of users in the Training:   48680
Number of users in the Validation:   14604
Number of users in the Testing:   12171


In [7]:
reviews_train = reviews_sub[reviews_sub['user_id'].isin(user_encoder.transform(users_train['user_id']))].copy()
reviews_val = reviews_sub[reviews_sub['user_id'].isin(user_encoder.transform(user_val['user_id']))].copy()
reviews_test = reviews_sub[reviews_sub['user_id'].isin(user_encoder.transform(user_test['user_id']))].copy()

In [8]:
# rating_matrix =  csr_matrix((reviews_sub['stars'], (reviews_sub['user_id'], reviews_sub['business_id'])), shape = (users.shape[0], restaurants.shape[0]))

### With Raw Score

In [9]:
import torch
from torch import optim
from torch.autograd import Variable

In [10]:
num_items, num_users = reviews_train['business_id'].nunique(), reviews_train['user_id'].nunique()
print("Num of items:  {}".format(num_items))
print("Num of users:  {}".format(num_users))

Num of items:  12665
Num of users:  48680


In [14]:
train_user_encoder = LabelEncoder().fit(reviews_train['user_id'])
reviews_train['user_id'] = train_user_encoder.transform(reviews_train['user_id'])
train_restaurant_encoder = LabelEncoder().fit(reviews_train['business_id'])
reviews_train['business_id'] = train_restaurant_encoder.transform(reviews_train['business_id'])
rating_matrix = csr_matrix((reviews_train['stars'], (reviews_train['user_id'], reviews_train['business_id'])), shape = (num_users, num_items))
# rating_matrix_mask = np.where(rating_matrix == 0, 0, 1)

In [15]:
rating_matrix_coo = rating_matrix.tocoo()
rating_matrix_tensor = torch.sparse.LongTensor(torch.LongTensor([rating_matrix_coo.row.tolist(), rating_matrix_coo.col.tolist()]),
                                               torch.LongTensor(rating_matrix_coo.data.astype(np.int32)))
rating_matrix = rating_matrix_tensor.to_dense().float()
rating_matrix_mask = (rating_matrix != 0)

In [None]:
hidden_neuron = 1000

V = Variable(torch.randn(hidden_neuron, num_users), requires_grad = True)
W = Variable(torch.randn(num_users, hidden_neuron), requires_grad = True)
mu = Variable(torch.randn(hidden_neuron), requires_grad = True)
b = Variable(torch.randn(num_users), requires_grad = True)


learning_rate = 0.005
optimizer = torch.optim.Adam([V, W, mu, b], lr=learning_rate)
for t in range(300):
    optimizer.zero_grad()
    layer_1 = torch.nn.Dropout(p=0.2)(torch.sigmoid(mu.view(hidden_neuron, 1) + V.matmul(rating_matrix)))
    layer_2 = W.matmul(layer_1) + b.view(num_users, 1)
    loss = torch.mean(((rating_matrix - layer_2) * rating_matrix_mask)**2) + 20 * (torch.norm(V) + torch.norm(W))
    print(t, loss.data.item())
    
    loss.backward()
    optimizer.step()

    

0 277702.65625
1 276595.0


In [19]:
layer_2.shape

torch.Size([48680, 12665])

In [25]:
rating_matrix_tensor

tensor(indices=tensor([[    0,     0,     0,  ..., 48679, 48679, 48679],
                       [  781,  1958,  3909,  ..., 10766, 12361, 12435]]),
       values=tensor([4, 4, 4,  ..., 3, 5, 5]),
       size=(48680, 12665), nnz=1264406, layout=torch.sparse_coo)

In [15]:
hidden_neuron = 1000
V = tf.Variable(tf.random.normal([hidden_neuron, num_users], stddev=0.01))
W = tf.Variable(tf.random.normal([num_users, hidden_neuron], stddev=0.01))
mu = tf.Variable(tf.random.normal([hidden_neuron], stddev=0.01))
b = tf.Variable(tf.random.normal([num_users], stddev=0.01))
layer_1 = tf.nn.dropout(tf.sigmoid(tf.expand_dims(mu, 1) + tf.matmul(V, rating_matrix)), 0.95)
layer_2 = tf.matmul(W, layer_1) + tf.expand_dims(b, 1)
loss = tf.reduce_mean(tf.square(tf.norm(tf.multiply((rating_matrix - layer_2), rating_matrix_mask)))) + \
       20 * (tf.square(tf.norm(W)) + tf.square(tf.norm(V)))
optimizer = tf.train.AdamOptimizer(learning_rate=0.005).minimize(loss)

ResourceExhaustedError: OOM when allocating tensor with shape[48680,12665] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Sub]

In [None]:
num_iter = 100
batch

In [None]:
self.rating_matrix = tf.placeholder(dtype=tf.float32, shape=[self.num_user, None])
        self.rating_matrix_mask = tf.placeholder(dtype=tf.float32, shape=[self.num_user, None])
        self.keep_rate_net = tf.placeholder(tf.float32)
        self.keep_rate_input = tf.placeholder(tf.float32)

        V = tf.Variable(tf.random_normal([hidden_neuron, self.num_user], stddev=0.01))
        W = tf.Variable(tf.random_normal([self.num_user, hidden_neuron], stddev=0.01))

        mu = tf.Variable(tf.random_normal([hidden_neuron], stddev=0.01))
        b = tf.Variable(tf.random_normal([self.num_user], stddev=0.01))
        layer_1 = tf.nn.dropout(tf.sigmoid(tf.expand_dims(mu, 1) + tf.matmul(V, self.rating_matrix)),
                                self.keep_rate_net)
        self.layer_2 = tf.matmul(W, layer_1) + tf.expand_dims(b, 1)
        
        self.loss = tf.reduce_mean(tf.square(
            tf.norm(tf.multiply((self.rating_matrix - self.layer_2), self.rating_matrix_mask)))) + self.reg_rate * (
        tf.square(tf.norm(W)) + tf.square(tf.norm(V)))
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

    def train(self, train_data):
        self.num_training = self.num_item
        total_batch = int(self.num_training / self.batch_size)
        idxs = np.random.permutation(self.num_training)  # shuffled ordering

        for i in range(total_batch):
            start_time = time.time()
            if i == total_batch - 1:
                batch_set_idx = idxs[i * self.batch_size:]
            elif i < total_batch - 1:
                batch_set_idx = idxs[i * self.batch_size: (i + 1) * self.batch_size]

            _, loss = self.sess.run([self.optimizer, self.loss],
                                    feed_dict={self.rating_matrix: self.train_data[:, batch_set_idx],
                                               self.rating_matrix_mask: self.train_data_mask[:, batch_set_idx],
                                               self.keep_rate_net: 0.95
                                               })
            if i % self.display_step == 0:
                if self.verbose:
                    print("Index: %04d; cost= %.9f" % (i + 1, np.mean(loss)))
                    print("one iteration: %s seconds." % (time.time() - start_time))

### W-Auto

In [9]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [60]:
w_users_train = users_train.copy()

user_scaler = StandardScaler()
compliment_columns = [c for c in users_train.columns if 'compliment_' in c]
receive_columns = ['useful', 'funny', 'cool']
other_columns = ['review_count', 'fans']
std_columns = compliment_columns + receive_columns + other_columns
w_users_train[std_columns] = user_scaler.fit_transform(w_users_train[std_columns])

In [185]:
compliment_pca = PCA(n_components = 5)
w_users_train['pca_compliment'] = compliment_pca.fit_transform(w_users_train[compliment_columns])[:,0]
w_users_train = w_users_train.drop(compliment_columns, axis = 1)
print("Compliment PCA:  ", compliment_pca.explained_variance_ratio_)

receive_pca = PCA(n_components = 2)
w_users_train['pca_receive'] = receive_pca.fit_transform(w_users_train[receive_columns])[:,0]
print("Receive PCA:  ",receive_pca.explained_variance_ratio_)
w_users_train = w_users_train.drop(receive_columns, axis = 1)

Compliment PCA:   [0.84199696 0.06877286 0.0367535  0.02212756 0.01406016]
Receive PCA:   [0.98199789 0.01542783]


In [186]:
w_users_train.head()

Unnamed: 0,user_id,review_count,elite,fans,average_stars,pca_compliment,pca_receive
39340,dqR9UnjbkeP9ruhh4FcxgA,-0.447159,0,-0.159289,4.37,-0.230614,-0.233754
25321,cbf0Nl5p-rzvpCPu0594gA,0.680536,1,0.052579,3.79,-0.111462,-0.06675
49813,yJRx7eLfrErj8jOSK-FeFQ,-0.302673,1,-0.135748,4.09,-0.194875,-0.204825
25148,N4GCsUsuUmwDa6-QyM5VdA,-0.426015,0,-0.159289,2.57,-0.231187,-0.233679
22779,uP4Bl-0Al2NSQ-tyKtaHKA,0.151929,0,-0.123978,2.04,-0.18276,-0.094232


In [205]:
kmeans = KMeans(n_clusters=15)
kmeans.fit(w_users_train.iloc[:, 1:])
w_users_train['kmeans_labels'] = kmeans.labels_
Counter(w_users_train['kmeans_labels'])

Counter({1: 17424,
         8: 4190,
         0: 13002,
         3: 11888,
         10: 1346,
         7: 540,
         14: 45,
         4: 188,
         6: 18,
         2: 6,
         13: 28,
         12: 2,
         11: 1,
         9: 1,
         5: 1})

### Clustering on AutoEncoder

In [42]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [45]:
enc_input = Input(shape = (18, ))
enc_1 = Dense(12, activation = 'relu')(enc_input)
enc_2 = Dense(8, activation = 'relu')(enc_1)
enc_3 = Dense(3, activation = 'relu')(enc_1)
dec_1 = Dense(8, activation = 'relu')(enc_3)
dec_output = Dense(18)(dec_1)
autoencoder = Model(enc_input, dec_output)
encoder = Model(enc_input, enc_3)

dec_input = Input(shape=enc_3.shape)
dec_out = autoencoder.layers[-1](autoencoder.layers[-2](dec_input))
decoder = Model(dec_input, dec_out)


In [61]:
w_users_train = users_train.copy()

user_scaler = StandardScaler()
compliment_columns = [c for c in users_train.columns if 'compliment_' in c]
receive_columns = ['useful', 'funny', 'cool']
other_columns = ['review_count', 'fans']
std_columns = compliment_columns + receive_columns + other_columns
w_users_train[std_columns] = user_scaler.fit_transform(w_users_train[std_columns])

In [62]:
w_users_val = user_val.copy()
w_users_val[std_columns] = user_scaler.fit_transform(w_users_val[std_columns])

In [63]:
adam = Adam(learning_rate = 1e-3)
autoencoder.compile(optimizer=adam, loss='mse')
autoencoder.fit(w_users_train.iloc[:, 1:], w_users_train.iloc[:, 1:], batch_size = 64, epochs = 100, 
                validation_data=(w_users_val.iloc[:, 1:], w_users_val.iloc[:, 1:]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f382c5d34f0>

In [67]:
encoded_space = encoder.predict(w_users_train.iloc[:, 1:])

In [68]:
encoded_space

array([[7.1242633 , 0.11488634, 0.05356139],
       [7.0498    , 1.0099747 , 1.2676926 ],
       [7.383601  , 0.72128135, 0.41228116],
       ...,
       [6.2561717 , 0.86541784, 0.56980085],
       [6.185733  , 0.8329054 , 0.48785013],
       [5.9105144 , 0.15549776, 0.05326289]], dtype=float32)

In [77]:
kmeans = KMeans(n_clusters = 15)
kmeans.fit(encoded_space)
Counter(kmeans.labels_)

Counter({0: 17642,
         6: 8249,
         9: 19452,
         14: 2530,
         7: 535,
         2: 167,
         8: 14,
         11: 10,
         10: 70,
         3: 2,
         12: 1,
         13: 3,
         1: 3,
         4: 1,
         5: 1})

In [78]:
"What is a good example of a question answering dataset?"[147:161]

''