In [5]:
import pickle
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from scipy.linalg import svd
import statsmodels.api as sm
import sklearn.preprocessing as preprocessing
import pymc3
# Shared encoder used further down (via fit_transform) to turn the string
# business_id / user_id columns into integer codes.
label_encoder = LabelEncoder()
def dump(file, path):
    """Pickle ``file`` to ``path``, creating the parent directory when needed.

    Parameters
    ----------
    file : object
        The picklable payload to store (despite the name, not a file handle).
    path : str or os.PathLike
        Destination file; opened in binary write mode, so an existing file
        is overwritten.
    """
    import os  # local import: the notebook's import cell does not bring in os

    parent = os.path.dirname(path)
    if parent:
        # Without this, writing into a not-yet-created output folder fails.
        os.makedirs(parent, exist_ok=True)
    # The context manager closes (and flushes) the handle even if pickling raises;
    # the previous bare open() left that to garbage collection.
    with open(path, "wb") as fh:
        pickle.dump(file, fh)

In [2]:
def logistic(x):
    """Inverse logit (sigmoid): ``1 / (1 + exp(-x))``.

    Algebraically equal to ``exp(x) / (exp(x) + 1)``, but large positive
    inputs now saturate at 1.0 instead of evaluating ``inf / inf`` = NaN.
    Only ``np.exp`` and arithmetic operators are applied to ``x``, as before.
    """
    # Plain sequences have no unary minus, so convert them first; every other
    # input is negated with its own operator.
    neg_x = -np.asarray(x) if isinstance(x, (list, tuple)) else -x
    return 1 / (1 + np.exp(neg_x))
def beta_reparameterize(pi, phi_beta):
    """Convert a (mean, dispersion) pair into the two Beta shape parameters.

    Returns ``(pi / phi_beta, (1 - pi) / phi_beta)``: the resulting Beta
    distribution has mean ``pi`` and its two shapes sum to ``1 / phi_beta``.
    """
    shape_a = pi / phi_beta
    shape_b = (1 - pi) / phi_beta
    return shape_a, shape_b

def logit(x):
    """Log-odds of ``x``, i.e. ``log(x / (1 - x))``."""
    odds = x / (1 - x)
    return np.log(odds)


def fit_best_gamma_phi(p, X, theta_hat, n_sample=2000, n_tune=200, chains=1,
                       random_seed=None):
    """Hindsight fit of ``gamma`` and ``phi`` by posterior mean (debugging aid).

    Treats ``theta_hat`` as observed without error and samples from

        gamma        ~ MvNormal(0, I_p)
        phi          ~ Beta(1, 1)
        theta_hat[i] ~ Beta(m_i / phi, (1 - m_i) / phi),  m_i = logistic(X[i] . gamma)

    Parameters
    ----------
    p : int
        Length of ``gamma``; has to match the number of columns of ``X`` for
        the dot product to be defined.
    X : array-like
        Feature matrix, one row per observed entry of ``theta_hat``
        (presumably 2-D with ``p`` columns -- confirm against the caller).
    theta_hat : array-like
        Observed values for the Beta likelihood. The Beta density only has
        support on the open interval (0, 1).
    n_sample, n_tune, chains : int, optional
        Sampler settings; the defaults are the values that used to be
        hard-coded in the body.
    random_seed : optional
        Forwarded to ``pm.sample``. ``None`` (default) keeps the previous
        unseeded behaviour; pass a value for reproducible draws.

    Returns
    -------
    dict
        ``{'gamma': ..., 'phi': ...}`` -- each the mean of the corresponding
        trace over axis 0.
    """
    import pymc3 as pm

    # The observed shape comes from the data itself. It used to be read from the
    # notebook-level global `L`, which is only defined in later cells.
    theta_obs = np.asarray(theta_hat)

    with pm.Model() as Cascading_Normal_Beta:
        gamma_temp = pm.MvNormal('gamma', mu=np.zeros(p), cov=np.identity(p), shape=p)
        phi = pm.Beta('phi', alpha=1, beta=1, shape=1)
        alpha_temp = pm.math.dot(X, gamma_temp)
        mean_theta = logistic(alpha_temp)
        alpha_Beta, beta_Beta = beta_reparameterize(mean_theta, phi)
        pm.Beta('theta', alpha=alpha_Beta, beta=beta_Beta,
                shape=theta_obs.shape, observed=theta_obs)
        trace = pm.sample(n_sample, tune=n_tune, chains=chains,
                          cores=1, progressbar=True, init='adapt_diag',
                          target_accept=0.95, trace=None,
                          random_seed=random_seed)
    return {'gamma': np.mean(trace["gamma"], 0), 'phi': np.mean(trace["phi"], 0)}

In [None]:
# Load the raw Yelp tables.
review_origin = pd.read_csv("yelp_review.csv")
business_origin = pd.read_csv("yelp_business.csv")

# Ids of businesses whose category string mentions 'Restaurants'.
# `na=False` treats a missing category as "not a restaurant"; the earlier
# `'Restaurants' in x` lambda raised TypeError on NaN entries.
is_restaurant = business_origin.categories.str.contains('Restaurants', na=False, regex=False)
restaurants = business_origin[is_restaurant].business_id

# Reviews of restaurants only, restricted to the columns used below.
restaurant_reviews = review_origin[review_origin.business_id.isin(restaurants)][['user_id', 'business_id', 'stars',]]

# Top 3k most reviewed restaurants.
L=3000
selected_business = list(restaurant_reviews.business_id.value_counts().index[:L])
# Top 20k most active reviewers.
U=20000
selected_user = list(restaurant_reviews.user_id.value_counts().index[:U])

# Final dataset: reviews written by a selected user about a selected business.
# The explicit copy makes the column assignments below act on an independent
# frame instead of a slice (avoids pandas' SettingWithCopyWarning).
reviews = restaurant_reviews[
    restaurant_reviews.user_id.isin(selected_user)
    & restaurant_reviews.business_id.isin(selected_business)
].copy()

# Convert user_id and business_id into integer codes.
reviews['business'] = label_encoder.fit_transform(reviews['business_id'])
reviews['user'] = label_encoder.fit_transform(reviews['user_id'])

In [3]:
# Feedback matrix: entry (user, business) is 1 if that user reviewed that business.
num_users = U
num_business = L
W_feedback = np.zeros((num_users, num_business))
# One vectorised fancy-index assignment replaces the row-by-row .iloc loop;
# duplicate (user, business) pairs simply set the same cell to 1 again.
W_feedback[reviews['user'].values, reviews['business'].values] = 1

# Randomly permute the users. A dedicated, seeded generator makes the
# train/test split identical on every run and independent of the global
# `random` state.
SPLIT_SEED = 0
users = list(range(num_users))
random.Random(SPLIT_SEED).shuffle(users)
W_feedback = W_feedback[users,:]

# Divide into train set and test set (half-half).
W_train = W_feedback[:num_users//2,:]  # W_train to learn the features
W_test = W_feedback[num_users//2:,:]   # W_test to be used in the experiment

In [5]:
# Column means of W_test and their log-odds.
theta_hat = np.mean(W_test,axis = 0)
y = logit(theta_hat)
out = {'theta_hat': theta_hat, 'y':y}

# SVD of the training half. The left factor gets its own name: unpacking into
# `U` overwrote the integer user count `U` and broke re-running the
# matrix-building cell. The dictionary key stays 'U'.
# NOTE(review): the later cells only use `s` and `VT`; svd(..., full_matrices=False)
# would give a much smaller left factor -- confirm nothing needs the square one.
U_svd, s, VT = svd(W_train)
out1={'U': U_svd, 's':s,'VT':VT}

In [6]:
# Persist the intermediate results as pickles (the ".txt" names are kept as they were).
artefacts = {
    "W_test_relate.txt": out,
    "W_test.txt": W_test,
    "W_train.txt": W_train,
    "USV.txt": out1,
    # The reload cell opens "s.txt" and "VT.txt", which no cell used to write.
    "s.txt": out1['s'],
    "VT.txt": out1['VT'],
}
for filename, payload in artefacts.items():
    with open(filename, "wb") as fp:
        pickle.dump(payload, fp)

In [3]:
# Reload previously pickled results from disk.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open("W_test_relate.txt", "rb") as fp:
    out = pickle.load(fp)
theta_hat, y = out['theta_hat'], out['y']

with open("VT.txt", "rb") as fp:
    VT = pickle.load(fp)
with open("s.txt", "rb") as fp:
    s = pickle.load(fp)

# Dataset dimensions (same values as the U and L used when building the matrix).
L = 3000
User = 20000

In [None]:
from sklearn.preprocessing import StandardScaler

# Grid over preprocessing / intercept / feature dimension; currently a single
# configuration in each list.
for X_transform in ['standardize']:
    for with_intercept in [1]:
        for d in [10]:
            # Standardised features are only fitted together with an intercept.
            standardizes = X_transform in ['standardize', 'l2_then_standardize']
            if standardizes and with_intercept == 0:
                continue
            print('d = {}, X_transform = {}, with_intercept = {}'.format(d, X_transform, with_intercept))

            # First d columns of VT.T scaled by the first d entries of s.
            business_features = np.matmul(VT.T[:,:d],np.diag(s[:d]))
            X = business_features

            # Every transform except plain 'standardize' begins with L2 normalisation.
            if X_transform != 'standardize':
                X = preprocessing.normalize(X, norm='l2')
            if standardizes:
                scaler = StandardScaler()
                scaler.fit(X)
                X = scaler.transform(X)

            if with_intercept:
                X = sm.add_constant(X)

            # Posterior-mean fit, then predictions on the logit and the probability scale.
            results = fit_best_gamma_phi(X.shape[1], X, theta_hat)
            gamma, best_phi = results['gamma'], results['phi'][0]
            y_mean = X.dot(gamma)
            theta_mean = logistic(y_mean)

            print("fitting R2 = {:.2f}".format(r2_score(y, y_mean)))
            print("fitting R2 for theta = {:.2f}".format(r2_score(theta_hat, theta_mean)))

            print('max_para = {:.2f}'.format(max(abs(gamma))))
            print('best_phi = {:.2f}'.format(best_phi))

            # Bundle everything that gets written to disk for this configuration.
            out = dict(
                W_test_mean=theta_hat,
                movie_features=X,
                true_gamma_wrt_test=gamma,
                true_phi_wrt_test=best_phi,
            )

            fp = 'Cascading_real_dataset/Cascading_realdata_d_{}_X_transform_{}_with_intercept_{}'.format(d, X_transform, with_intercept)
            dump(out, fp)
            print("\n")
        print("*" * 100)

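## Data used for the real-data experiment
- `W_test.txt`: the pickled `W_test` matrix (the held-out half of the users)
- `Cascading_real_dataset/Cascading_realdata_d_10_X_transform_standardize_with_intercept_1`: the pickled dictionary written by the fitting loop above (features, fitted `gamma` and `phi`, and the per-business means of `W_test`)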