## Yelp Challenge

Dataset Documentation: <br>
https://www.yelp.com/dataset/documentation/main

In [1]:
import pandas as pd
import tarfile
from tqdm import tqdm
import json
import requests
import xlearn as xl
import networkx as nx
pd.options.mode.chained_assignment = None
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

In [2]:
#line_count = len(open("data/review.json").readlines())
line_count = len(open("data/business.json").readlines())
business_ids, cities, states, latitudes, longitudes, stars, review_counts = [], [], [], [], [], [], []
with open("data/business.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        cities += [blob["city"]]
        states += [blob["state"]]
        latitudes += [blob["latitude"]]
        longitudes += [blob["longitude"]]
        stars += [blob["stars"]]
        review_counts += [blob["review_count"]]
        
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "stars": stars, "review_counts": review_counts }
)

100%|██████████| 192609/192609 [00:03<00:00, 57293.75it/s]


In [3]:
#line_count = len(open("data/review.json").readlines())
line_count = len(open("data/user.json").readlines())
users, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("data/user.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        users += [blob["user_id"]]
        review_counts += [blob["review_count"]]
        elites += [blob["elite"]]
        average_stars += [blob["average_stars"]]
        friends += [blob["friends"]]
        
users = pd.DataFrame(
    {"user_id": users, "review_count": review_counts,"elite": elites, "average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:24<00:00, 66326.41it/s]


In [4]:
ratings_holdout = pd.read_csv('data/ratings_sample_holdout.csv')

In [5]:
ratings_train = pd.read_csv('data/ratings_sample_train.csv')

In [6]:
ratings_val = pd.read_csv('data/ratings_sample_cv.csv')

In [7]:
def process(df):
    df = df.drop(df.columns[0], axis =1)
    df['date']  = pd.to_datetime(df['date'])
    df['week_day'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    df = df.merge(users, on = 'user_id')
    df = df.merge(businesses, on = 'business_id')
    return df

In [8]:
ratings_train = process(ratings_train.copy())

In [9]:
ratings_holdout = process(ratings_holdout.copy())

In [10]:
ratings_val = process(ratings_val.copy())

In [11]:
print('There are {0} rows, {1} columns in training set.'.format(ratings_train.shape[0], ratings_train.shape[1]))
print('There are {0} rows, {1} columns in holdout set.'.format(ratings_holdout.shape[0], ratings_holdout.shape[1]))
print('There are {0} rows, {1} columns in validation set.'.format(ratings_val.shape[0], ratings_val.shape[1]))

There are 406042 rows, 18 columns in training set.
There are 28612 rows, 18 columns in holdout set.
There are 28615 rows, 18 columns in validation set.


In [12]:
import math
def _convert_to_ffm(path, df, type, target, numerics, categories, features, encoder):
    # Flagging categorical and numerical fields
    print('convert_to_ffm - START')
    for x in numerics:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: numeric field - {x}')
            encoder['catdict'][x] = 0
    for x in categories:
        if(x not in encoder['catdict']):
            print(f'UPDATING CATDICT: categorical field - {x}')
            encoder['catdict'][x] = 1

    nrows = df.shape[0]
    with open(path + str(type) + "_ffm.txt", "w") as text_file:

        # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow[target]))  # Set Target Variable here

            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(encoder['catdict'].keys()):
                if(encoder['catdict'][x] == 0):
                    # Not adding numerical values that are nan
                    if math.isnan(datarow[x]) is not True:
                        datastring = datastring + " "+str(i)+":" + str(i)+":" + str(datarow[x])
                else:

                    # For a new field appearing in a training example
                    if(x not in encoder['catcodes']):
                        print(f'UPDATING CATCODES: categorical field - {x}')
                        encoder['catcodes'][x] = {}
                        encoder['currentcode'] += 1
                        print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    # For already encoded fields
                    elif(datarow[x] not in encoder['catcodes'][x]):
                        encoder['currentcode'] += 1
                        print(f'UPDATING CATCODES: categorical value for field {x} - {datarow[x]}')
                        encoder['catcodes'][x][datarow[x]] = encoder['currentcode']  # encoding the feature

                    code = encoder['catcodes'][x][datarow[x]]
                    datastring = datastring + " "+str(i)+":" + str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)

    # print('Encoder Summary:')
    # print(json.dumps(encoder, indent=4))
    return encoder

### Create user text embeddings based on training data only

In [13]:
rating_texts = ratings_train.groupby(['user_id'])['text'].apply(lambda x: ','.join(x)).reset_index()

In [14]:
rating_texts['text'] = rating_texts['text'].str.lower()

In [15]:
rating_texts['text'] = rating_texts['text'].str.replace(r"[^a-zA-Z ]+", " ").str.strip()

In [16]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words = 'english', strip_accents = 'ascii')
vectorizer.fit(rating_texts['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents='ascii',
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [17]:
vector = vectorizer.transform(rating_texts['text'])

In [18]:
tsv = TruncatedSVD(n_components=50)

In [19]:
tsv.fit(vector)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
             random_state=None, tol=0.0)

In [20]:
transformed_tsv = tsv.transform(vector)

In [21]:
'''
wcss=[]
for i in range(77, 100):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(transformed_tsv)
    wcss.append(kmeans.inertia_)
plt.plot(range(2, 100), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
'''

"\nwcss=[]\nfor i in range(77, 100):\n    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)\n    kmeans.fit(transformed_tsv)\n    wcss.append(kmeans.inertia_)\nplt.plot(range(2, 100), wcss)\nplt.title('Elbow Method')\nplt.xlabel('Number of clusters')\nplt.ylabel('WCSS')\nplt.show()\n"

In [22]:
kmeans = KMeans(n_clusters=110, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(transformed_tsv)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=110, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [23]:
text_cluster = kmeans.predict(transformed_tsv)

In [24]:
rating_texts.loc[:,'text_cluster'] = text_cluster

In [25]:
rating_texts_features = rating_texts[['user_id','text_cluster']]

In [26]:
ratings_train= ratings_train.merge(rating_texts_features[['user_id','text_cluster']], on ='user_id', how = 'left')

In [27]:
ratings_holdout = ratings_holdout.merge(rating_texts_features[['user_id','text_cluster']], on ='user_id', how = 'left')

In [28]:
ratings_val = ratings_val.merge(rating_texts_features[['user_id','text_cluster']], on ='user_id', how = 'left')

### Create Graph Features

In [29]:
train_users = ratings_train[['user_id','friends']].drop_duplicates()

In [30]:
train_users_dict = train_users.set_index('user_id').T.to_dict('list')

In [31]:
g = nx.Graph()
g.add_nodes_from(train_users_dict.keys())

In [32]:
for k, v in train_users_dict.items():
    for i in v[0].split(','):
        g.add_edge(k,i.strip())    

In [33]:
sub_graphs = nx.connected_component_subgraphs(g)

In [34]:
sgs =[]
for i, sg in enumerate(sub_graphs):
    sgs += [sg]

In [35]:
fin_sgs = []
for i in sgs:
    if len(i.nodes()) >=5:
        fin_sgs +=[i]

In [36]:
graph_user_ids, graph_ids = [],[]
num_id = 0
for graph in fin_sgs:
    for node in graph.nodes():
        graph_user_ids += [node]
        graph_ids += [num_id]
    num_id += 1

In [37]:
social_graphs = pd.DataFrame(
    {"user_id": graph_user_ids, "graph_cluster": graph_ids}
)

In [38]:
ratings_train= ratings_train.merge(social_graphs, on ='user_id', how = 'left')

In [39]:
ratings_holdout = ratings_holdout.merge(social_graphs, on ='user_id', how = 'left')

In [40]:
ratings_val = ratings_val.merge(social_graphs, on ='user_id', how = 'left')

### Create Location Features

In [41]:
X = ratings_train[['longitude','latitude']]

In [42]:
''' 
EXPLORATORY ANALYSIS TO DETERMINE NUMBER OF CLUSTERS. DON'T RUN
HERE USING THE EBLOW METHOD WE CHOOSE NCLUSTERs OF 10 

X = ratings_train[['longitude','latitude']]
wcss = []
for i in range(2, 100):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(2, 50), wcss[0:48])
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
'''

" \nEXPLORATORY ANALYSIS TO DETERMINE NUMBER OF CLUSTERS. DON'T RUN\nHERE USING THE EBLOW METHOD WE CHOOSE NCLUSTERs OF 10 \n\nX = ratings_train[['longitude','latitude']]\nwcss = []\nfor i in range(2, 100):\n    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)\n    kmeans.fit(X)\n    wcss.append(kmeans.inertia_)\nplt.plot(range(2, 50), wcss[0:48])\nplt.title('Elbow Method')\nplt.xlabel('Number of clusters')\nplt.ylabel('WCSS')\nplt.show()\n"

In [43]:
kmeans = KMeans(n_clusters=10, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [44]:
ratings_train.loc[:,'location_cluster'] = kmeans.predict(ratings_train[['longitude','latitude']])
ratings_holdout.loc[:,'location_cluster'] = kmeans.predict(ratings_holdout[['longitude','latitude']])
ratings_val.loc[:,'location_cluster'] = kmeans.predict(ratings_val[['longitude','latitude']])

In [45]:
ratings_train.text_cluster.fillna('999', inplace=True)
ratings_holdout.text_cluster.fillna('999', inplace=True)
ratings_val.text_cluster.fillna('999', inplace=True)

In [46]:
ratings_train.location_cluster.fillna('999', inplace=True)
ratings_holdout.location_cluster.fillna('999', inplace=True)
ratings_val.location_cluster.fillna('999', inplace=True)

In [47]:
ratings_train.graph_cluster.fillna('999', inplace=True)
ratings_holdout.graph_cluster.fillna('999', inplace=True)
ratings_val.graph_cluster.fillna('999', inplace=True)

### FFM CLASS

In [48]:
class FFMModel:

    def __init__(self, train, val,test, config, suffix = None):
        self.train_df = train
        self.val_df = val
        self.test_df = test
        self.model = xl.create_ffm()
        self.suffix = suffix
        self.config = config
        self.preds = None 
          
    def __configure(self):
        destination = self.config['destination']
        label = self.config['label']
        numerical_columns  = self.config['numerical_columns']
        categorical_columns  = self.config['categorical_columns']
        all_columns  = numerical_columns + categorical_columns
              
        encoder = {"currentcode": len(self.config['numerical_columns']),
           "catdict": {},
           "catcodes": {}}
        
        encoder = _convert_to_ffm(destination, self.train_df , 'train', label, numerical_columns, categorical_columns, all_columns, encoder)
        encoder = _convert_to_ffm(destination, self.val_df , 'test', label, numerical_columns, categorical_columns, all_columns, encoder)
        
        return encoder
        
    def train(self, params = None):
        encoder = self.__configure()
        self.model.setTrain(self.config['destination']+'train_ffm.txt')
        self.model.setValidate(self.config['destination']+'test_ffm.txt')
        self.model.setTest("data/test_ffm.txt")
        
        if not params:
            params = {'task': 'reg',
                     'lr': 0.2,
                     'lambda': 0.002,
                     'metric': 'auc'}
        
        self.model.fit(params, self.config['model_destination']+self.config['model_name']+'.out')

    def evaluate_on_val(self):
        self.model.predict(self.config['model_destination'] + self.config['model_name']+'.out', self.config['output_destination']+'predictions.txt')
        preds = pd.read_csv(self.config['output_destination']+'predictions.txt', sep=" ", header=None)
        self.preds = preds
        return preds            
    
    def get_rmse(self):
        if self.preds is None:
            self.evaluate_on_val()
        
        predictions = self.evaluate_on_val()
        test_df = ratings_holdout.copy()
        test_df['preds']  = predictions.values
        return ((test_df['rating'] - test_df['preds'])**2).sum()
            

### Metrics with Just userid and business id

In [50]:
baseline_config = {   'destination': 'data/',
    'categorical_columns':['business_id','user_id'],
    'numerical_columns' : [],
    'label' : 'rating',
    'model_destination' : 'trained_models/',
    'model_name' : 'baseline',
    'output_destination': 'output/'
}

In [51]:
baseline_FFM = FFMModel(ratings_train, ratings_holdout, None, baseline_config)

In [None]:
%%capture
baseline_FFM.train()

In [None]:
baseline_FFM.get_rmse()

##### Metrics with Baseline + User and Business Information

In [57]:
user_business_config = {   'destination': 'data/',
    'categorical_columns':['business_id','user_id','city','state'],
    'numerical_columns' : ['business_review_counts','user_review_counts','business_average_stars','user_average_stars'],
    'label' : 'rating',
    'model_destination' : 'trained_models/',
    'model_name' : 'user_business',
    'output_destination': 'output/'
}

In [58]:
user_business_FFM = FFMModel(ratings_train, ratings_holdout, None, user_business_config)

In [59]:
%%capture
user_business_FFM.train()

In [60]:
user_business_FFM.get_rmse()

949808.1389901494

##### Metrics with Baseline + Location Cluster

In [66]:
ratings_train.columns

Index(['user_id', 'business_id', 'rating', 'date', 'text', 'city', 'state',
       'latitude', 'longitude', 'business_average_stars',
       'business_review_counts', 'user_review_counts', 'elite',
       'user_average_stars', 'friends', 'week_day', 'month', 'hour',
       'text_cluster', 'graph_cluster', 'location_cluster'],
      dtype='object')

In [63]:
location_config = {   'destination': 'data/',
    'categorical_columns':['business_id','user_id','location_cluster'],
    'numerical_columns' : [],
    'label' : 'rating',
    'model_destination' : 'trained_models/',
    'model_name' : 'location',
    'output_destination': 'output/'
}

In [64]:
location_FFM = FFMModel(ratings_train, ratings_holdout, None, location_config)

In [65]:
%%capture
location_FFM.train()

In [67]:
location_FFM.get_rmse()

883527.5595913827

##### Metrics with User and Business Information + Text Cluster

In [100]:
text_config = {   'destination': 'data/',
    'categorical_columns':['business_id','user_id','text_cluster'],
    'numerical_columns' : [],
    'label' : 'rating',
    'model_destination' : 'trained_models/',
    'model_name' : 'text',
    'output_destination': 'output/'
}
text_FFM = FFMModel(ratings_train, ratings_holdout, None, text_config)

In [101]:
%%capture
text_FFM.train()

In [102]:
text_FFM.get_rmse()

910152.6939438462

##### Metrics with User and Business Information + Graph Cluster

In [103]:
graph_config = {   'destination': 'data/',
    'categorical_columns':['business_id','user_id','graph_cluster'],
    'numerical_columns' : [],
    'label' : 'rating',
    'model_destination' : 'trained_models/',
    'model_name' : 'graph',
    'output_destination': 'output/'
}
graph_FFM = FFMModel(ratings_train, ratings_holdout, None, graph_config)

In [104]:
%%capture
graph_FFM.train()

In [105]:
graph_FFM.get_rmse()

888433.1457932991

##### Metrics with User and Business Information + All clusters

In [106]:
all_config = {   'destination': 'data/',
    'categorical_columns':['business_id','user_id','graph_cluster','location_cluster','text_cluster'],
    'numerical_columns' : [],
    'label' : 'rating',
    'model_destination' : 'trained_models/',
    'model_name' : 'all_clusters',
    'output_destination': 'output/'
}
all_FFM = FFMModel(ratings_train, ratings_holdout, None, all_config)

In [107]:
%%capture
all_FFM.train()

In [108]:
all_FFM.get_rmse()

885141.5994894599

### Run Time Analysis

FFM runs in linear time which leads to fast runtime. Our model takes just two minutes to run

In [112]:
%%capture
import time
start_time = time.time()
baseline_FFM = FFMModel(ratings_train, ratings_holdout, None, baseline_config)
baseline_FFM.train()

In [113]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 123.04060411453247 seconds ---
