In [86]:
import pandas as pd
from pathlib import Path
from math import sqrt
import contractions
import re
import gzip
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation, FactorAnalysis, KernelPCA
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
import random

# Input Settings

In [2]:
min_opinions = 5 
input_path = "data/raw/Patio.txt.gz"
vec_method = "w2v"
hyperparameters = '{objective: "multi:softmax", learning_rate: 0.3}'
dim_method = 'pca'

# Load data

In [3]:
def load_reviews_to_df(input_path):
    reviews_array = []
    dictionary = {}
    with gzip.open(input_path) as raw_data:
        for review in raw_data:
            this_line = review.decode("utf-8").split(":")
            if len(this_line) > 1:
                dictionary[this_line[0]] = this_line[1].strip()
            else:
                reviews_array.append(dictionary)
                dictionary = {}

    col_names = ['productId', 'title', 'price', 'userId',
                'profileName', 'helpfulness', 'score',
                'time', 'summary', 'text']

    reviews = pd.DataFrame(reviews_array)
    reviews.columns = col_names
    reviews[['score']] = reviews[['score']].astype(float)
    reviews['time'] = pd.to_datetime(reviews['time'], unit='s')
    reviews["helpfulness_num"] = reviews["helpfulness"].apply(lambda x: int(x.split("/")[0]))
    reviews["helpfulness_den"] = reviews["helpfulness"].apply(lambda x: int(x.split("/")[1]))
    return reviews

In [4]:
df = load_reviews_to_df(input_path)

# Analysis

# Preprocessing

In [5]:
def select_rows(df, min_opinions):
    df["year"] = df['time'].astype('datetime64[ns]').dt.year
    df = df.loc[(df["userId"] != "unknown") & df["year"].isin([2011, 2012, 2013]), ]
    
    user_no_opinions_train = df.loc[df["year"].isin([2012, 2013]), ["userId", "productId"]].groupby("userId").count()
    users_train = user_no_opinions_train.loc[user_no_opinions_train["productId"] >= min_opinions, ].index.values
    
    user_no_opinions_nlp = df.loc[df["year"] == 2011, ["userId", "productId"]].groupby("userId").count()
    users_nlp = user_no_opinions_nlp.loc[user_no_opinions_nlp["productId"] >= 2, ].index.values
    return df.dropna().loc[(df["userId"].isin(users_train) & (df["year"] > 2011)) | 
                           (df["userId"].isin(users_nlp) & (df["year"] == 2011)),
                           ["productId", "userId", "score", "text", "helpfulness_num", "helpfulness_den", "year"]]


def confidence(ups, n):
    if n == 0:
        return 0
    z = 1.281551565545
    p = float(ups) / n
    left = p + 1 / (2 * n) * z ** 2
    right = z * sqrt(p * (1 - p) / n + z ** 2 / (4 * n ** 2))
    under = 1 + 1 / n * z ** 2
    return (left - right) / under


def clean_text(text, wnl):
    text = str(text)
    text = contractions.fix(text, slang=True)
    text = text.lower()
    text = re.sub(r"\d+", "", re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text))
    words = word_tokenize(text)
    return " ".join([wnl.lemmatize(i) for i in words])

In [6]:
df_clean = select_rows(df, min_opinions)
df_clean["wilson_score"] = df_clean.apply(
        lambda row: confidence(row["helpfulness_num"], row["helpfulness_den"]), axis=1)
wnl = WordNetLemmatizer()
df_clean["text"] = df_clean["text"].apply(clean_text, wnl=wnl)

# Data split

In [7]:
def data_split(df_clean):
    df_nlp = df_clean.loc[df_clean["year"] == 2011, ]
    df_train, df_test_val = train_test_split(df_clean.loc[df_clean["year"] > 2011, ], test_size=0.2, random_state=2022,
                                             stratify=df_clean.loc[df_clean["year"] > 2011, "userId"])
    return df_nlp, df_train, df_test_val

In [8]:
df_nlp, df_train, df_test = data_split(df_clean)

# Vectorisation

In [9]:
def bag_of_words(df_nlp, df_train, df_test):
    vectorizer = CountVectorizer()
    X_nlp = vectorizer.fit_transform(df_nlp['text'].values.tolist())
    X_train = vectorizer.transform(df_train['text'].values.tolist())
    X_test = vectorizer.transform(df_test['text'].values.tolist())
    data_nlp = np.c_[np.transpose(df_nlp["score"].values), X_nlp.toarray()]
    data_train = np.c_[np.transpose(df_train["score"].values), X_train.toarray()]
    data_test = np.c_[np.transpose(df_test["score"].values), X_test.toarray()]
    return data_nlp, data_train, data_test


def word2vec(df_nlp, df_train, df_test):
    sent_train = [str(row).split() for row in df_nlp['text']]
    phrases = Phrases(sent_train, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sent_train]

    w2v_model = Word2Vec(sentences=sentences, vector_size=300)
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
    
#     data_nlp = np.array([np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 300) for word_tokens in sent_train])
#     data_nlp = np.c_[np.transpose(df_nlp["score"].values), data_nlp]
    for i in range(len(sent_train)):
        word_tokens = sent_train[i]
        words_mean = np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 300)
        if i == 0:
            data_nlp = words_mean
        else:
            data_nlp = np.concatenate((data_nlp, words_mean))

    data_nlp = np.c_[np.transpose(df_nlp["score"].values), data_nlp]

    # train
    sent_test = [row.split() for row in df_train['text']]
    for i in range(len(sent_test)):
        word_tokens = sent_test[i]
        words_mean = np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 300)
        if i == 0:
            data_train = words_mean
        else:
            data_train = np.concatenate((data_train, words_mean))

    data_train = np.c_[np.transpose(df_train["score"].values), data_train]

    # test
    sent_test = [row.split() for row in df_test['text']]
    for i in range(len(sent_test)):
        word_tokens = sent_test[i]
        words_mean = np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 300)
        if i == 0:
            data_test = words_mean
        else:
            data_test = np.concatenate((data_test, words_mean))

    data_test = np.c_[np.transpose(df_test["score"].values), data_test]

    return data_nlp, data_train, data_test


def TFIDF(df_nlp, df_train, df_test):
    vectorizer = TfidfVectorizer()
    X_nlp = vectorizer.fit_transform(df_nlp['text'].values.tolist())
    X_train = vectorizer.transform(df_train['text'].values.tolist())
    X_test = vectorizer.transform(df_test['text'].values.tolist())
    data_nlp = np.c_[np.transpose(df_nlp["score"].values), X_nlp.toarray()]
    data_train = np.c_[np.transpose(df_train["score"].values), X_train.toarray()]
    data_test = np.c_[np.transpose(df_test["score"].values), X_test.toarray()]
    return data_nlp, data_train, data_test

In [10]:
# vec_method_functions = {"bow": bag_of_words,
#                         "w2v": word2vec,
#                         "tfidf": TFIDF}

# assert vec_method in vec_method_functions.keys(), f"Unrecognised method: {vec_method}"

# data_nlp, data_train, data_test = vec_method_functions[vec_method](df_nlp, df_train, df_test)

In [11]:
data_nlp_bow, data_train_bow, data_test_bow = bag_of_words(df_nlp, df_train, df_test)
data_nlp_bow[np.isnan(data_nlp_bow)] = 0
data_train_bow[np.isnan(data_train_bow)] = 0
data_test_bow[np.isnan(data_test_bow)] = 0

In [12]:
data_nlp_TFIDF, data_train_TFIDF, data_test_TFIDF = TFIDF(df_nlp, df_train, df_test)
data_nlp_TFIDF[np.isnan(data_nlp_TFIDF)] = 0
data_train_TFIDF[np.isnan(data_train_TFIDF)] = 0
data_test_TFIDF[np.isnan(data_test_TFIDF)] = 0

In [13]:
data_nlp_w2v, data_train_w2v, data_test_w2v = word2vec(df_nlp, df_train, df_test)
data_nlp_w2v[np.isnan(data_nlp_w2v)] = 0
data_train_w2v[np.isnan(data_train_w2v)] = 0
data_test_w2v[np.isnan(data_test_w2v)] = 0

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


# Dimensionality reduction

## BoW

In [100]:
data_nlp_bow.shape

(7011, 17431)

In [101]:
scaler = StandardScaler()
pca = PCA(0.99, random_state=2022)
pca.fit(scaler.fit_transform(data_nlp_bow[:, 1:]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(pd.DataFrame({"dim": pca.n_components_, "var_explained": pca.explained_variance_ratio_, "cumulative":  np.cumsum(pca.explained_variance_ratio_)}))

Unnamed: 0,dim,var_explained,cumulative
0,3870,0.003318,0.003318
1,3870,0.002772,0.00609
2,3870,0.002657,0.008747
3,3870,0.002651,0.011398
4,3870,0.002616,0.014014
5,3870,0.002234,0.016247
6,3870,0.002207,0.018454
7,3870,0.002124,0.020579
8,3870,0.002094,0.022672
9,3870,0.00205,0.024723


In [102]:
pca_bow = PCA(0.9, random_state=2022)
data_bow = pca_bow.fit_transform(scaler.fit_transform(data_nlp_bow[:, 1:]))

In [103]:
neigh = KNeighborsClassifier()
neigh.fit(data_bow, data_nlp_bow[:, :1] - 1)
np.mean((neigh.predict(data_bow) - data_nlp_bow[:, 0] + 1)**2)**0.5

  return self._fit(X, y)


1.1156873383955088

In [104]:
xgb = XGBClassifier()
xgb.fit(data_bow, data_nlp_bow[:, :1] - 1)
np.mean((xgb.predict(data_bow) - data_nlp_bow[:, 0] + 1)**2)**0.5

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




0.09173520213837134

## TFIDF

In [105]:
data_nlp_TFIDF.shape

(7011, 17431)

In [106]:
pca = PCA(0.99, random_state=2022)
pca.fit(scaler.fit_transform(data_nlp_TFIDF[:, 1:]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(pd.DataFrame({"dim": pca.n_components_, "var_explained": pca.explained_variance_ratio_, "cumulative":  np.cumsum(pca.explained_variance_ratio_)}))

Data scaled.


Unnamed: 0,dim,var_explained,cumulative
0,4254,0.002601,0.002601
1,4254,0.002499,0.0051
2,4254,0.002066,0.007166
3,4254,0.001909,0.009075
4,4254,0.001665,0.01074
5,4254,0.001649,0.012389
6,4254,0.001533,0.013922
7,4254,0.00149,0.015412
8,4254,0.001468,0.01688
9,4254,0.001423,0.018303


In [107]:
pca_tfidf = PCA(0.9, random_state=2022)
data_tfidf = pca_tfidf.fit_transform(scaler.fit_transform(data_nlp_TFIDF[:, 1:]))

In [108]:
neigh = KNeighborsClassifier()
neigh.fit(data_tfidf, data_nlp_TFIDF[:, 0] - 1)
np.mean((neigh.predict(data_tfidf) - data_nlp_TFIDF[:, 0] + 1)**2)**0.5

1.1052835338505285

In [None]:
xgb = XGBClassifier()
xgb.fit(data_tfidf, data_nlp_TFIDF[:, 0] - 1)
np.mean((xgb.predict(data_tfidf) - data_nlp_TFIDF[:, 0] + 1)**2)**0.5





## w2v

In [18]:
data_nlp_w2v.shape

(7011, 301)

In [19]:
pca = PCA(0.99, random_state=2022)
pca.fit(scaler.fit_transform(data_nlp_w2v[:, 1:]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(pd.DataFrame({"dim": pca.n_components_, "var_explained": pca.explained_variance_ratio_, "cumulative":  np.cumsum(pca.explained_variance_ratio_)}))

Data scaled.


Unnamed: 0,dim,var_explained,cumulative
0,163,0.075429,0.075429
1,163,0.063458,0.138887
2,163,0.056866,0.195753
3,163,0.051799,0.247551
4,163,0.032505,0.280056
5,163,0.030738,0.310793
6,163,0.028969,0.339762
7,163,0.025636,0.365398
8,163,0.024097,0.389495
9,163,0.02116,0.410655


In [98]:
pca_w2v = PCA(0.9, random_state=2022)
data_w2v = pca_w2v.fit_transform(scaler.fit_transform(data_nlp_w2v[:, 1:]))

In [99]:
neigh = KNeighborsClassifier()
neigh.fit(data_w2v, data_nlp_bow[:, 0] - 1)
np.mean((neigh.predict(data_w2v) - data_nlp_w2v[:, 0] + 1)**2)**0.5

2.187313168254729

In [None]:
xgb = XGBClassifier()
xgb.fit(data_w2v, data_nlp_bow[:, 0] - 1)
np.mean((xgb.predict(data_w2v) - data_nlp_bow[:, 0] + 1)**2)**0.5

## final

In [20]:
def PCA_dimred(df_nlp, df_train, df_test, percentage=0.95):
    scaler = StandardScaler()
    pca = PCA(percentage, random_state=2022)
    data_nlp = pca.fit_transform(scaler.fit_transform(df_nlp[:, 1:]))
    data_train = pca.transform(scaler.transform(df_train[:, 1:]))
    data_test = pca.transform(scaler.transform(df_test[:, 1:]))
    print(f"Shape of training data reduced from {df_nlp[:, 1:].shape} to {data_nlp.shape}.")
    return np.concatenate((df_nlp[:, :1], data_nlp), 1), \
           np.concatenate((df_train[:, :1], data_train), 1), \
           np.concatenate((df_test[:, :1], data_test), 1)

In [21]:
dim_method_functions = {"pca": PCA_dimred}

# vectorise
assert dim_method in dim_method_functions.keys(), f"Unrecognised method: {dim_method}"
    
data_nlp, data_train, data_test = dim_method_functions[dim_method](data_nlp_w2v, data_train_w2v, data_test_w2v)

Shape of training data reduced from (7011, 300) to (7011, 113).


# Scores prediction

In [49]:
def xgboost(X_train, y_train, hyperparameters):
    model = XGBClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    return model

def get_random_params():
    return {
        "objective": "multi:softmax",
        "eval_metric": "merror",
        "eta": random.uniform(0.1, 0.4), 
        "gamma": random.uniform(0, 6),
        "max_depth":random.choice([i for i in range(4, 15)]),
        "subsample": random.choice([i/10 for i in range(5, 11)]),
        "colsample_bytree": random.choice([i/10 for i in range(5, 11)]),
#         "scale_pos_weight": 10**random.choice([-1, -0.5, 0, 0.5, 1]),
        "alpha": random.uniform(0, 2),
        "lambda": random.uniform(0.1, 5),  
        "max_delta_step": 10**random.uniform(0,  1), 
        'num_class': 5
    }

In [50]:
DTrain = xgb.DMatrix(data_nlp[:, 1:], data_nlp[:, :1]-1)
cv_results3 = pd.DataFrame()
for i in range(50):
    params = get_random_params()
    results = xgb.cv(params, DTrain)
    cv_results3 = cv_results3.append(pd.DataFrame(np.concatenate((np.array([i for i in params.values()]), results.mean().values))).transpose())

In [29]:
results

{'objective': 'multi:softmax',
 'eta': 0.37402652787258905,
 'gamma': 1.4220147631805014,
 'max_depth': 17,
 'subsample': 0.7,
 'colsample_bytree': 0.6,
 'alpha': 1.8795169854292686,
 'lambda': 1.7282225661718529,
 'max_delta_step': 6.837467051182085,
 'num_class': 5}

In [68]:
cv_results3.columns = ["objective", "eval_metric", "eta", "gamma", "max_depth", "subsample", "colsample_bytree", "alpha", "lambda", "max_delta_step", "num_class", "train_error", "train_error_std", "test_error", "test_error_std"]
cv_results3.index = [i for i in range(cv_results3.shape[0])]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(cv_results3.sort_values("test_error"))

Unnamed: 0,objective,eval_metric,eta,gamma,max_depth,subsample,colsample_bytree,alpha,lambda,max_delta_step,num_class,train_error,train_error_std,test_error,test_error_std
12,multi:softmax,merror,0.3895781898586559,0.9521517489652932,15,1.0,0.8,0.9654922129155727,3.1439780531545423,1.1679276528311529,5,0.0432534666666666,0.0017220662324906,0.3042932666666666,0.0097769326163488
33,multi:softmax,merror,0.1713041819518868,0.3748413176428558,15,1.0,0.9,1.917782374585211,3.83083524265389,6.790740567133205,5,0.0547354,0.0041398530799527,0.3102125666666667,0.0099722020207744
47,multi:softmax,merror,0.2660534067126462,2.341110698301225,13,1.0,0.7,1.8518625913886664,1.926889581069076,6.25236061319274,5,0.0740123333333333,0.003248209136664,0.3104834666666667,0.0084883006626016
46,multi:softmax,merror,0.2725366507557842,1.8656383106937724,15,0.7,0.9,0.3588977836478411,1.3591119785920998,2.387511467245078,5,0.0714804666666666,0.0029515608559262,0.3139638,0.0128122956861697
17,multi:softmax,merror,0.212274324926483,4.778619546143207,11,1.0,0.8,0.4262750784834246,1.357038661290502,1.4283939854562724,5,0.1381472333333333,0.0050463651408591,0.3149765,0.0100899160286889
26,multi:softmax,merror,0.3796979986439194,0.99382067282356,19,0.7,1.0,0.171940482474981,4.953353296054935,4.136499743370703,5,0.0845884666666666,0.0030529996169147,0.3166024666666666,0.0128262202989502
7,multi:softmax,merror,0.1773666009443857,2.6270155519029244,13,0.9,0.8,1.4378732659066962,3.5144395238611748,1.861187368174805,5,0.1275781,0.0028503746289923,0.3170446333333333,0.0122911883214982
25,multi:softmax,merror,0.3708082235227982,4.008964281594962,11,0.9,1.0,0.2933927560032656,4.786183597671624,1.2721866213346107,5,0.1672228333333333,0.0034643240650115,0.3200685,0.0105900777884979
23,multi:softmax,merror,0.3679647709403028,4.191685190196409,14,0.9,0.5,1.3149834969055887,1.1539195593487788,1.531836766639461,5,0.1370845666666666,0.0028854927085196,0.3200970333333333,0.0098458045215085
28,multi:softmax,merror,0.2488419701620201,0.2021337245187573,17,0.6,1.0,1.686210388272832,3.194142719265638,5.532358072660556,5,0.1162245,0.0042163751143464,0.3202253333333333,0.0107404998672247


In [33]:
cv_results3.to_csv("hiperparams6.csv", index=False)

In [70]:
best_params = cv_results3.sort_values("test_error").loc[19, ["objective", "eval_metric", "eta", "gamma", "max_depth", "subsample", "colsample_bytree", "alpha", "lambda", "max_delta_step", "num_class"]].to_dict()
best_params

{'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'eta': '0.27618425179519157',
 'gamma': '5.822865143099207',
 'max_depth': '14',
 'subsample': '0.9',
 'colsample_bytree': '0.7',
 'alpha': '1.6393375069813245',
 'lambda': '4.505987462661249',
 'max_delta_step': '2.3184676896075045',
 'num_class': '5'}

In [76]:
xgb_model = XGBClassifier(**best_params)
xgb_model.fit(data_nlp[:, 1:], data_nlp[:, 0] - 1)

XGBClassifier(alpha='1.6393375069813245', base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree='0.7',
              enable_categorical=False, eta='0.27618425179519157',
              eval_metric='merror', gamma='5.822865143099207', gpu_id=-1,
              importance_type=None, interaction_constraints='',
              lambda='4.505987462661249', learning_rate=0.276184261,
              max_delta_step='2.3184676896075045', max_depth='14',
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_class='5', num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=1.63933754, reg_lambda=4.50598764,
              scale_pos_weight=None, ...)

In [81]:
np.mean((xgb_model.predict(data_nlp[:, 1:]) + 1 - data_nlp[:, 0])**2)**0.5

0.8538969041858491

In [82]:
np.mean(xgb_model.predict(data_nlp[:, 1:]) + 1 == data_nlp[:, 0])

0.8612180858650692

In [83]:
np.mean((xgb_model.predict(data_train[:, 1:]) + 1 - data_train[:, 0])**2)**0.5

1.2918551624274552

In [84]:
np.mean(xgb_model.predict(data_train[:, 1:]) + 1 == data_train[:, 0])

0.6300510003923107

# Prepare data for recommendations

# Build recomendations

# Reccomenations for existing users

# Recomendations for new users (cold start)