In [1]:
import pandas as pd
from pathlib import Path
from math import sqrt
import contractions
import re
import gzip
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation, FactorAnalysis, KernelPCA
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
import random

# Input Settings

In [2]:
min_opinions = 5 
input_path = "data/raw/Patio.txt.gz"
vec_method = "w2v"
hyperparameters = '{objective: "multi:softmax", learning_rate: 0.3}'
dim_method = 'pca'

# Load data

In [3]:
def load_reviews_to_df(input_path):
    reviews_array = []
    dictionary = {}
    with gzip.open(input_path) as raw_data:
        for review in raw_data:
            this_line = review.decode("utf-8").split(":")
            if len(this_line) > 1:
                dictionary[this_line[0]] = this_line[1].strip()
            else:
                reviews_array.append(dictionary)
                dictionary = {}

    col_names = ['productId', 'title', 'price', 'userId',
                'profileName', 'helpfulness', 'score',
                'time', 'summary', 'text']

    reviews = pd.DataFrame(reviews_array)
    reviews.columns = col_names
    reviews[['score']] = reviews[['score']].astype(float)
    reviews['time'] = pd.to_datetime(reviews['time'], unit='s')
    reviews["helpfulness_num"] = reviews["helpfulness"].apply(lambda x: int(x.split("/")[0]))
    reviews["helpfulness_den"] = reviews["helpfulness"].apply(lambda x: int(x.split("/")[1]))
    return reviews

In [4]:
df = load_reviews_to_df(input_path)

# Analysis

# Preprocessing

In [5]:
def select_rows(df, min_opinions):
    df["year"] = df['time'].astype('datetime64[ns]').dt.year
    df = df.loc[(df["userId"] != "unknown") & df["year"].isin([2011, 2012, 2013]), ]
    
    user_no_opinions_train = df.loc[df["year"].isin([2012, 2013]), ["userId", "productId"]].groupby("userId").count()
    users_train = user_no_opinions_train.loc[user_no_opinions_train["productId"] >= min_opinions, ].index.values
    
    user_no_opinions_nlp = df.loc[df["year"] == 2011, ["userId", "productId"]].groupby("userId").count()
    users_nlp = user_no_opinions_nlp.loc[user_no_opinions_nlp["productId"] >= 2, ].index.values
    return df.dropna().loc[(df["userId"].isin(users_train) & (df["year"] > 2011)) | 
                           (df["userId"].isin(users_nlp) & (df["year"] == 2011)),
                           ["productId", "userId", "score", "text", "helpfulness_num", "helpfulness_den", "year"]]


def confidence(ups, n):
    if n == 0:
        return 0
    z = 1.281551565545
    p = float(ups) / n
    left = p + 1 / (2 * n) * z ** 2
    right = z * sqrt(p * (1 - p) / n + z ** 2 / (4 * n ** 2))
    under = 1 + 1 / n * z ** 2
    return (left - right) / under


def clean_text(text, wnl):
    text = str(text)
    text = contractions.fix(text, slang=True)
    text = text.lower()
    text = re.sub(r"\d+", "", re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text))
    words = word_tokenize(text)
    return " ".join([wnl.lemmatize(i) for i in words])

In [6]:
df_clean = select_rows(df, min_opinions)
df_clean["wilson_score"] = df_clean.apply(
        lambda row: confidence(row["helpfulness_num"], row["helpfulness_den"]), axis=1)
wnl = WordNetLemmatizer()
df_clean["text"] = df_clean["text"].apply(clean_text, wnl=wnl)

# Data split

In [7]:
def data_split(df_clean):
    df_nlp = df_clean.loc[df_clean["year"] == 2011, ]
    df_train, df_test_val = train_test_split(df_clean.loc[df_clean["year"] > 2011, ], test_size=0.2, random_state=2022,
                                             stratify=df_clean.loc[df_clean["year"] > 2011, "userId"])
    return df_nlp, df_train, df_test_val

In [8]:
df_nlp, df_train, df_test = data_split(df_clean)

# Vectorisation

In [9]:
def bag_of_words(df_nlp, df_train, df_test):
    vectorizer = CountVectorizer()
    X_nlp = vectorizer.fit_transform(df_nlp['text'].values.tolist())
    X_train = vectorizer.transform(df_train['text'].values.tolist())
    X_test = vectorizer.transform(df_test['text'].values.tolist())
    data_nlp = np.c_[np.transpose(df_nlp["score"].values), X_nlp.toarray()]
    data_train = np.c_[np.transpose(df_train["score"].values), X_train.toarray()]
    data_test = np.c_[np.transpose(df_test["score"].values), X_test.toarray()]
    return data_nlp, data_train, data_test


def word2vec(df_nlp, df_train, df_test):
    sent_train = [str(row).split() for row in df_nlp['text']]
    phrases = Phrases(sent_train, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sent_train]

    w2v_model = Word2Vec(sentences=sentences, vector_size=1000)
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
    
#     data_nlp = np.array([np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 300) for word_tokens in sent_train])
#     data_nlp = np.c_[np.transpose(df_nlp["score"].values), data_nlp]
    for i in range(len(sent_train)):
        word_tokens = sent_train[i]
        words_mean = np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 1000)
        if i == 0:
            data_nlp = words_mean
        else:
            data_nlp = np.concatenate((data_nlp, words_mean))

    data_nlp = np.c_[np.transpose(df_nlp["score"].values), data_nlp]

    # train
    sent_test = [row.split() for row in df_train['text']]
    for i in range(len(sent_test)):
        word_tokens = sent_test[i]
        words_mean = np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 1000)
        if i == 0:
            data_train = words_mean
        else:
            data_train = np.concatenate((data_train, words_mean))

    data_train = np.c_[np.transpose(df_train["score"].values), data_train]

    # test
    sent_test = [row.split() for row in df_test['text']]
    for i in range(len(sent_test)):
        word_tokens = sent_test[i]
        words_mean = np.mean(w2v_model.wv.vectors_for_all(word_tokens).vectors, axis=0).reshape(-1, 1000)
        if i == 0:
            data_test = words_mean
        else:
            data_test = np.concatenate((data_test, words_mean))

    data_test = np.c_[np.transpose(df_test["score"].values), data_test]

    return data_nlp, data_train, data_test


def TFIDF(df_nlp, df_train, df_test):
    vectorizer = TfidfVectorizer()
    X_nlp = vectorizer.fit_transform(df_nlp['text'].values.tolist())
    X_train = vectorizer.transform(df_train['text'].values.tolist())
    X_test = vectorizer.transform(df_test['text'].values.tolist())
    data_nlp = np.c_[np.transpose(df_nlp["score"].values), X_nlp.toarray()]
    data_train = np.c_[np.transpose(df_train["score"].values), X_train.toarray()]
    data_test = np.c_[np.transpose(df_test["score"].values), X_test.toarray()]
    return data_nlp, data_train, data_test

In [10]:
# vec_method_functions = {"bow": bag_of_words,
#                         "w2v": word2vec,
#                         "tfidf": TFIDF}

# assert vec_method in vec_method_functions.keys(), f"Unrecognised method: {vec_method}"

# data_nlp, data_train, data_test = vec_method_functions[vec_method](df_nlp, df_train, df_test)

In [11]:
data_nlp_bow, data_train_bow, data_test_bow = bag_of_words(df_nlp, df_train, df_test)
data_nlp_bow[np.isnan(data_nlp_bow)] = 0
data_train_bow[np.isnan(data_train_bow)] = 0
data_test_bow[np.isnan(data_test_bow)] = 0

In [12]:
data_nlp_TFIDF, data_train_TFIDF, data_test_TFIDF = TFIDF(df_nlp, df_train, df_test)
data_nlp_TFIDF[np.isnan(data_nlp_TFIDF)] = 0
data_train_TFIDF[np.isnan(data_train_TFIDF)] = 0
data_test_TFIDF[np.isnan(data_test_TFIDF)] = 0

In [10]:
data_nlp_w2v, data_train_w2v, data_test_w2v = word2vec(df_nlp, df_train, df_test)
data_nlp_w2v[np.isnan(data_nlp_w2v)] = 0
data_train_w2v[np.isnan(data_train_w2v)] = 0
data_test_w2v[np.isnan(data_test_w2v)] = 0

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


# Dimensionality reduction

## BoW

In [100]:
data_nlp_bow.shape

(7011, 17431)

In [101]:
scaler = StandardScaler()
pca = PCA(0.99, random_state=2022)
pca.fit(scaler.fit_transform(data_nlp_bow[:, 1:]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(pd.DataFrame({"dim": pca.n_components_, "var_explained": pca.explained_variance_ratio_, "cumulative":  np.cumsum(pca.explained_variance_ratio_)}))

Unnamed: 0,dim,var_explained,cumulative
0,3870,0.003318,0.003318
1,3870,0.002772,0.00609
2,3870,0.002657,0.008747
3,3870,0.002651,0.011398
4,3870,0.002616,0.014014
5,3870,0.002234,0.016247
6,3870,0.002207,0.018454
7,3870,0.002124,0.020579
8,3870,0.002094,0.022672
9,3870,0.00205,0.024723


In [102]:
pca_bow = PCA(0.9, random_state=2022)
data_bow = pca_bow.fit_transform(scaler.fit_transform(data_nlp_bow[:, 1:]))

In [103]:
neigh = KNeighborsClassifier()
neigh.fit(data_bow, data_nlp_bow[:, :1] - 1)
np.mean((neigh.predict(data_bow) - data_nlp_bow[:, 0] + 1)**2)**0.5

  return self._fit(X, y)


1.1156873383955088

In [104]:
xgb = XGBClassifier()
xgb.fit(data_bow, data_nlp_bow[:, :1] - 1)
np.mean((xgb.predict(data_bow) - data_nlp_bow[:, 0] + 1)**2)**0.5

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




0.09173520213837134

## TFIDF

In [105]:
data_nlp_TFIDF.shape

(7011, 17431)

In [106]:
pca = PCA(0.99, random_state=2022)
pca.fit(scaler.fit_transform(data_nlp_TFIDF[:, 1:]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(pd.DataFrame({"dim": pca.n_components_, "var_explained": pca.explained_variance_ratio_, "cumulative":  np.cumsum(pca.explained_variance_ratio_)}))

Data scaled.


Unnamed: 0,dim,var_explained,cumulative
0,4254,0.002601,0.002601
1,4254,0.002499,0.0051
2,4254,0.002066,0.007166
3,4254,0.001909,0.009075
4,4254,0.001665,0.01074
5,4254,0.001649,0.012389
6,4254,0.001533,0.013922
7,4254,0.00149,0.015412
8,4254,0.001468,0.01688
9,4254,0.001423,0.018303


In [107]:
pca_tfidf = PCA(0.9, random_state=2022)
data_tfidf = pca_tfidf.fit_transform(scaler.fit_transform(data_nlp_TFIDF[:, 1:]))

In [108]:
neigh = KNeighborsClassifier()
neigh.fit(data_tfidf, data_nlp_TFIDF[:, 0] - 1)
np.mean((neigh.predict(data_tfidf) - data_nlp_TFIDF[:, 0] + 1)**2)**0.5

1.1052835338505285

In [109]:
xgb = XGBClassifier()
xgb.fit(data_tfidf, data_nlp_TFIDF[:, 0] - 1)
np.mean((xgb.predict(data_tfidf) - data_nlp_TFIDF[:, 0] + 1)**2)**0.5





0.08857096189078092

## w2v

In [11]:
data_nlp_w2v.shape

(7011, 1001)

In [19]:
pca = PCA(0.99, random_state=2022)
pca.fit(scaler.fit_transform(data_nlp_w2v[:, 1:]))
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(pd.DataFrame({"dim": pca.n_components_, "var_explained": pca.explained_variance_ratio_, "cumulative":  np.cumsum(pca.explained_variance_ratio_)}))

Data scaled.


Unnamed: 0,dim,var_explained,cumulative
0,163,0.075429,0.075429
1,163,0.063458,0.138887
2,163,0.056866,0.195753
3,163,0.051799,0.247551
4,163,0.032505,0.280056
5,163,0.030738,0.310793
6,163,0.028969,0.339762
7,163,0.025636,0.365398
8,163,0.024097,0.389495
9,163,0.02116,0.410655


In [113]:
pca_w2v = PCA(0.9, random_state=2022)
data_w2v = pca_w2v.fit_transform(scaler.fit_transform(data_nlp_w2v[:, 1:]))

In [114]:
neigh = KNeighborsClassifier()
neigh.fit(data_w2v, data_nlp_bow[:, 0] - 1)
np.mean((neigh.predict(data_w2v) - data_nlp_w2v[:, 0] + 1)**2)**0.5

1.0530772127588508

In [115]:
xgb = XGBClassifier()
xgb.fit(data_w2v, data_nlp_w2v[:, 0] - 1)
np.mean((xgb.predict(data_w2v) - data_nlp_w2v[:, 0] + 1)**2)**0.5





0.06649528677733525

## final

In [12]:
def PCA_dimred(df_nlp, df_train, df_test, percentage=0.95):
    scaler = StandardScaler()
    pca = PCA(percentage, random_state=2022)
    data_nlp = pca.fit_transform(scaler.fit_transform(df_nlp[:, 1:]))
    data_train = pca.transform(scaler.transform(df_train[:, 1:]))
    data_test = pca.transform(scaler.transform(df_test[:, 1:]))
    print(f"Shape of training data reduced from {df_nlp[:, 1:].shape} to {data_nlp.shape}.")
    return np.concatenate((df_nlp[:, :1], data_nlp), 1), \
           np.concatenate((df_train[:, :1], data_train), 1), \
           np.concatenate((df_test[:, :1], data_test), 1)

In [13]:
dim_method_functions = {"pca": PCA_dimred}

# vectorise
assert dim_method in dim_method_functions.keys(), f"Unrecognised method: {dim_method}"
    
data_nlp, data_train, data_test = dim_method_functions[dim_method](data_nlp_w2v, data_train_w2v, data_test_w2v)

Shape of training data reduced from (7011, 1000) to (7011, 111).


# Scores prediction

## knn

In [25]:
# res = pd.DataFrame()
for i in range(60, 80):
    neigh = KNeighborsClassifier(i)
    neigh.fit(data_nlp[:, 1:], data_nlp[:, 0] - 1)
    tr = np.mean((neigh.predict(data_nlp[:, 1:]) - data_nlp[:, 0] + 1)**2)**0.5
    te = np.mean((neigh.predict(data_train[:, 1:]) - data_train[:, 0] + 1)**2)**0.5
    tr_acc = np.mean(neigh.predict(data_nlp[:, 1:]) == data_nlp[:, 0] - 1)
    te_acc = np.mean(neigh.predict(data_train[:, 1:]) == data_train[:, 0] - 1)
    res = res.append({"n": i, "train_rmse": tr, "train_acc": tr_acc, "test_rmse": te, "test_acc": te_acc}, ignore_index=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(res)

Unnamed: 0,n,train_rmse,train_acc,test_rmse,test_acc
0,25.0,1.33451,0.660248,1.286377,0.569243
1,26.0,1.328833,0.660533,1.27965,0.577874
2,27.0,1.327866,0.659392,1.292918,0.571597
3,28.0,1.332424,0.655969,1.288054,0.573166
4,29.0,1.344147,0.656397,1.299274,0.567281
5,30.0,1.348597,0.651548,1.294434,0.563358
6,31.0,1.361648,0.643417,1.290488,0.559043
7,32.0,1.360286,0.645557,1.292311,0.547273
8,33.0,1.361387,0.641991,1.28988,0.559435
9,34.0,1.376494,0.639138,1.265779,0.568851


## xgb classifier

In [15]:
def xgboost(X_train, y_train, hyperparameters):
    model = XGBClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    return model

def get_random_params():
    return {
        "objective": "multi:softmax",
        "eval_metric": "merror",
        "eta": random.uniform(0.1, 0.4), 
        "gamma": random.uniform(0, 6),
        "max_depth":random.choice([i for i in range(4, 15)]),
        "subsample": random.choice([i/10 for i in range(5, 11)]),
        "colsample_bytree": random.choice([i/10 for i in range(5, 11)]),
#         "scale_pos_weight": 10**random.choice([-1, -0.5, 0, 0.5, 1]),
        "alpha": random.uniform(0, 2),
        "lambda": random.uniform(0.1, 5),  
        "max_delta_step": 10**random.uniform(0,  1), 
        'num_class': 5
    }

In [32]:
DTrain = xgb.DMatrix(data_nlp[:, 1:], data_nlp[:, :1]-1)
cv_results3 = pd.DataFrame()
for i in range(50):
    params = get_random_params()
    results = xgb.cv(params, DTrain)
    cv_results3 = cv_results3.append(pd.DataFrame(np.concatenate((np.array([i for i in params.values()]), results.mean().values))).transpose())

In [33]:
cv_results3.columns = ["objective", "eval_metric", "eta", "gamma", "max_depth", "subsample", "colsample_bytree", "alpha", "lambda", "max_delta_step", "num_class", "train_error", "train_error_std", "test_error", "test_error_std"]
cv_results3.index = [i for i in range(cv_results3.shape[0])]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(cv_results3.sort_values("test_error"))

Unnamed: 0,objective,eval_metric,eta,gamma,max_depth,subsample,colsample_bytree,alpha,lambda,max_delta_step,num_class,train_error,train_error_std,test_error,test_error_std
40,multi:softmax,merror,0.1777005383542265,1.3959078062322567,14,1.0,0.6,0.8922732962825484,2.170178919288059,2.610349987545616,5,0.0654543333333333,0.0058683912606253,0.3044501333333333,0.0119996184942661
515,multi:softmax,merror,0.1755603268414566,0.5759217579325229,12,1.0,0.6,0.5830620587390829,4.281371034453757,7.414500518837324,5,0.0954357,0.0026479079156792,0.3059335666666666,0.0096005240834788
69,multi:softmax,merror,0.3570729563334208,0.4471423905444447,13,0.9,1.0,0.5594925524122156,3.690178182353555,1.283806989069413,5,0.0612466333333333,0.0023941558478348,0.3074311999999999,0.0054932052869227
158,multi:softmax,merror,0.2242306536158315,1.003859769721175,13,1.0,0.6,0.1311365470824792,0.5978244466111935,5.932826659817184,5,0.0289545666666666,0.0024663529829281,0.3077307333333333,0.0136045642207965
104,multi:softmax,merror,0.3003086277766594,0.723390683921934,13,1.0,0.5,1.4223795591123392,1.4654798650551557,1.1158784916362958,5,0.0690627666666666,0.0054119487451655,0.3080587666666666,0.0102594305747765
584,multi:softmax,merror,0.2929448786062507,0.321387176095339,11,1.0,0.6,0.782867797150018,3.058784944012301,7.121942266671034,5,0.0717301,0.0020230162803645,0.3082157,0.0097295969747335
275,multi:softmax,merror,0.2039697765863961,0.5170011142936897,13,1.0,0.7,1.94395868159242,4.463799145892393,2.3038992143860177,5,0.0966695666666666,0.0029802187930482,0.3082441333333333,0.011365749237398
525,multi:softmax,merror,0.2517079890504688,0.8982864732001221,11,1.0,1.0,0.9326119902460548,1.9757705929675056,4.696916186166972,5,0.0687134,0.0062125136995007,0.3084296,0.0073819182671301
502,multi:softmax,merror,0.385515626773264,0.7260329460535553,12,1.0,0.6,1.456485818484334,4.363046359615909,3.411415626020681,5,0.0782770666666666,0.0010821580445689,0.3084722666666667,0.0110480083217917
391,multi:softmax,merror,0.2928421903962406,1.4776777362745273,14,1.0,0.8,1.335247094702975,2.055148009154633,2.4405211658528447,5,0.0567537,0.002054170401192,0.3085010333333333,0.0122017414012983


In [34]:
cv_results3.to_csv("hiperparams8.csv", index=False)

In [27]:
# Select MANUALLY!
best_params = cv_results3.sort_values("test_error").loc[14, ["objective", "eval_metric", "eta", "gamma", "max_depth", "subsample", "colsample_bytree", "alpha", "lambda", "max_delta_step", "num_class"]].to_dict()
best_params

{'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'eta': '0.35270376620715205',
 'gamma': '3.0073290044505807',
 'max_depth': '10',
 'subsample': '0.9',
 'colsample_bytree': '0.8',
 'alpha': '1.2216841625381327',
 'lambda': '3.9210995845294567',
 'max_delta_step': '8.300768443995084',
 'num_class': '5'}

In [16]:
best_params = {'objective': 'multi:softmax',
 'eval_metric': 'merror',
 'eta': '0.3119643156012202',
 'gamma': '0.5108370471985526',
 'max_depth': '8',
 'subsample': '01.0',
 'colsample_bytree': '0.7',
 'alpha': '0.30025391373690136',
 'lambda': '1.9941212997558861',
 'max_delta_step': '3.3662129535920737',
 'num_class': '5'}


In [17]:
xgb_model = XGBClassifier(**best_params)
xgb_model.fit(data_nlp[:, 1:], data_nlp[:, 0] - 1)



XGBClassifier(alpha='0.30025391373690136', base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree='0.7',
              enable_categorical=False, eta='0.3119643156012202',
              eval_metric='merror', gamma='0.5108370471985526', gpu_id=-1,
              importance_type=None, interaction_constraints='',
              lambda='1.9941212997558861', learning_rate=0.311964303,
              max_delta_step='3.3662129535920737', max_depth='8',
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_class='5', num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0.300253928, reg_lambda=1.99412131,
              scale_pos_weight=None, ...)

In [18]:
np.mean((xgb_model.predict(data_nlp[:, 1:]) + 1 - data_nlp[:, 0])**2)**0.5

0.06963851074967299

In [19]:
np.mean(xgb_model.predict(data_nlp[:, 1:]) + 1 == data_nlp[:, 0])

0.9968620738838967

In [20]:
np.mean((xgb_model.predict(data_train[:, 1:]) + 1 - data_train[:, 0])**2)**0.5

1.216629898178094

In [21]:
np.mean(xgb_model.predict(data_train[:, 1:]) + 1 == data_train[:, 0])

0.629266378972146

## xgb regressor

In [38]:
def reg_get_random_params():
    return {
        "eta": random.uniform(0.05, 0.4), 
        "gamma": random.uniform(0, 10),
        "max_depth":random.choice([i for i in range(4, 15)]),
        "subsample": random.choice([i/10 for i in range(5, 11)]),
        "colsample_bytree": random.choice([i/10 for i in range(5, 11)]),
        "scale_pos_weight": 10**random.choice([-1, -0.5, 0, 0.5, 1]),
        "alpha": random.uniform(0, 2),
        "lambda": random.uniform(0.1, 5),  
        "max_delta_step": 10**random.uniform(0,  1)
    }

In [46]:
rres = pd.DataFrame()
for  i in range(20):
    params =  reg_get_random_params()
    reg = xgb.XGBRegressor(**params)
    reg.fit(data_nlp[:, 1:], data_nlp[:, :1])
    tr = np.mean((reg.predict(data_nlp[:, 1:]) - data_nlp[:, 0] )**2)**0.5
    te = np.mean((reg.predict(data_train[:, 1:]) - data_train[:, 0])**2)**0.5
    tr_acc = np.mean(np.round(reg.predict(data_nlp[:, 1:])) == data_nlp[:, 0] )
    te_acc = np.mean(np.round(reg.predict(data_train[:, 1:])) == data_train[:, 0])
    rres = rres.append(pd.DataFrame(np.concatenate((np.array([i for i in params.values()]), np.array([tr, tr_acc, te, te_acc])))).transpose())
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(rres)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.361159,5.060199,10.0,0.8,0.8,0.1,0.691974,3.704083,1.821338,0.854465,0.716446,1.137285,0.387603
0,0.109759,7.76659,11.0,0.7,0.9,1.0,0.312529,3.713757,1.232221,0.559475,0.693196,1.093119,0.289133
0,0.230742,2.269332,10.0,0.9,1.0,0.316228,0.947329,1.01744,3.193653,0.434415,0.875196,1.091071,0.326795
0,0.243494,3.513866,11.0,0.7,0.7,0.1,0.972906,3.890237,1.70587,0.817601,0.754529,1.114241,0.405257
0,0.357862,6.543125,6.0,0.9,0.8,0.1,1.565252,0.308004,6.618739,0.911924,0.672372,1.121898,0.397411
0,0.143986,6.652991,11.0,0.5,0.6,0.1,1.929894,4.261232,1.016518,0.970946,0.634432,1.073888,0.407611
0,0.289797,2.8187,9.0,1.0,0.8,1.0,0.390381,0.429072,6.287619,0.343729,0.856939,1.269417,0.276579
0,0.257181,2.739808,5.0,0.6,0.8,0.1,0.09855,4.885693,7.696038,0.842133,0.728427,1.124923,0.384464
0,0.342356,0.683057,14.0,0.7,0.7,1.0,0.931943,1.137919,1.602675,0.178479,0.981458,1.167213,0.244802
0,0.371623,4.440987,14.0,0.6,0.7,0.316228,0.436725,2.971602,1.225718,0.622746,0.755812,1.155442,0.426442


In [47]:
reg_best_params = {
        "eta": 0.124064, 
        "gamma": 8.714290,
        "max_depth": 5,
        "subsample": 1,
        "colsample_bytree": 0.5,
        "scale_pos_weight": 0.316228,
        "alpha": 0.086300,
        "lambda": 0.926369,  
        "max_delta_step": 6.679537
    }

In [48]:
reg = xgb.XGBRegressor(**reg_best_params)
reg.fit(data_nlp[:, 1:], data_nlp[:, :1])
tr = np.mean((reg.predict(data_nlp[:, 1:]) - data_nlp[:, 0])**2)**0.5
te = np.mean((reg.predict(data_train[:, 1:]) - data_train[:, 0])**2)**0.5
tr_acc = np.mean(np.round(reg.predict(data_nlp[:, 1:])) == data_nlp[:, 0])
te_acc = np.mean(np.round(reg.predict(data_train[:, 1:])) == data_train[:, 0])

In [49]:
[tr, tr_acc, te, te_acc]

[0.8840578423282942, 0.595350164027956, 1.035058110420857, 0.3617104746959592]

In [51]:
np.mean((reg.predict(data_train[:, 1:]) - data_train[:, 0]) > 1.5)

0.0663005100039231

## final

In [22]:
model = KNeighborsClassifier(73)
model.fit(data_nlp[:, 1:], data_nlp[:, 0] - 1)

KNeighborsClassifier(n_neighbors=41)

In [23]:
neigh = KNeighborsClassifier(41)
neigh.fit(data_nlp[:, 1:], data_nlp[:, 0] - 1)
tr = np.mean((neigh.predict(data_nlp[:, 1:]) - data_nlp[:, 0] + 1)**2)**0.5
te = np.mean((neigh.predict(data_train[:, 1:]) - data_train[:, 0] + 1)**2)**0.5
tr_acc = np.mean(neigh.predict(data_nlp[:, 1:]) == data_nlp[:, 0] - 1)
te_acc = np.mean(neigh.predict(data_train[:, 1:]) == data_train[:, 0] - 1)
res = res.append({"n": i, "train_rmse": tr, "train_acc": tr_acc, "test_rmse": te, "test_acc": te_acc}, ignore_index=True)

In [24]:
{"n": 41, "train_rmse": tr, "train_acc": tr_acc, "test_rmse": te, "test_acc": te_acc}

{'n': 41,
 'train_rmse': 1.3996657913535573,
 'train_acc': 0.6330052774211953,
 'test_rmse': 1.3102487043765843,
 'test_acc': 0.5610043154178109}

In [None]:
df_train["score_org"] = df_train["score"]
df_train["score"] = xgb_model.predict(data_train[:, 1:]) + 1
df_train.to_csv("Patio_NLP.csv")
df_test["score_NLP"] = xgb_model.predict(data_test[:, 1:]) + 1
df_test.to_csv("Patio_test.csv")

# Prepare data for recommendations

# Build recomendations

# Reccomenations for existing users

In [72]:
df_train

Unnamed: 0,productId,userId,score,text,helpfulness_num,helpfulness_den,year,wilson_score,score_org
178432,B000E95052,A1PA447KK26BED,5.0,i guess i did not realize how flimsy it iscert...,0,0,2013,0.000000,5.0
34358,B000PYH1LW,A2VUD7UTAEQXS7,5.0,like the cherry tomato plant that i purchased ...,0,0,2012,0.000000,5.0
71845,B0002ZINDY,A3HPCRD9RX351S,5.0,i get no joy out of killing them but the vole ...,1,1,2012,0.378448,5.0
168485,B000E94ZYO,AT8VUVCBDZ2YS,5.0,be detailed and specific what would you have w...,0,1,2012,0.000000,5.0
62129,B000GBITBK,A9M8HAYEBCPBQ,5.0,i am very proud of my military service best fr...,0,0,2012,0.000000,5.0
...,...,...,...,...,...,...,...,...,...
22226,B000E3Z0EY,A388ZNQ1RHG25I,5.0,this is a good rack at a good price i wish the...,0,0,2013,0.000000,5.0
176247,B000B7RCQW,A3MUO47CT6EQF8,5.0,it is a good standard copper plug used in ton ...,4,4,2012,0.708921,5.0
111944,B000A5AZKK,A3PEGWH4ZJITEE,5.0,these made my outdoor iron wrung patio chair p...,1,1,2012,0.378448,5.0
62147,B000E971SQ,AT8VUVCBDZ2YS,5.0,be detailed and specific what would you have w...,0,1,2012,0.000000,5.0


# Recomendations for new users (cold start)

In [89]:
df_train["score_scaled"] = df_train["score"] * df_train["wilson_score"]
sums = df_train[["score_scaled", "wilson_score", "productId"]].groupby("productId").sum()

In [90]:
best_new_products = (sums["score_scaled"] / sums["wilson_score"]).sort_values(ascending=False)[:100].index

In [91]:
best_new_products

Index(['B0000950PZ', 'B000F0E8JE', 'B000EW6EGI', 'B000EUNAW6', 'B000EHLTIQ',
       'B000E7OYNI', 'B000E7J80W', 'B000E7I240', 'B000E7GMMO', 'B00002N67T',
       'B000E5R4AA', 'B000E199HA', 'B000E157X0', 'B000DZH3XO', 'B000DEN8DY',
       'B000DCN8HW', 'B000CZ30C8', 'B000E7EKY6', 'B000F0K0XC', 'B000F0K0XW',
       'B000F3DUW2', 'B000F6U5UO', 'B000F97DWO', 'B000F97DXS', 'B000F97DZG',
       'B000F9H9GO', 'B000FCPDFA', 'B000FJRS06', 'B000FJRUK4', 'B000FK2DNM',
       'B000FLV9H2', 'B000FPVN4M', 'B000FPVN7Y', 'B000FPVNB0', 'B000CSPI48',
       'B000CSPI3E', 'B000FPWVJI', 'B000BQ81A4', 'B000BQROHU', 'B000BQQ82W',
       'B000BQPZLC', 'B000BQPGQQ', 'B000BQNBXQ', 'B000BQK6ES', 'B000BPQND2',
       'B000BQU75Q', 'B000BPF27A', 'B000BPASBK', 'B000BOC2E2', 'B000BGODPQ',
       'B000BGHE84', 'B000B9PS1G', 'B00002N680', 'B000BQW9KC', 'B000CSKN8Y',
       'B000C1Z2VE', 'B000CSKN8O', 'B000CSJ1VE', 'B000CIU726', 'B000CIU6R2',
       'B000CIADLG', 'B000C210KU', 'B000BX1IB6', 'B000BQWP0Q', 'B000BWFESU',