###Import Libraries and set directory



In [341]:
# install StellarGraph if running on Google Colab
import sys
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.2.1
  # !pip install stanfordcorenlp
  # !pip install git+https://github.com/leonardozago/pybabelfy.git
  # !pip install levenshtein
  # !pip install scispacy
  # !pip install contractions
  # !pip install unidecode

In [342]:
# verify that we're using the correct version of StellarGraph for this notebook
import stellargraph as sg

try:
    sg.utils.validate_notebook_version('1.2.1')
except AttributeError:
    raise ValueError(
        f'This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>.'
    ) from None

In [343]:
import networkx as nx
import pandas as pd
import numpy as np
import os
import random
import shutil
import requests
import gzip
# import unidecode
import csv
import re

import subprocess
from subprocess import Popen

import stellargraph as sg
from stellargraph.data import UnsupervisedSampler
from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator
from stellargraph.layer import Attri2Vec, link_classification

from tensorflow import keras

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [344]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Load data source (Musical Instruments review from Amazon)

Data source: *https://jmcauley.ucsd.edu/data/amazon/*

Choosing work directory

In [345]:
data_samples_dir = '/content/drive/MyDrive/TCC_Leonardo_Zago/data_samples'

Import data from the reviews

In [346]:
def clean_text(text):
  text = text.replace('\n', ' ').replace('\t', ' ').replace('  ', ' ').replace('"', '')
  text = re.sub(r'[^a-zA-Z0-9.!?,; ]+', '', text)
  text = re.sub(r'[\.!?,;]+', '.', text)
  text = text.replace('.', '. ')
  text = re.sub(r'\s+', ' ', text)
  text = text.replace(' .', '.')
  text = re.sub(r'\.+', '.', text)

  if text[-1] not in ['.', ' ']:
      text = text + '.'
  elif text[-1] == ' ' and text[-2] != '.':
    text = text[:-1] + '.'
  elif text[-1] == ' ' and text[-2] == '.':
    text = text[:-1]
  elif text[-2:] == ' .':
    text = text[:-2] + '.'

  text_sentences = [sentence.capitalize() for sentence in text.split('. ')]
  
  text = '. '.join(text_sentences)

  
  return text

In [347]:
def clean_text(text):
  text = text.replace('\n', ' ').replace('\t', ' ').replace('  ', ' ').replace('"', '')
  text = re.sub(r'[^a-zA-Z0-9.!?,; ]+', '', text)
  text = re.sub(r'[\.!?;]+', '.', text)
  text = re.sub(r'[\,]+', ',', text)
  text = text.replace('.', '. ').replace(',', ', ')
  text = re.sub(r'\s+', ' ', text)

  while text[0] in (['.', ',', ' ']):
    text = text[1:]

  text = text.replace(' .', '.')
  text = re.sub(r'\.+', '.', text)
  text = text.replace(' ,', ',')
  text = re.sub(r'\,+', ',', text)

  if text[-1] not in ['.', ' ']:
      text = text + '.'
  elif text[-1] == ' ' and text[-2] != '.':
    text = text[:-1] + '.'
  elif text[-1] == ' ' and text[-2] == '.':
    text = text[:-1]
  elif text[-2:] == ' .':
    text = text[:-2] + '.'

  text = text.replace(',.', '.').replace('.,', '.')
  text = re.sub(r'\.+', '.', text)

  text_sentences = [sentence.capitalize() for sentence in text.split('. ')]
  
  text = '. '.join(text_sentences)

  
  return text

In [348]:
for json_file in os.listdir(data_samples_dir):
  category = json_file.split('_5')[0]
  if '_5.json.gz' in json_file and 'Video_Games' in json_file:
    with gzip.open(os.path.join(data_samples_dir, json_file), 'rb') as gz:
      reviews_df = pd.read_json(gz, lines=True)
    reviews_df = reviews_df.dropna(subset=['overall', 'reviewText'])
    reviews_df = reviews_df.drop_duplicates(subset=['overall', 'reviewText'])

    # Selecting only negative reviews (overall rating 1) between 2 and 40 words
    filtered_reviews_df = reviews_df.loc[(reviews_df['reviewText'].str.split().apply(len) > 2) & (reviews_df['reviewText'].str.split().apply(len) < 40)]
    filtered_reviews_df = pd.concat([filtered_reviews_df.loc[filtered_reviews_df['overall'] == 5].sample(500), filtered_reviews_df.loc[filtered_reviews_df['overall'] == 1].sample(500)])

    # Merging all reviews from a category and saving data into .txt files in directory
    filtered_reviews_df['reviewText'] = filtered_reviews_df.apply(lambda x: clean_text(x['reviewText']), axis=1)


    # filtered_reviews_df = filtered_reviews_df.head(1000)

    # merged_text = filtered_reviews_df['reviewText'].str.cat(sep='\n')
    # with open('{}/unstructured_reviews/review_{}_test_1116.txt'.format(data_samples_dir, category), 'w', encoding='utf-8') as f:
    #   f.write(merged_text)

    # filtered_reviews_df['reviewText'].to_csv(r'{}/unstructured_reviews/review_{}.txt'.format(data_samples_dir, category), header=None, index=None, sep=',', mode='w', quoting=csv.QUOTE_NONE, quotechar='',  escapechar=' ')

# Model analysis using Knowledge Graph (KGen) method

## Generate Knowledge Graph (KGen) from unstructured text source

Migrating files to KGen dir

In [349]:
unstructured_reviews_dir = data_samples_dir + '/unstructured_reviews'
kgen_samples_dir = '/content/drive/MyDrive/TCC_Leonardo_Zago/KGen_master/preprocessor'

for txt in os.listdir(data_samples_dir + '/unstructured_reviews'):
  if '.txt' in txt:
    shutil.copyfile(os.path.join(unstructured_reviews_dir, txt), os.path.join(kgen_samples_dir, txt))

Start CoreNLP server

In [350]:
# %cd '/content/drive/MyDrive/TCC_Leonardo_Zago/KGen_master'
# !python3 common/stanfordcorenlp/server.py -c

Run KGen-Master feature to extract triples from unstructured reviews

In [351]:
# %cd '/content/drive/MyDrive/TCC_Leonardo_Zago/KGen_master'
# !python3 pipeline.py review_Gift_Cards.txt -p senna -s -ng

In [352]:
# os.chdir('/content/drive/MyDrive/TCC_Leonardo_Zago/KGen_master')
# print(os.listdir('./'))
# for item in os.listdir(unstructured_reviews_dir):
#   if '.txt' in item and 'review_' in item:
#     try:
#       command = 'python pipeline.py {} -p senna -s -ng'.format(item)
#       print(command)
#       process = subprocess.run(command, shell=True)
#       # process.wait()
#       print(process.returncode)
#     except Exception as e:
#       print(e)
#       break

Stop CoreNLP Server

In [353]:
# %cd '/content/drive/MyDrive/TCC_Leonardo_Zago/KGen_master'
# !python3 common/stanfordcorenlp/server.py -k

## Convert KGen generated Triples to DBLP data

Moving extracted triples to data directory

In [354]:
data_dir_triples = data_samples_dir + '/triples'

for txt in os.listdir(kgen_samples_dir):
  if '_preprocessed_triples.txt' in txt:
    shutil.copyfile(os.path.join(kgen_samples_dir, txt), os.path.join(data_dir_triples,txt))

Cleansing data from triples extraction

In [355]:
triples_df = pd.read_csv(
    os.path.join(data_dir_triples, 'review_Video_Games_test_1115_preprocessed_triples.txt'),
    sep='\t',
    header=None,
    names=['vertex', 'source', 'edge', 'target'],
)

triples_df = triples_df.dropna()
# triples_df = triples_df.drop(triples_df.index[len(triples_df['target'].str.split(expand=True)) = 1], inplace = True)
source_list = list(pd.concat([triples_df['source'], triples_df['target']]).drop_duplicates())

triples_df['source_index'] = triples_df.apply(lambda row: source_list.index(row['source']), axis=1)
triples_df['target_index'] = triples_df.apply(lambda row: source_list.index(row['target']), axis=1)
triples_df = triples_df[['source_index', 'target_index', 'vertex', 'source', 'edge', 'target']].sort_values('source_index').reset_index(drop=True)

display(triples_df.head())

Unnamed: 0,source_index,target_index,vertex,source,edge,target
0,0,20,s2236,it nhl 14 with bergeron on the box,rdf:subject,it
1,0,4267,s2236,it nhl 14 with bergeron on the box,rdf:predicate,nhl
2,0,4268,s2236,it nhl 14 with bergeron on the box,rdf:object,14
3,0,4269,s2236,it nhl 14 with bergeron on the box,local:AM-LOC,box
4,0,4270,s2236,it nhl 14 with bergeron on the box,local:AM-MNR,bergeron


Get number of nodes in KGen graph

In [356]:
n_nodes = len(source_list)
n_nodes

6086

Applying CountVectorizer() to convert text sources to a matrix of token counts

In [357]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(source_list) # Ao invés de source_list passar uma lista com todas as reviews selecionadas após o tratamento
vectorizer.get_feature_names_out()

vectors = X.toarray()

feature_names = ['w_{}'.format(ii) for ii in range(len(vectors[0]))]
node_data = pd.DataFrame(vectors, columns=feature_names)

display(node_data.head())

Unnamed: 0,w_0,w_1,w_2,w_3,w_4,w_5,w_6,w_7,w_8,w_9,...,w_2190,w_2191,w_2192,w_2193,w_2194,w_2195,w_2196,w_2197,w_2198,w_2199
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Getting edgelist with the relations between text sources and targets

In [358]:
edgelist = triples_df[['source_index', 'target_index']].rename(columns={'source_index': 'source', 'target_index': 'target'})
edgelist['label'] = 'cites'  # set the edge type

print(edgelist.shape)
display(edgelist.head())

(11806, 3)


Unnamed: 0,source,target,label
0,0,20,cites
1,0,4267,cites
2,0,4268,cites
3,0,4269,cites
4,0,4270,cites



## Link prediction via inductive node representations with attri2vec

This demo notebook demonstrates how to perform link prediction for out-of-sample nodes through learning node representations inductively with attri2vec [1]. The implementation uses the stellargraph components.

<a name='refs'></a>
**References:** 

[1] [Attributed Network Embedding via Subspace Discovery](https://link.springer.com/article/10.1007/s10618-019-00650-2). D. Zhang, Y. Jie, X. Zhu and C. Zhang. Data Mining and Knowledge Discovery, 2019. 

### attri2vec

attri2vec learns node representations by performing a linear/non-linear mapping on node content attributes. To make the learned node representations respect structural similarity, [DeepWalk](https://dl.acm.org/citation.cfm?id=2623732)/[Node2Vec](https://snap.stanford.edu/node2vec) learning mechanism is used to make nodes sharing similar random walk context nodes represented closely in the subspace, which is achieved by maximizing the occurrence probability of context nodes conditioned on the representation of the target nodes. 

In this demo, we first train the attri2vec model on the in-sample subgraph and obtain a mapping function from node attributes to node representations, then apply the mapping function to the content attributes of out-of-sample nodes and obtain the representations of out-of-sample nodes. We evaluate the quality of inferred out-of-sample node representations by using it to predict the links of out-of-sample nodes.

### Loading DBLP network data

Construct the whole graph from edge list.

In [359]:
G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr='label')

Specify node types.

In [360]:
nx.set_node_attributes(G_all_nx, 'paper', 'label')

Get node features.

In [361]:
all_node_features = node_data[feature_names]

Create the Stellargraph with node features.

In [362]:
G_all = sg.StellarGraph.from_networkx(G_all_nx, node_features=all_node_features)

In [363]:
print(G_all.info())

StellarGraph: Undirected multigraph
 Nodes: 6086, Edges: 9406

 Node types:
  paper: [6086]
    Features: float32 vector, length 2200
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [9406]
        Weights: all 1 (default)
        Features: none


### Get DBLP Subgraph 
#### with papers published before a threshold year

Get the edge list connecting in-sample nodes.

In [364]:
subgraph_edgelist = edgelist.sample(frac=0.3).reset_index(drop=True) # Get a subgraph from an edgelist sample
display(subgraph_edgelist.head())

Unnamed: 0,source,target,label
0,793,4319,cites
1,554,4675,cites
2,3548,3547,cites
3,4123,4976,cites
4,1158,1159,cites


Construct the network from the selected edge list.

In [365]:
G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist, edge_attr='label')

Specify node types.

In [366]:
nx.set_node_attributes(G_sub_nx, 'paper', 'label')

Get the ids of the nodes in the selected subgraph.

In [367]:
subgraph_node_ids = sorted(list(G_sub_nx.nodes))

Get the node features of the selected subgraph.

In [368]:
subgraph_node_features = node_data[feature_names].reindex(subgraph_node_ids)

Create the Stellargraph with node features.

In [369]:
G_sub = sg.StellarGraph.from_networkx(G_sub_nx, node_features=subgraph_node_features)

In [370]:
print(G_sub.info())

StellarGraph: Undirected multigraph
 Nodes: 3566, Edges: 3126

 Node types:
  paper: [3566]
    Features: float32 vector, length 2200
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [3126]
        Weights: all 1 (default)
        Features: none


### Train attri2vec on the DBLP Subgraph

Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk.

In [371]:
nodes = list(G_sub.nodes())
number_of_walks = 2
length = 5

Create the UnsupervisedSampler instance with the relevant parameters passed to it.

In [372]:
unsupervised_samples = UnsupervisedSampler(
    G_sub, nodes=nodes, length=length, number_of_walks=number_of_walks
)

Set the batch size and the number of epochs. 

In [373]:
batch_size = 50
epochs = 6

Define an attri2vec training generator, which generates a batch of (feature of target node, index of context node, label of node pair) pairs per iteration.

In [374]:
generator = Attri2VecLinkGenerator(G_sub, batch_size)

Building the model: a 1-hidden-layer node representation ('input embedding') of the `target` node and the parameter vector ('output embedding') for predicting the existence of `context node` for each `(target context)` pair, with a link classification layer performed on the dot product of the 'input embedding' of the `target` node and the 'output embedding' of the `context` node.

attri2vec part of the model, with a 128-dimension hidden layer, no bias term and no normalization. (Normalization can be set to 'l2'). 

In [375]:
layer_sizes = [128]
attri2vec = Attri2Vec(
    layer_sizes=layer_sizes, generator=generator, bias=False, normalize=None
)

In [376]:
# Build the model and expose input and output sockets of attri2vec, for node pair inputs:
x_inp, x_out = attri2vec.in_out_tensors()

Use the link_classification function to generate the prediction, with the `ip` edge embedding generation method and the `sigmoid` activation, which actually performs the dot product of the 'input embedding' of the target node and the 'output embedding' of the context node followed by a sigmoid activation. 

In [377]:
prediction = link_classification(
    output_dim=1, output_act='sigmoid', edge_embedding_method='ip'
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


Stack the attri2vec encoder and prediction layer into a Keras model, and specify the loss.

In [378]:
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-2),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

  super(Adam, self).__init__(name, **kwargs)


Train the model.

In [379]:
history = model.fit(
    generator.flow(unsupervised_samples),
    epochs=epochs,
    verbose=2,
    use_multiprocessing=False,
    workers=1,
    shuffle=True,
)

Epoch 1/6
1142/1142 - 13s - loss: 0.9514 - binary_accuracy: 0.5114 - 13s/epoch - 12ms/step
Epoch 2/6
1142/1142 - 10s - loss: 0.7408 - binary_accuracy: 0.5955 - 10s/epoch - 9ms/step
Epoch 3/6
1142/1142 - 10s - loss: 0.5110 - binary_accuracy: 0.7609 - 10s/epoch - 9ms/step
Epoch 4/6
1142/1142 - 10s - loss: 0.3072 - binary_accuracy: 0.8751 - 10s/epoch - 9ms/step
Epoch 5/6
1142/1142 - 12s - loss: 0.1910 - binary_accuracy: 0.9335 - 12s/epoch - 10ms/step
Epoch 6/6
1142/1142 - 10s - loss: 0.1346 - binary_accuracy: 0.9553 - 10s/epoch - 9ms/step


### Predicting links of out-of-sample nodes with the learned attri2vec model

Build the node based model for predicting node representations from node content attributes with the learned parameters. Below a Keras model is constructed, with `x_inp[0]` as input and `x_out[0]` as output. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier.

In [380]:
x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

Get the node embeddings, for both in-sample and out-of-sample nodes, by applying the learned mapping function to node content features.

In [381]:
node_ids = node_data.index
node_gen = Attri2VecNodeGenerator(G_all, batch_size).flow(node_ids)
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)



Get the positive and negative edges for in-sample nodes and out-of-sample nodes. The edges of the in-sample nodes only include the edges between in-sample nodes, and the edges of out-of-sample nodes are referred to all the edges linked to out-of-sample nodes, including the edges connecting in-sample and out-of-sample edges.

In [382]:
in_sample_edges = []
out_of_sample_edges = []

in_sample_edges_df = subgraph_edgelist
out_of_sample_edges_df = pd.concat([edgelist, in_sample_edges_df]).drop_duplicates(keep=False).reset_index(drop=True)

for ii in range(len(in_sample_edges_df)):
    source_index = in_sample_edges_df['source'][ii]
    target_index = in_sample_edges_df['target'][ii]
    if source_index > target_index:  # neglect edge direction for the undirected graph
        continue
    in_sample_edges.append([source_index, target_index, 1])  # get the positive edge
    negative_target_index = unsupervised_samples.np_random.choice(
        node_data.index.tolist(), size=1
    )  # generate negative node
    in_sample_edges.append(
        [source_index, negative_target_index[0], 0]
    )  # get the negative edge

for iii in range(len(out_of_sample_edges_df)):
    source_index = out_of_sample_edges_df['source'][iii]
    target_index = out_of_sample_edges_df['target'][iii]
    if source_index > target_index:  # neglect edge direction for the undirected graph
        continue
    out_of_sample_edges.append([source_index, target_index, 1])  # get the positive edge
    negative_target_index = unsupervised_samples.np_random.choice(
        node_data.index.tolist(), size=1
    )  # generate negative node
    out_of_sample_edges.append(
        [source_index, negative_target_index[0], 0]
    )  # get the negative edge

in_sample_edges = np.array(in_sample_edges)
out_of_sample_edges = np.array(out_of_sample_edges)



Construct the edge features from the learned node representations with l2 normed difference, where edge features are the element-wise square of the difference between the embeddings of two head nodes. Other strategy like element-wise product can also be used to construct edge features.

In [383]:
in_sample_edge_feat_from_emb = (
    node_embeddings[in_sample_edges[:, 0]] - node_embeddings[in_sample_edges[:, 1]]
) ** 2
out_of_sample_edge_feat_from_emb = (
    node_embeddings[out_of_sample_edges[:, 0]]
    - node_embeddings[out_of_sample_edges[:, 1]]
) ** 2

Train the Logistic Regression classifier from in-sample edges with the edge features constructed from attri2vec embeddings. 

In [384]:
clf_edge_pred_from_emb = LogisticRegression(
    verbose=0, solver='lbfgs', multi_class='auto', max_iter=500
)
clf_edge_pred_from_emb.fit(in_sample_edge_feat_from_emb, in_sample_edges[:, 2])

LogisticRegression(max_iter=500)

Predict the edge existence probability with the trained Logistic Regression classifier.

In [385]:
edge_pred_from_emb = clf_edge_pred_from_emb.predict_proba(
    out_of_sample_edge_feat_from_emb
)

Get the positive class index of `edge_pred_from_emb`.

In [387]:
if clf_edge_pred_from_emb.classes_[0] == 1:
    positive_class_index = 0
else:
    positive_class_index = 1

Evaluate the AUC score for the prediction with attri2vec embeddings.

In [388]:
roc_auc_score(out_of_sample_edges[:, 2], edge_pred_from_emb[:, positive_class_index])

0.8624479968537587

As the baseline, we also investigate the performance of node content features in predicting the edges of out-of-sample nodes. Firstly, we construct edge features from node content features with the same strategy.

In [389]:
in_sample_edge_rep_from_feat = (
    node_data[feature_names].values[in_sample_edges[:, 0]]
    - node_data[feature_names].values[in_sample_edges[:, 1]]
) ** 2
out_of_sample_edge_rep_from_feat = (
    node_data[feature_names].values[out_of_sample_edges[:, 0]]
    - node_data[feature_names].values[out_of_sample_edges[:, 1]]
) ** 2

Then we train the Logistic Regression classifier from in-sample edges with the edge features constructed from node content features.

In [390]:
clf_edge_pred_from_feat = LogisticRegression(
    verbose=0, solver='lbfgs', multi_class='auto', max_iter=500
)
clf_edge_pred_from_feat.fit(in_sample_edge_rep_from_feat, in_sample_edges[:, 2])

LogisticRegression(max_iter=500)

Predict the edge existence probability with the trained Logistic Regression classifier.

In [391]:
edge_pred_from_feat = clf_edge_pred_from_feat.predict_proba(
    out_of_sample_edge_rep_from_feat
)

Get positive class index of `clf_edge_pred_from_feat`.

In [392]:
if clf_edge_pred_from_feat.classes_[0] == 1:
    positive_class_index = 0
else:
    positive_class_index = 1

Evaluate the AUC score for the prediction with node content features.

In [393]:
roc_auc_score(out_of_sample_edges[:, 2], edge_pred_from_feat[:, positive_class_index])

0.8384079240563888

attri2vec can inductively infer the representations of out-of-sample nodes from their content attributes. As the inferred node representations well capture both structure and node content information, they perform much better than node content features in predicting the links of out-of-sample nodes.

Generate node embeddings for each review

In [394]:
unstructured_reviews_df = pd.read_table(
                          os.path.join(unstructured_reviews_dir, 'review_Video_Games_test_1115.txt'),
                          sep='\n',
                          header=None,
                          names=['reviewText']
)
unstructured_reviews_df['overall'] = 1
unstructured_reviews_df.iloc[0:400]['overall'] = 5

# unstructured_reviews_df = filtered_reviews_df[['overall', 'reviewText']].reset_index(drop=True)

unstructured_reviews_df['nodes'] = unstructured_reviews_df.apply(lambda x: [node_index for node_index in range(len(source_list)) if source_list[node_index] in x['reviewText'].casefold().replace(',', '')], axis=1)
unstructured_reviews_df['embeddings'] = unstructured_reviews_df.apply(lambda x: np.mean(node_embeddings[x['nodes']], axis=0), axis=1)

# Dropping reviews without node match
unstructured_reviews_df = unstructured_reviews_df.loc[unstructured_reviews_df['nodes'].str.len() > 0].reset_index(drop=True)

unstructured_reviews_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,reviewText,overall,nodes,embeddings
0,My son loves it.,5,"[20, 528, 593, 3690, 3923, 4433, 4557, 4620, 5...","[0.21755254, 0.28294075, 0.27559268, 0.2819338..."
1,Great game. Too bad its only on ps4.,5,"[20, 593, 733, 4319, 4433, 4477, 4497, 4550, 4...","[0.30226773, 0.431578, 0.40285388, 0.37400818,..."
2,"Must have. Extends range of motion, meaning yo...",5,"[77, 518, 593, 1531, 3246, 3512, 4261, 4262, 4...","[0.25158864, 0.41790769, 0.30023387, 0.2641289..."
3,"Ive been playing for a few hours, and its quit...",5,"[20, 593, 686, 1531, 1607, 3128, 3512, 4254, 4...","[0.24904002, 0.37419638, 0.2793455, 0.22546181..."
4,One of the best gifts ibought my son. He loves...,5,"[20, 528, 593, 1791, 1910, 3512, 3690, 4250, 4...","[0.18062823, 0.46781963, 0.3110857, 0.22368644..."
...,...,...,...,...
795,S didnt work u lucky i aknt send it back. Sold...,1,"[5, 8, 9, 20, 528, 593, 1531, 3512, 4274, 4278...","[0.20780891, 0.40604806, 0.31774, 0.3073579, 0..."
796,Loses signal often. Switched back to wired mouse.,1,"[1, 2, 4, 20, 593, 3512, 4271, 4272, 4273, 427...","[0.30816975, 0.38788965, 0.3783503, 0.36477047..."
797,Its nhl 14 with bergeron on the box.,1,"[20, 593, 3512, 4267, 4268, 4269, 4270, 4433, ...","[0.29358768, 0.42125183, 0.45409605, 0.2781963..."
798,"Bad, reply to customer service no one anwswer.",1,"[593, 2901, 4322, 4433, 4497, 4598, 4620, 4994...","[0.40007222, 0.46482074, 0.43260717, 0.4206809..."


Get average number of words of each review

In [None]:
def n_words(text):
  aux_list = text.replace('.', '').split(' ')
  return len(aux_list)

avg_words = np.mean(unstructured_reviews_df.apply(lambda x: n_words(x['reviewText']), axis=1))
avg_words

## Apply SVM classifier to selected reviews

Establish train and test sets

In [397]:
dict_train = {}
dict_test = {}
for i in range(10):
  dict_train[i] = unstructured_reviews_df.sample(frac=0.75)
  dict_test[i] = pd.concat([unstructured_reviews_df, dict_train[i]]).drop_duplicates(subset='reviewText', keep=False)

Initialize model for multiple tests

In [398]:
model_eval = pd.DataFrame(columns=['Model', 'Macro f1', 'Macro precision', 'Macro recall', 'Weighted f1', 'Weighted precision', 'Weighted recall'])

for i in range(10):
  X_train = pd.DataFrame(dict_train[i]['embeddings'].to_list())
  X_test = pd.DataFrame(dict_test[i]['embeddings'].to_list())
  y_train = dict_train[i]['overall']
  y_train = [0 if i==1 else 1 for i in y_train]
  y_test = dict_test[i]['overall']
  y_test = [0 if i==1 else 1 for i in y_test]

  clf = make_pipeline(StandardScaler(), SVC(C=3, kernel='rbf', gamma='auto'))
  clf.fit(X_train, y_train)
  y_pred_train = clf.predict(X_train)
  y_pred_test = clf.predict(X_test)

  model_eval = model_eval.append({'Model': i,
                                    'Macro f1': f1_score(y_test, y_pred_test, average='macro'), 
                                    'Macro precision': precision_score(y_test, y_pred_test, average='macro'), 
                                    'Macro recall': recall_score(y_test, y_pred_test, average='macro'), 
                                    'Weighted f1': f1_score(y_test, y_pred_test, average='weighted'), 
                                    'Weighted precision': precision_score(y_test, y_pred_test, average='weighted'), 
                                    'Weighted recall': recall_score(y_test, y_pred_test, average='weighted')
                                    }, ignore_index=True)
  
model_eval

Unnamed: 0,Model,Macro f1,Macro precision,Macro recall,Weighted f1,Weighted precision,Weighted recall
0,0.0,0.747243,0.747168,0.747471,0.747552,0.747782,0.747475
1,1.0,0.778203,0.782742,0.777644,0.779002,0.781958,0.78
2,2.0,0.797463,0.797241,0.797849,0.798083,0.798349,0.79798
3,3.0,0.758642,0.758642,0.758642,0.758794,0.758794,0.758794
4,4.0,0.743485,0.743578,0.74343,0.74368,0.743678,0.743719
5,5.0,0.8012,0.8032,0.800246,0.802549,0.803101,0.80303
6,6.0,0.788939,0.789569,0.789745,0.788977,0.790444,0.788945
7,7.0,0.774994,0.775548,0.77541,0.774972,0.775913,0.775
8,8.0,0.778217,0.778238,0.780734,0.779387,0.782396,0.778894
9,9.0,0.778755,0.778975,0.778693,0.778839,0.77894,0.778894


In [399]:
model_eval.describe()

Unnamed: 0,Model,Macro f1,Macro precision,Macro recall,Weighted f1,Weighted precision,Weighted recall
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.774714,0.77549,0.774986,0.775183,0.776136,0.775273
std,3.02765,0.01958,0.020044,0.019586,0.019815,0.020178,0.019893
min,0.0,0.743485,0.743578,0.74343,0.74368,0.743678,0.743719
25%,2.25,0.76273,0.762868,0.762834,0.762838,0.763074,0.762845
50%,4.5,0.77821,0.778606,0.778168,0.77892,0.780449,0.778894
75%,6.75,0.786393,0.787863,0.787492,0.786579,0.788432,0.786709
max,9.0,0.8012,0.8032,0.800246,0.802549,0.803101,0.80303


# Model analysis with bag-of-words

Applying CountVectorizer() to get features vectorization

In [402]:
docs = unstructured_reviews_df['reviewText'].to_list()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)
vectorizer.get_feature_names_out()

vectors = X.toarray()

feature_names = ['w_{}'.format(ii) for ii in range(len(vectors[0]))]
bow_node_data = pd.DataFrame(vectors, columns=feature_names)

# bow_node_data[['reviewText', 'overall']] = unstructured_reviews_df[['reviewText', 'overall']]

display(bow_node_data.head())

Unnamed: 0,w_0,w_1,w_2,w_3,w_4,w_5,w_6,w_7,w_8,w_9,...,w_2529,w_2530,w_2531,w_2532,w_2533,w_2534,w_2535,w_2536,w_2537,w_2538
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##Apply SVM classifier to selected reviews

Initialize model for multiple tests

In [406]:
model_eval = pd.DataFrame(columns=['Model', 'Macro f1', 'Macro precision', 'Macro recall', 'Weighted f1', 'Weighted precision', 'Weighted recall'])

for i in range(10):

  X_train = bow_node_data.iloc[dict_train[i].index]
  X_test = bow_node_data.iloc[dict_test[i].index]
  y_train = dict_train[i]['overall']
  y_train = [0 if i==1 else 1 for i in y_train]
  y_test = dict_test[i]['overall']
  y_test = [0 if i==1 else 1 for i in y_test]

  clf = make_pipeline(StandardScaler(), SVC(C=3, kernel='rbf', gamma='auto'))
  clf.fit(X_train, y_train)
  y_pred_train = clf.predict(X_train)
  y_pred_test = clf.predict(X_test)

  model_eval = model_eval.append({'Model': i,
                                    'Macro f1': f1_score(y_test, y_pred_test, average='macro'), 
                                    'Macro precision': precision_score(y_test, y_pred_test, average='macro'), 
                                    'Macro recall': recall_score(y_test, y_pred_test, average='macro'), 
                                    'Weighted f1': f1_score(y_test, y_pred_test, average='weighted'), 
                                    'Weighted precision': precision_score(y_test, y_pred_test, average='weighted'), 
                                    'Weighted recall': recall_score(y_test, y_pred_test, average='weighted')
                                    }, ignore_index=True)
  
model_eval

Unnamed: 0,Model,Macro f1,Macro precision,Macro recall,Weighted f1,Weighted precision,Weighted recall
0,0.0,0.822866,0.823036,0.822739,0.823192,0.823193,0.823232
1,1.0,0.783784,0.799616,0.789263,0.783135,0.802693,0.785
2,2.0,0.817512,0.834758,0.824885,0.816841,0.840283,0.818182
3,3.0,0.843653,0.846177,0.843238,0.843889,0.845663,0.844221
4,4.0,0.788512,0.794328,0.79058,0.788272,0.795499,0.788945
5,5.0,0.808081,0.812141,0.812141,0.808081,0.816201,0.808081
6,6.0,0.803703,0.803846,0.8036,0.80398,0.803981,0.80402
7,7.0,0.76238,0.78223,0.767407,0.761881,0.783728,0.765
8,8.0,0.738542,0.743252,0.737003,0.742055,0.743491,0.743719
9,9.0,0.823836,0.827829,0.824914,0.823729,0.828414,0.824121


In [407]:
model_eval.describe()

Unnamed: 0,Model,Macro f1,Macro precision,Macro recall,Weighted f1,Weighted precision,Weighted recall
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.799287,0.806721,0.801577,0.799506,0.808315,0.800452
std,3.02765,0.03156,0.029673,0.031618,0.030952,0.030047,0.030234
min,0.0,0.738542,0.743252,0.737003,0.742055,0.743491,0.743719
25%,2.25,0.784966,0.79565,0.789592,0.784419,0.797297,0.785986
50%,4.5,0.805892,0.807994,0.807871,0.806031,0.810091,0.80605
75%,6.75,0.821528,0.826631,0.824348,0.821604,0.827109,0.82197
max,9.0,0.843653,0.846177,0.843238,0.843889,0.845663,0.844221
