## Gabriel's Preparation Code for TPU

In [14]:
# If run type is TPU, run this cell 
import os
assert os.environ['COLAB_TPU_ADDR'], 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

try:
  # imports the torch_xla package
  import torch_xla
  import torch_xla.core.xla_model as xm

  device = xm.xla_device()

except:
  VERSION = "20200325"  #@param ["1.5" , "20200325", "nightly"]
  !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
  !python pytorch-xla-env-setup.py --version $VERSION

  # imports pytorch
  import torch

  # imports the torch_xla package
  import torch_xla
  import torch_xla.core.xla_model as xm

  device = xm.xla_device()

In [15]:
!pip install transformers



In [16]:
import numpy as np
import pandas as pd
import re
import time
import datetime
from html.parser import HTMLParser
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import TensorDataset, DataLoader
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

## Load and Cleaning the Data

In [17]:
df = pd.read_csv ('/content/drive/My Drive/ml/final_merged_data_raw.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0,Topic Title,Category,Author,Leading Comment,Other Comments
0,About the Product Sourcing Category,Product Sourcing,Trent-Admin,Have questions about sourcing products? This i...,[]
1,Price Checker 2 - Competitor storefront extrac...,Product Sourcing,MoniqueAndKerry,Hi! We are new to the forum and are going thro...,['Yes you will need the paid version. Options...
2,Top 10 Tips to Grow your Business Rapidly,Product Sourcing,Pradimna_Kumar,As I am working in Amazon as a seller from las...,[]
3,Virtual Assistants,Product Sourcing,JimW_PB,"Does anyone have a VA they recommend, have use...",['@McDavid Most don’t want to share any stella...
4,Can you sell branded products on Amazon uk,Product Sourcing,Mitch,Can you sell branded products on Amazon Uk or ...,['You can sell on Amazon UK as well. It could...


In [19]:
def text_cleaning(text):
    # converting HTML character codes to ASCII code
    parser = HTMLParser()
    text = parser.unescape(text)

    text = re.sub(r'<[^>]+>', '', text)  # removing HTML tags
    text = re.sub(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)', '', text)  # removing hash-tags
    text = re.sub('\n', ' ', text)  # remove new line
    text = re.sub('@', '', text)  # remove @ sign
    #text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '',text)  # removing URLs
    text = re.sub(r'(?:[\ufffd]+)', '', text)  # removing special characters
    text = text.lower()

    return text

In [20]:
print(df.isna().sum())
print('')

nan_row = np.where(df.isna())[0][0]
print(df.iloc[nan_row,:])

Topic Title        0
Category           0
Author             0
Leading Comment    1
Other Comments     0
dtype: int64

Topic Title        Israel Cohen’s Success Story
Category                            Misc Topics
Author                              Trent-Admin
Leading Comment                             NaN
Other Comments                               []
Name: 245, dtype: object


In [21]:
df.dropna(axis=0,subset=['Leading Comment'],inplace = True)

In [22]:
comments = df['Leading Comment'].apply(lambda x:text_cleaning(x))

In [23]:
comments

0       have questions about sourcing products? this i...
1       hi! we are new to the forum and are going thro...
2       as i am working in amazon as a seller from las...
3       does anyone have a va they recommend, have use...
4       can you sell branded products on amazon uk or ...
                              ...                        
8834    ok, the big day–i passed the sfp trial period....
8835    does anyone  how to respond to a request from ...
8836    i’ve been trying to fix an issue concerning li...
8837    hello, i’m an e-commerce specialist, and i am ...
8838    when i enter contact seller support, my chrome...
Name: Leading Comment, Length: 8838, dtype: object

In [24]:
def format_time(elapsed):
    # display the elapsed time when loading the data into the BERT model
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [25]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

## Preprocessing before using BERT

> Tokenization + Padding + Masking



In [None]:
label_encoder = LabelEncoder()

# Tokenize all of the sentences and map the tokens to thier word IDs.
inputs = []
attention_masks = []

# For every comment
for comment in comments:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.

    encoded_dict = tokenizer.encode_plus(
                        comment,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    inputs.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors and load them to GPU/TPU
inputs = torch.cat(inputs, dim=0)#.to(device)
attention_masks = torch.cat(attention_masks, dim=0)#.to(device)

# Encoding the labels and convert them to tensor 
labels = label_encoder.fit_transform(df['Category'])
labels = torch.tensor(labels)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer y

In [27]:
batch_size = 16

dataset = TensorDataset(inputs,attention_masks)
dataloader = DataLoader(dataset,batch_size = batch_size)

time_start = time.time()

features_list = []

for step, batch in enumerate(dataloader):
  if step % 10 == 0 and not step == 0:
    elapsed = format_time(time.time()-time_start)
    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))
  b_inpus = batch[0].to(device)
  b_attention_masks = batch[1].to(device)

  with torch.no_grad():
    last_hidden_states = model(b_inpus,attention_mask=b_attention_masks)

  last_hidden_states_cpu = last_hidden_states[0][:,0,:].cpu().numpy()
  features_list.append(last_hidden_states_cpu)

  Batch    10  of    553.    Elapsed: 0:00:04.
  Batch    20  of    553.    Elapsed: 0:00:05.
  Batch    30  of    553.    Elapsed: 0:00:06.
  Batch    40  of    553.    Elapsed: 0:00:07.
  Batch    50  of    553.    Elapsed: 0:00:08.
  Batch    60  of    553.    Elapsed: 0:00:08.
  Batch    70  of    553.    Elapsed: 0:00:09.
  Batch    80  of    553.    Elapsed: 0:00:10.
  Batch    90  of    553.    Elapsed: 0:00:11.
  Batch   100  of    553.    Elapsed: 0:00:11.
  Batch   110  of    553.    Elapsed: 0:00:12.
  Batch   120  of    553.    Elapsed: 0:00:13.
  Batch   130  of    553.    Elapsed: 0:00:14.
  Batch   140  of    553.    Elapsed: 0:00:15.
  Batch   150  of    553.    Elapsed: 0:00:15.
  Batch   160  of    553.    Elapsed: 0:00:16.
  Batch   170  of    553.    Elapsed: 0:00:17.
  Batch   180  of    553.    Elapsed: 0:00:18.
  Batch   190  of    553.    Elapsed: 0:00:18.
  Batch   200  of    553.    Elapsed: 0:00:19.
  Batch   210  of    553.    Elapsed: 0:00:20.
  Batch   220

In [28]:
all_features = []
for batch in features_list:
  for feature in batch:
    all_features.append(feature)
len(all_features)

8838

## Calculate Similarity

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(feature_vec_1, feature_vec_2):
  return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1)) [0][0]

In [30]:
#similarity between two posts in the amazon forum
get_cosine_similarity(all_features[0], all_features[2])

0.8823761

In [31]:
#similarity between two posts in different forum
get_cosine_similarity(all_features[0], all_features[8600])

0.8939365

In [32]:
#expected to see high similarity
get_cosine_similarity(all_features[8657], all_features[8658])

0.83528537

In [33]:
def most_similar_post(index):
  #prints out and return the the max similarity score
  #the post with the passed in index is able to achieve with another post
  #and that other post's index
  similarity = {}
  max = 0;
  for i in range(len(all_features)):
    if i != index:
      if get_cosine_similarity(all_features[index], all_features[i]) > max:
        max_i = i
        max = get_cosine_similarity(all_features[index], all_features[i])
      similarity[i]=get_cosine_similarity(all_features[index], all_features[i])
  return max, max_i

In [38]:
def most_similar_post_in_another_forum(test_forum):
  if test_forum == "amazon":
    for i in range(8679):
      max, max_i = most_similar_post(i)
      if max_i > 8578:
        return "Amazon: T"
    return "Amazon: F"
  else:
    for i in range(8679, len(all_features)):
      max, max_i = most_similar_post(i)
      if max_i > 8578:
        return "Flowster: T"
    return "Flowster: F"
most_similar_post(1)

(0.9812825, 5193)

In [43]:
#result says some posts in the Amazon forum is most similar to post in the Flowster forum instead of its own forum
most_similar_post_in_another_forum("amazon")
most_similar_post_in_another_forum("flowster")

'Flowster: T'

## Function that Prints out Similar Topics with Provided Index:

In [39]:
cosine_sim = cosine_similarity(all_features, all_features)
def recommend(idx, cosine_sim = cosine_sim):
    recommended_topics = []
    #idx = indices[indices == title].index[0]   # to get the index of the topic matching the input topic
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   # similarity scores in descending order
    top_10_indices = list(score_series.iloc[1:11].index)   # to get the indices of top 10 most similar topics
    # [1:11] to exclude 0 (index 0 is the input movie itself)
    
    for i in top_10_indices:   # to append the titles of top 10 similar topics to the recommended_topics list
        recommended_topics.append(list(df['Topic Title'])[i])
        
    return recommended_topics

In [41]:
recommend(0)

['About the Sales Channels & Marketplaces Category',
 'About the Financial Management category',
 'About the Management category',
 'About the Human Resources category',
 'About the Misc Topics category',
 'About the Software & Tools category',
 'About the Fulfillment category',
 'About the Traffic Sources category',
 'About the Amazon Specific Category',
 'Shipment fulfillment']

In [42]:
recommend(1)

['ShipWorks',
 'Can you launch LWA webpage inside the app, and not in Safari on iOS?',
 'Third Party Developer Apps',
 'Software/Service for Ratings/Review Report',
 'Has anyone used SageMailer?',
 'How do I make simple API calls (Python)?',
 'Merging 2 Amazon Accounts',
 'Help Needed With Accounting Match',
 'Multi-Channel and SHOPIFY',
 'Integration Link']