In [None]:
from google.colab import drive
from os.path import join

# Mount point for Google Drive on the runtime, and the project folder inside the drive
ROOT = '/content/drive'
PROJECT_PATH = 'My Drive/Github'

# Full runtime path of the project workspace.
# The path is only computed here; nothing is created on disk.
WORKING_PATH = join(ROOT, PROJECT_PATH)

# Attach Google Drive to the runtime at ROOT
drive.mount(ROOT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/82/25/89050e69ed53c2a3b7f8c67844b3c8339c1192612ba89a172cf85b298948/transformers-3.0.1-py3-none-any.whl (757kB)
[K     |████████████████████████████████| 757kB 8.2MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 23.2MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 40.4MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:
import math

import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup

# Pandas display configuration: no row/column/width limits, cell text capped at
# 20 characters, and frames printed without wrapping.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 20),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split, RandomSampler

# Pick the compute device: fall back to the CPU unless PyTorch can see a GPU
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of the first one
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

  import pandas.util.testing as tm


There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
# Load merged data for Amazon and Flowster forums (every column is read as a string)
df = pd.read_csv(join(WORKING_PATH, 'mlteam4/datasets/final_merged_data_augmented.csv'), dtype=str)

# Create new column that combines other columns of interest into text sequences
df['Combined Sequence'] = df['Leading Comment'] + ' ' + df['Reply Comments']

# Super complicated string processing to combine reply comments properly
#df['Reply Comments'] = df['Reply Comments'].apply(lambda x : ' '.join(x.split("', '")).replace("'", "’").strip('[]’'))

# Create new column that also includes everything
df['Extended Combined Sequence'] =  df['Title'] + ' ' + df['Leading Comment'] + ' ' + df['Post Author'] + ' ' + df['Reply Comments']

# Extract the sequence columns and the Category column as sample data and labels
filteredDF = df[['Combined Sequence', 'Extended Combined Sequence', 'Category']]

# Drop rows with a NaN in any of the three columns
# (string concatenation above yields NaN whenever one of its parts is NaN)
filteredDF = filteredDF.dropna()

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

inputIDs = []
attentionMasks = []

# Generate encodings and attention masks for every sequence.
# Only the column is needed, so iterate over it directly instead of using iterrows().
for sequence in filteredDF['Combined Sequence']:
    encodedDict = tokenizer.encode_plus(
        sequence,                       # Sentence to encode
        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
        truncation = True,              # Cut sequences longer than max_length
        max_length = 512,
        padding = 'max_length',         # Replaces the deprecated pad_to_max_length=True
        return_attention_mask = True,   # Construct attention masks
        return_tensors = 'pt',          # Return PyTorch tensors
        )

    # Add the encoded sentence to the list
    inputIDs.append(encodedDict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding)
    attentionMasks.append(encodedDict['attention_mask'])

# Concatenate the per-sequence tensors along the batch dimension
inputIDs = torch.cat(inputIDs, dim=0)
attentionMasks = torch.cat(attentionMasks, dim=0)

# Encode category text into numerical labels
labelEncoder = LabelEncoder()
labels = labelEncoder.fit_transform(filteredDF['Category'])
labels = torch.tensor(labels)

# Show the sizes only; printing the full tensors floods the cell output
print(inputIDs.shape)
print(attentionMasks.shape)
print(labels.shape)

# Cache the encodings on Drive so later cells can reload them without re-tokenizing
torch.save(inputIDs, join(WORKING_PATH, 'inputIDs.pt'))
torch.save(attentionMasks, join(WORKING_PATH, 'attentionMasks.pt'))
torch.save(labels, join(WORKING_PATH, 'labels.pt'))



(11273, 3)
Selling on Amazon                                     1840
Account Health                                        1495
Fulfillment By Amazon                                 1329
Global Selling                                         559
Groups                                                 491
Amazon Pay                                             447
Amazon Sponsored Products                              295
Amazon Marketplace Web Service (MWS)                   294
Site Feedback                                          275
US Announcements                                       259
Amazon Custom                                          219
Health,Safety,Sustainability,Security & Compliance     212
Amazon Specific                                        200
Human Resources                                        200
Product Sourcing                                       192
Login With Amazon                                      187
Software & Tools                             

KeyboardInterrupt: ignored

In [None]:
# Reload the cached encodings, masks and labels from Drive
inputIDs = torch.load('/content/drive/My Drive/Github/inputIDs.pt')
attentionMasks = torch.load('/content/drive/My Drive/Github/attentionMasks.pt')
labels = torch.load('/content/drive/My Drive/Github/labels.pt')

# Combine the training inputs into a TensorDataset.
# The active variant pairs input IDs with labels only; the attention masks
# loaded above are not part of this dataset.
#dataset = TensorDataset(inputIDs, attentionMasks, labels)
dataset = TensorDataset(inputIDs, labels)

# Create a 90-10 train-test split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
trainDataset, valDataset = random_split(dataset, [train_size, val_size])

batchSize = 32

# Number of batches needed to cover every encoded sequence once
numBatches = math.ceil(inputIDs.shape[0]/batchSize)

outputBatches = []

trainDataloader = DataLoader(
    trainDataset,
    # Select batches randomly. The sampler has to draw indices from the subset
    # being loaded: sampling over the full `dataset` produces indices past the
    # end of `trainDataset`.
    sampler = RandomSampler(trainDataset),
    batch_size = batchSize
    )

# For validation the order doesn't matter, so we'll just read them sequentially.
validationDataloader = DataLoader(
    valDataset,
    batch_size = batchSize
    )

torch.Size([9687, 512])


In [None]:
# Reload the cached input IDs, attention masks and labels from Drive, in that order
inputIDs, attentionMasks, labels = (
    torch.load(tensorPath)
    for tensorPath in (
        '/content/drive/My Drive/Github/inputIDs.pt',
        '/content/drive/My Drive/Github/attentionMasks.pt',
        '/content/drive/My Drive/Github/labels.pt',
    )
)

class EncodedDataset(Dataset):
    """Dataset that serves each sample as a dict of input IDs, attention mask and label.

    Every item is a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each holding the entry at the requested index of the
    corresponding sequence passed to the constructor.
    """

    def __init__(self, input_ids, attention_mask, labels):
        """Store the three parallel sequences.

        @param  input_ids       Indexable sequence of encoded samples
        @param  attention_mask  Indexable sequence of attention masks, one per sample
        @param  labels          Indexable sequence of labels, one per sample
        @raise  ValueError      If the three sequences do not have the same length
        """
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError (or as silently ignored trailing rows) while iterating.
        if not (len(input_ids) == len(attention_mask) == len(labels)):
            raise ValueError(
                'input_ids, attention_mask and labels must have the same length, got %d, %d and %d'
                % (len(input_ids), len(attention_mask), len(labels))
            )

        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sample at position idx as a dict."""
        return {'input_ids':self.input_ids[idx], 'attention_mask':self.attention_mask[idx], 'labels':self.labels[idx]}

dataset = EncodedDataset(inputIDs, attentionMasks, labels)

# Create a 90-10 train-test split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the global RNG so the split is the same on every run. Without a fixed
# seed, restarting the runtime between training and evaluation draws a new
# split, and the "validation" samples may include ones the model was trained on.
torch.manual_seed(42)

# Divide the dataset by randomly selecting samples.
trainDataset, valDataset = random_split(dataset, [train_size, val_size])

In [None]:
# Start from the pre-trained uncased BERT checkpoint with a 24-label classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 24,                # One output per category label
    output_hidden_states = False,   # Do not return all hidden states
    output_attentions = False,      # Do not return attention weights
    )

# Hyper-parameters and output locations for the Trainer
trainingArgs = TrainingArguments(
    output_dir='/content/drive/My Drive/Github',
    logging_dir='/content/drive/My Drive/Github/logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,               # Warmup steps for the learning rate scheduler
    weight_decay=0.01,
)

# Fine-tune on the training split; the validation split is registered for evaluation
trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    eval_dataset=valDataset
)

trainer.train()

output_dir = '/content/drive/My Drive/Github/BERTmodel/'

print("Saving model to %s" % output_dir)

# Persist the trained model with `save_pretrained()` so it can be reloaded with
# `from_pretrained()`. Only the model is written here; this cell does not save the tokenizer.
# If the model is wrapped (distributed/parallel training), save the wrapped module instead.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)

# Disabled feature-extraction loop, kept as a bare string literal rather than as
# running code. Because the string is the last expression of the cell, the
# notebook echoes it back as the cell's output.
# NOTE(review): the loop iterates over `dataloader`, which no cell visible in
# this notebook defines, and the `features` array it would build is what the
# logistic-regression cell further down reads — confirm before re-enabling.
'''
i = 1
for batch in dataloader:
    print('Batch ', i, ' of ', numBatches)
    i += 1

    inputIDsBatch = batch[0].cuda()
    attentionMasksBatch = batch[1].cuda()
    
    with torch.no_grad():
        finalHiddenStates = model(inputIDsBatch, attention_mask=attentionMasksBatch)
    
    #output = finalHiddenStates[0][:,0,:].cpu()
    outputBatches.append(finalHiddenStates[0][:,0,:].cpu())

finalHiddenStates = torch.cat(outputBatches)

features = finalHiddenStates.cpu().numpy()

print(features.shape)
print(features)
'''

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=545.0, style=ProgressStyle(description_wi…






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=545.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=545.0, style=ProgressStyle(description_wi…



Saving model to /content/drive/My Drive/Github/BERTmodel/


"\ni = 1\nfor batch in dataloader:\n    print('Batch ', i, ' of ', numBatches)\n    i += 1\n\n    inputIDsBatch = batch[0].cuda()\n    attentionMasksBatch = batch[1].cuda()\n    \n    with torch.no_grad():\n        finalHiddenStates = model(inputIDsBatch, attention_mask=attentionMasksBatch)\n    \n    #output = finalHiddenStates[0][:,0,:].cpu()\n    outputBatches.append(finalHiddenStates[0][:,0,:].cpu())\n\nfinalHiddenStates = torch.cat(outputBatches)\n\nfeatures = finalHiddenStates.cpu().numpy()\n\nprint(features.shape)\nprint(features)\n"

In [None]:
#model.from_pretrained('/content/drive/My Drive/Github/pytorch_model')
# Reload the fine-tuned classifier that the training cell saved to Drive
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/My Drive/Github/BERTmodel/',
    num_labels = 24,                # Same number of category labels as during training
    output_hidden_states = False,   # Do not return all hidden states
    output_attentions = False,      # Do not return attention weights
    )

# Same settings as the training cell, except for the larger batch size of 32
trainingArgs = TrainingArguments(
    output_dir='/content/drive/My Drive/Github',
    logging_dir='/content/drive/My Drive/Github/logs',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,               # Warmup steps for the learning rate scheduler
    weight_decay=0.01,
)

# A fresh Trainer around the reloaded model; only its evaluation pass is used here
trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    eval_dataset=valDataset
)

trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=31.0, style=ProgressStyle(description_wi…




{'eval_loss': 0.6815143515986781}

In [None]:
# Borrowing Gabriel's validation code since I was too tired to write my own :)

import numpy as np

# Read the validation split sequentially in batches of 16
validationDataloader = DataLoader(
    valDataset,
    batch_size = 16
    )

# Make sure the model lives on the same device the batches are sent to,
# then switch it to evaluation mode
model.to(device)
model.eval()

# Tracking variables: one array of logits / true labels per batch
pred_labels = []
true_labels = []

for batch in validationDataloader:
    b_inputs = batch['input_ids'].to(device)
    b_attention_masks = batch['attention_mask'].to(device)

    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_inputs,
                        attention_mask=b_attention_masks)

    # The model is called without labels, and the first output is used as the logits
    logits = outputs[0]

    # Store predictions and true labels as CPU numpy arrays.
    # The true labels are only collected, so they are never sent to the device,
    # and they are not bound to the name `labels`: that name holds the
    # notebook-level label tensor and must not be overwritten by a single batch.
    pred_labels.append(logits.detach().cpu().numpy())
    true_labels.append(batch['labels'].cpu().numpy())

# Stack the per-batch logits and take the highest-scoring class for each sample
flat_pred_labels = np.argmax(np.concatenate(pred_labels, axis=0), axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]

print(classification_report(flat_true_labels, flat_pred_labels))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       134
           1       0.80      0.64      0.71        25
           2       0.83      0.75      0.79        32
           3       0.80      0.76      0.78        49
           4       0.89      1.00      0.94        25
           5       0.84      0.87      0.86        31
           6       1.00      1.00      1.00        14
           7       1.00      1.00      1.00        17
           8       0.95      1.00      0.98        20
           9       0.77      0.81      0.79       142
          10       0.64      0.71      0.67        55
          11       0.93      0.74      0.82        53
          12       0.92      0.92      0.92        24
          13       1.00      1.00      1.00        15
          14       0.79      0.69      0.73        16
          15       1.00      1.00      1.00        15
          16       1.00      1.00      1.00         9
          17       1.00    

In [None]:
# NOTE(review): `features` is not produced by any active cell in this notebook;
# it comes from the disabled feature-extraction loop in the training cell, so
# this cell fails on a fresh kernel until that loop is restored.

# Take exactly one category per feature row, by position. A label-based
# `.loc[0:n]` slice is inclusive and follows index labels, which have gaps after
# dropna(), so it does not reliably return `features.shape[0]` rows.
# Assumes the rows of `features` follow the row order of filteredDF — TODO confirm.
labels = filteredDF['Category'].iloc[:features.shape[0]]

X_train, X_test, y_train, y_test = train_test_split(features, labels)

logisticClassifier = LogisticRegression()

'''
# Train classifier and compute validation accuracy for each fold
CV = 5
#cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []

model_name = logisticClassifier.__class__.__name__
accuracies = cross_val_score(logisticClassifier, X_train, y_train, scoring='accuracy', cv=CV)
for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#Caculating the mean of all models
print(cv_df.groupby('model_name').accuracy.mean())

seaborn.boxplot(x='model_name', y='accuracy', data=cv_df)
seaborn.stripplot(x='model_name', y='accuracy', data=cv_df, 
                size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
'''

# Perform final training on the full training set
logisticClassifier.fit(X_train, y_train)

# Perform final test set prediction and generate classification report
y_predicted = logisticClassifier.predict(X_test)

# List the distinct categories present in the test labels...
for i in set(y_test):
    print(i)

# ...and the distinct categories the classifier actually predicted
print()
for i in set(y_predicted):
    print(i)

print()
print('Classification Report')
print(classification_report(y_test, y_predicted))


In [None]:
def is_ascii(s):
    '''
    @brief      Check whether every character of a string is an ASCII character
    @param      s           Input string
    @return     boolean     True if all code points are below 128 (also for an empty string)
    '''
    for character in s:
        # The first code point outside the 7-bit range settles the answer
        if ord(character) >= 128:
            return False
    return True
    

def cleanData(topicDict):
    '''
    @brief      Performs pre-processing on scraped web data
    @param      topicDict       Dictionary of topic attributes; each value must provide
                                the keys 'Category' and 'Leading Comment'
    @return     topicFeatures   List of pre-processed strings that represent each topic
    @return     labels          List of each topic's ground truth category

    Topics are skipped when their category contains non-ASCII characters, when the
    category is 'Store & Website Management', or when nothing is left of the
    leading comment after cleaning.
    '''
    # Leading/trailing characters stripped from every word ('$' is kept so costs stay recognisable)
    punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    # Build the stop-word set once instead of once per topic
    stop_words = set(stopwords.words('english'))

    # Create empty lists to store outputs
    topicFeatures = []
    labels = []

    for attributes in topicDict.values():
        # Omit categories with non-ASCII names and the hardcoded excluded category
        category = attributes['Category']
        if not is_ascii(category) or category == 'Store & Website Management':
            continue

        # Only the leading comment is used as a feature; the list is joined so that
        # further fields could be added to it
        featureList = [attributes['Leading Comment']]
        featureString = ' '.join(featureList)

        # Replace newline and tab characters with spaces, then lowercase everything
        featureString = featureString.replace('\n', ' ').replace('\t', ' ').lower()

        wordList = []
        for word in featureString.split():
            # Drop stop words and words containing non-ASCII characters
            # (checked before punctuation is stripped, as words appear in the text)
            if word in stop_words or not is_ascii(word):
                continue

            # Remove all leading/trailing punctuation, except for '$'
            word = word.strip(punctuation)

            # A punctuation-only token is now empty; keeping it would leave
            # doubled spaces in the reconstructed string
            if not word:
                continue

            # Replace all numbers with ######## identifier
            # Replace all costs with $$$$$$$$ identifier
            digitsOnly = word.replace('.', '')
            if digitsOnly.isdigit():
                word = '########'
            elif digitsOnly.replace('$', '').isdigit():
                word = '$$$$$$$$'

            wordList.append(word)

        # Reconstruct featureString
        # If it is empty, do not add this sample to the final output
        featureString = ' '.join(wordList)
        if featureString.strip() == '':
            continue

        # Append featureString and the topic category to the output lists
        topicFeatures.append(featureString)
        labels.append(category)

    return topicFeatures, labels



#if __name__ == '__main__':