In [2]:
from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Project workspace on GDrive
PROJECT_PATH = 'My Drive/Github'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create the full runtime project path and create a workspace at that location
WORKING_PATH = join(ROOT, PROJECT_PATH)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 2.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 16.3MB/s 
[?25hCollecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)


In [24]:
import math
import ast

import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from transformers import DistilBertTokenizer, DistilBertModel

# Set Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)   # Disable wrapping

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [21]:
# Load merged data for Amazon and Flowster forums
df = pd.read_csv('/content/drive/My Drive/Github/mlteam4/datasets/final_merged_data.csv')

# Create new column that combines other columns of interest into text sequences
df['Combined Sequence'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment']

# Extract Combined Sequence and Category columns as sample data and labels
filteredDF = df[['Combined Sequence', 'Category']]

# Drop NaN rows and exclude last row since it contains a label with only one sample
filteredDF = filteredDF[:-1].dropna()

print(filteredDF['Category'].value_counts())

Selling on Amazon                                     2100
Account Health                                        1549
Fulfillment By Amazon                                 1500
Global Selling                                         600
Amazon Pay                                             600
Groups                                                 494
Site Feedback                                          300
US Announcements                                       300
Amazon Marketplace Web Service (MWS)                   300
Amazon Sponsored Products                              300
Amazon Custom                                          274
Login With Amazon                                      199
Health,Safety,Sustainability,Security & Compliance      63
Flowster-specific                                       59
Product Sourcing                                        53
Amazon Specific                                         53
Human Resources                                         

In [23]:
# Load merged data for Amazon and Flowster forums
df = pd.read_csv('/content/drive/My Drive/Github/mlteam4/datasets/final_merged_data.csv')

# Create new column that combines other columns of interest into text sequences
df['Combined Sequence'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment']

# Super complicated string processing to combine reply comments properly
df['Reply Comments'] = df['Reply Comments'].apply(lambda x : ' '.join(x.split("', '")).replace("'", "’").strip('[]’'))

# Create new column that also includes reply comments
df['Extended Combined Sequence'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment'] + ' ' + df['Reply Comments']

# Extract Combined Sequence and Category columns as sample data and labels
filteredDF = df[['Combined Sequence', 'Extended Combined Sequence', 'Category']]

# Drop NaN rows and exclude last row since it contains a label with only one sample
filteredDF = filteredDF[:-1].dropna()

# Load pretrained tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

#for index, row in filteredDF.iterrows():
    #print(filteredDF.loc[index, 'Combined Sequence'])
    #print(tokenizer.tokenize(filteredDF.loc[index, 'Combined Sequence']))
    #print(len(tokenizer.encode(filteredDF.loc[10, 'Combined Sequence'])))

# Tokenize all of the sentences and map the tokens to their word IDs
inputIDs = []
attentionMasks = []

categoryCounts = filteredDF['Category'].value_counts()

# For every sentence...
for index, row in filteredDF.iterrows():
    if (categoryCounts[row['Category']] > 100):
        sequence = row['Combined Sequence']
    else:
        sequence = row['Extended Combined Sequence']

    encodedDict = tokenizer.encode_plus(
        sequence,                       # Sentence to encode
        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
        truncation=True,
        max_length = 200,               # Pad & truncate all sentences
        pad_to_max_length = True,
        return_attention_mask = True,   # Construct attention masks
        return_tensors = 'pt',          # Return PyTorch tensors
        )
    
    # Add the encoded sentence to the list
    inputIDs.append(encodedDict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding)
    attentionMasks.append(encodedDict['attention_mask'])

# Convert the lists into tensors.
inputIDs = torch.cat(inputIDs, dim=0)
attentionMasks = torch.cat(attentionMasks, dim=0)

labelEncoder = LabelEncoder()
labels = labelEncoder.fit_transform(filteredDF['Category'])
labels = torch.tensor(labels)

print(inputIDs)
print(attentionMasks)
print(labels)

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(inputIDs, attentionMasks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])



tensor([[  101,  2358, 24163,  ...,     0,     0,     0],
        [  101,  5324,  6279,  ...,     0,     0,     0],
        [  101,  2911, 10258,  ...,     0,     0,     0],
        ...,
        [  101,  3854,  2102,  ...,     0,     0,     0],
        [  101,  7990,  1011,  ...,     0,     0,     0],
        [  101,  6874,  3573,  ...,  2583,  2061,   102]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])
tensor([ 9,  9,  9,  ..., 16, 23, 23])


In [None]:
inputIDs = inputIDs.cuda()
attentionMasks = attentionMasks.cuda()
model = model.cuda()

In [27]:
batchSize = 32
model = model.cuda()

numBatches = math.ceil(inputIDs.shape[0]/batchSize)

outputBatches = []

dataloader = DataLoader(
    dataset,
    sampler = RandomSampler(dataset),  #Select batches randomly
    batch_size = batchSize
    )

i = 1
for batch in dataloader:
    print('Batch ', i, ' of ', numBatches)
    i += 1

    inputIDsBatch = batch[0].cuda()
    attentionMasksBatch = batch[1].cuda()
    
    with torch.no_grad():
        finalHiddenStates = model(inputIDsBatch, attention_mask=attentionMasksBatch)
    
    outputBatches.append(finalHiddenStates[0][:,0,:])

'''
for i in range(numBatches):
    print('Batch ', i, ' of ', numBatches)
    upperIndex = i + batchSize
    if (i == numBatches-1):
        break
        upperIndex = inputIDs.shape[0]
    
    with torch.no_grad():
        finalHiddenStates = model(inputIDs[i:upperIndex], attention_mask=attentionMasks[i:upperIndex])
    
    outputBatches.append(finalHiddenStates[0][:,0,:])
'''

finalHiddenStates = torch.cat(outputBatches)

features = finalHiddenStates.cpu().numpy()

print(features.shape)
print(features)

Batch  1  of  277
Batch  2  of  277
Batch  3  of  277
Batch  4  of  277
Batch  5  of  277
Batch  6  of  277
Batch  7  of  277
Batch  8  of  277
Batch  9  of  277
Batch  10  of  277
Batch  11  of  277
Batch  12  of  277
Batch  13  of  277
Batch  14  of  277
Batch  15  of  277
Batch  16  of  277
Batch  17  of  277
Batch  18  of  277
Batch  19  of  277
Batch  20  of  277
Batch  21  of  277
Batch  22  of  277
Batch  23  of  277
Batch  24  of  277
Batch  25  of  277
Batch  26  of  277
Batch  27  of  277
Batch  28  of  277
Batch  29  of  277
Batch  30  of  277
Batch  31  of  277
Batch  32  of  277
Batch  33  of  277
Batch  34  of  277
Batch  35  of  277
Batch  36  of  277
Batch  37  of  277
Batch  38  of  277
Batch  39  of  277
Batch  40  of  277
Batch  41  of  277
Batch  42  of  277
Batch  43  of  277
Batch  44  of  277
Batch  45  of  277
Batch  46  of  277
Batch  47  of  277
Batch  48  of  277
Batch  49  of  277
Batch  50  of  277
Batch  51  of  277
Batch  52  of  277
Batch  53  of  277
Ba

In [29]:
labels = filteredDF.loc[0:features.shape[0], 'Category']

X_train, X_test, y_train, y_test = train_test_split(features, labels)

logisticClassifier = LogisticRegression()

'''
# Train classifier and compute validation accuracy for each fold
CV = 5
#cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []

model_name = logisticClassifier.__class__.__name__
accuracies = cross_val_score(logisticClassifier, X_train, y_train, scoring='accuracy', cv=CV)
for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#Caculating the mean of all models
print(cv_df.groupby('model_name').accuracy.mean())

seaborn.boxplot(x='model_name', y='accuracy', data=cv_df)
seaborn.stripplot(x='model_name', y='accuracy', data=cv_df, 
                size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
'''

# Perform final trainining on the full training set
logisticClassifier.fit(X_train, y_train)

# Perform final test set prediction and generate classification report
y_predicted = logisticClassifier.predict(X_test)

for i in set(y_test):
    print(i)

print()
for i in set(y_predicted):
    print(i)

print()
print('Classification Report')
print(classification_report(y_test, y_predicted))


Financial Management
Account Health
Amazon Specific
Human Resources
Traffic Sources
Health,Safety,Sustainability,Security & Compliance
US Announcements
Login With Amazon
Product Sourcing
Flowster-specific
Fulfillment By Amazon
Software & Tools
Global Selling
Groups
Fulfillment
Management
Amazon Pay
Selling on Amazon
Amazon Sponsored Products
Amazon Custom
Amazon Marketplace Web Service (MWS)
Site Feedback
Misc Topics

US Announcements
Fulfillment By Amazon
Amazon Pay
Selling on Amazon
Amazon Sponsored Products
Amazon Custom
Account Health
Amazon Marketplace Web Service (MWS)
Site Feedback
Global Selling
Groups

Classification Report
                                                    precision    recall  f1-score   support

                                    Account Health       0.19      0.23      0.21       412
                                     Amazon Custom       0.00      0.00      0.00        73
              Amazon Marketplace Web Service (MWS)       0.00      0.00      0.00 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
'''
@brief      Determine if a string consists only of ASCII characters
@param      s           Input string
@return     boolean    
'''
def is_ascii(s):
    return all(ord(c) < 128 for c in s)
    

'''
@brief      Performs pre-processing on scraped web data
@param      topicDict       Dictionary of topic attributes
@return     topicFeatures   List of pre-processed strings that represent each topic
@return     labels          List of each topic's ground truth category
'''
def cleanData(topicDict):
    # Get list of topics
    topics = list(topicDict.keys())

    # Create empty lists to store outputs
    topicFeatures = []
    labels = []

    count = 1
    for topic in topics:
        # Hardcoded line to omit category written in Chinese
        category = topicDict[topic]['Category']
        if (is_ascii(category) == False or category == 'Store & Website Management'):
            continue

        # Combine topic title and comments into one string
        #title = topicDict[topic]['Topic Title']
        leadingComment = topicDict[topic]['Leading Comment']
        #otherComments = topicDict[topic]['Other Comments']
        
        featureList = [leadingComment] 
        featureString = ' '.join(featureList)

        # Replace newline and tab characters with spaces
        featureString = featureString.replace('\n', ' ')
        featureString = featureString.replace('\t', ' ')

        # Convert all letters to lowercase
        featureString = featureString.lower()
        
        # Strip all punctuation
        #table = str.maketrans('', '', string.punctuation)
        #featureString = featureString.translate(table)

        # Remove all non-ASCII characters
        #featureString = featureString.encode(encoding='ascii', errors='ignore').decode('ascii')

        # Split feature string into a list to perform processing on each word
        wordList = featureString.split()

        # Remove all stop words
        stop_words = set(stopwords.words('english'))
        wordList = [word for word in wordList if not word in stop_words]

        # Remove all words to contain non-ASCII characters
        wordList = [word for word in wordList if is_ascii(word)]

        # Remove all leading/training punctuation, except for '$'
        punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        wordList = [word.strip(punctuation) for word in wordList]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if (word.replace('.','').isdigit()) \
                    else '$$$$$$$$' if (word.replace('.','').replace('$','').isdigit()) \
                    else word \
                    for word in wordList]
        #wordList = ['########' if (word.replace('.','').isdigit()) else word for word in wordList]
        #wordList = ['########' if (word.translate(table).isdigit()) else word for word in wordList]

        # Reconstruct featureString
        # If it is empty, do not add this sample to the final output
        featureString = ' '.join(wordList)
        if (featureString.strip() == ''):
            continue

        # Print sample number and featureString
        #print(count)
        #count += 1
        #print(featureString)

        # Append featureString and the topic category to the output lists
        topicFeatures.append(featureString)
        labels.append(topicDict[topic]['Category'])

    return topicFeatures, labels



#if __name__ == '__main__':