In [1]:
from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Project workspace on GDrive
PROJECT_PATH = 'My Drive/Github'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create the full runtime project path and create a workspace at that location
WORKING_PATH = join(ROOT, PROJECT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [12]:
import math

import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from transformers import DistilBertTokenizer, DistilBertModel

# Set Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)   # Disable wrapping

import torch

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


  import pandas.util.testing as tm


In [4]:
'''
@brief      Determine if a string consists only of ASCII characters
@param      s           Input string
@return     boolean    
'''
def is_ascii(s):
    return all(ord(c) < 128 for c in s)
    

'''
@brief      Performs pre-processing on scraped web data
@param      topicDict       Dictionary of topic attributes
@return     topicFeatures   List of pre-processed strings that represent each topic
@return     labels          List of each topic's ground truth category
'''
def cleanData(topicDict):
    # Get list of topics
    topics = list(topicDict.keys())

    # Create empty lists to store outputs
    topicFeatures = []
    labels = []

    count = 1
    for topic in topics:
        # Hardcoded line to omit category written in Chinese
        category = topicDict[topic]['Category']
        if (is_ascii(category) == False or category == 'Store & Website Management'):
            continue

        # Combine topic title and comments into one string
        #title = topicDict[topic]['Topic Title']
        leadingComment = topicDict[topic]['Leading Comment']
        #otherComments = topicDict[topic]['Other Comments']
        
        featureList = [leadingComment] 
        featureString = ' '.join(featureList)

        # Replace newline and tab characters with spaces
        featureString = featureString.replace('\n', ' ')
        featureString = featureString.replace('\t', ' ')

        # Convert all letters to lowercase
        featureString = featureString.lower()
        
        # Strip all punctuation
        #table = str.maketrans('', '', string.punctuation)
        #featureString = featureString.translate(table)

        # Remove all non-ASCII characters
        #featureString = featureString.encode(encoding='ascii', errors='ignore').decode('ascii')

        # Split feature string into a list to perform processing on each word
        wordList = featureString.split()

        # Remove all stop words
        stop_words = set(stopwords.words('english'))
        wordList = [word for word in wordList if not word in stop_words]

        # Remove all words to contain non-ASCII characters
        wordList = [word for word in wordList if is_ascii(word)]

        # Remove all leading/training punctuation, except for '$'
        punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        wordList = [word.strip(punctuation) for word in wordList]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if (word.replace('.','').isdigit()) \
                    else '$$$$$$$$' if (word.replace('.','').replace('$','').isdigit()) \
                    else word \
                    for word in wordList]
        #wordList = ['########' if (word.replace('.','').isdigit()) else word for word in wordList]
        #wordList = ['########' if (word.translate(table).isdigit()) else word for word in wordList]

        # Reconstruct featureString
        # If it is empty, do not add this sample to the final output
        featureString = ' '.join(wordList)
        if (featureString.strip() == ''):
            continue

        # Print sample number and featureString
        #print(count)
        #count += 1
        #print(featureString)

        # Append featureString and the topic category to the output lists
        topicFeatures.append(featureString)
        labels.append(topicDict[topic]['Category'])

    return topicFeatures, labels



if __name__ == '__main__':
    # Load merged data for Amazon and Flowster forums
    df = pd.read_csv('/content/drive/My Drive/Github/mlteam4/datasets/final_merged_data.csv')

    # Create new column that combines other columns of interest into text sequences
    df['Combined Sequence'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment']

    # Extract Combined Sequence and Category columns as sample data and labels
    filteredDF = df[['Combined Sequence', 'Category']]

    # Drop NaN rows and exclude last row since it contains a label with only one sample
    filteredDF = filteredDF[:-1].dropna()

    # Load pretrained tokenizer and model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')

    #print(filteredDF.loc[10, 'Combined Sequence'])
    #print(tokenizer.tokenize(filteredDF.loc[10, 'Combined Sequence']))
    #print(filteredDF.loc[:, 'Combined Sequence'].apply(tokenizer.encode))
    #print(len(tokenizer.encode(filteredDF.loc[10, 'Combined Sequence'])))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    inputIDs = []
    attentionMasks = []

    # For every sentence...
    for index, row in filteredDF.iterrows():
        sequence = row['Combined Sequence']

        encodedDict = tokenizer.encode_plus(
            sequence,                       # Sentence to encode.
            add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
            max_length = 512,               # Pad & truncate all sentences.
            pad_to_max_length = True,
            return_attention_mask = True,   # Construct attn. masks.
            return_tensors = 'pt',          # Return pytorch tensors.
            )
        
        # Add the encoded sentence to the list
        inputIDs.append(encodedDict['input_ids'])
        
        # And its attention mask (simply differentiates padding from non-padding)
        attentionMasks.append(encodedDict['attention_mask'])
    
    # Convert the lists into tensors.
    inputIDs = torch.cat(inputIDs, dim=0)
    attentionMasks = torch.cat(attentionMasks, dim=0)
    #labels = torch.tensor(labels)
    inputIDs = inputIDs.cuda()
    attentionMasks = attentionMasks.cuda()

    print(inputIDs.shape)


torch.Size([8839, 512])


In [5]:
inputIDs = inputIDs.cuda()
attentionMasks = attentionMasks.cuda()
model = model.cuda()

In [8]:
batchSize = 32
numBatches = math.ceil(inputIDs.shape[0]/batchSize)

outputBatches = []

for i in range(numBatches):
    print('Batch ', i, ' of ', numBatches)
    upperIndex = i + batchSize
    if (i == numBatches-1):
        break
        upperIndex = inputIDs.shape[0]
    
    with torch.no_grad():
        finalHiddenStates = model(inputIDs[i:upperIndex], attention_mask=attentionMasks[i:upperIndex])
    
    outputBatches.append(finalHiddenStates[0][:,0,:])

'''
with torch.no_grad():
    finalHiddenStates = model(inputIDs, attention_mask=attentionMasks)
'''

finalHiddenStates = torch.cat(outputBatches)

features = finalHiddenStates.cpu().numpy()

print(features.shape)
print(features)

Batch  0  of  277
Batch  1  of  277
Batch  2  of  277
Batch  3  of  277
Batch  4  of  277
Batch  5  of  277
Batch  6  of  277
Batch  7  of  277
Batch  8  of  277
Batch  9  of  277
Batch  10  of  277
Batch  11  of  277
Batch  12  of  277
Batch  13  of  277
Batch  14  of  277
Batch  15  of  277
Batch  16  of  277
Batch  17  of  277
Batch  18  of  277
Batch  19  of  277
Batch  20  of  277
Batch  21  of  277
Batch  22  of  277
Batch  23  of  277
Batch  24  of  277
Batch  25  of  277
Batch  26  of  277
Batch  27  of  277
Batch  28  of  277
Batch  29  of  277
Batch  30  of  277
Batch  31  of  277
Batch  32  of  277
Batch  33  of  277
Batch  34  of  277
Batch  35  of  277
Batch  36  of  277
Batch  37  of  277
Batch  38  of  277
Batch  39  of  277
Batch  40  of  277
Batch  41  of  277
Batch  42  of  277
Batch  43  of  277
Batch  44  of  277
Batch  45  of  277
Batch  46  of  277
Batch  47  of  277
Batch  48  of  277
Batch  49  of  277
Batch  50  of  277
Batch  51  of  277
Batch  52  of  277
Bat

In [17]:
labels = filteredDF.loc[0:features.shape[0], 'Category']

X_train, X_test, y_train, y_test = train_test_split(features, labels)

logisticClassifier = LogisticRegression()

'''
# Train classifier and compute validation accuracy for each fold
CV = 5
#cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []

model_name = logisticClassifier.__class__.__name__
accuracies = cross_val_score(logisticClassifier, X_train, y_train, scoring='accuracy', cv=CV)
for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#Caculating the mean of all models
print(cv_df.groupby('model_name').accuracy.mean())

seaborn.boxplot(x='model_name', y='accuracy', data=cv_df)
seaborn.stripplot(x='model_name', y='accuracy', data=cv_df, 
                size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

# Perform final trainining on the full training set
logisticClassifier.fit(X_train, y_train)

# Perform final test set prediction and generate classification report
y_predicted = logisticClassifier.predict(X_test)

for i in set(y_test):
    print(i)

print()
for i in set(y_predicted):
    print(i)
'''
print()
print('Classification Report')
print(classification_report(y_test, y_predicted))



Classification Report
                                                    precision    recall  f1-score   support

                                    Account Health       0.21      0.19      0.20       395
                                     Amazon Custom       0.04      0.05      0.04        64
              Amazon Marketplace Web Service (MWS)       0.03      0.03      0.03        79
                                        Amazon Pay       0.08      0.07      0.07       152
                                   Amazon Specific       0.00      0.00      0.00        14
                         Amazon Sponsored Products       0.07      0.03      0.04        72
                              Financial Management       0.00      0.00      0.00         1
                                 Flowster-specific       0.00      0.00      0.00        13
                                       Fulfillment       0.00      0.00      0.00         4
                             Fulfillment By Amazon      

  _warn_prf(average, modifier, msg_start, len(result))
