### Mount Google Drive to runtime

In [1]:
from google.colab import drive
from os.path import join

# Mounting location on runtime for GDrive
ROOT = '/content/drive'

# Project workspace on GDrive
PROJECT_PATH = 'My Drive/Github'

# Mount GDrive on the runtime
drive.mount(ROOT)

# Create the full runtime project path and create a workspace at that location
WORKING_PATH = join(ROOT, PROJECT_PATH)
!mkdir "{WORKING_PATH}" 
%cd "{WORKING_PATH}"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
mkdir: cannot create directory ‘/content/drive/My Drive/Github’: File exists
/content/drive/My Drive/Github


### Import libraries

In [6]:
import json
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
'''
@brief      Determine if a string consists only of ASCII characters
@param      s           Input string
@return     boolean    
'''
def is_ascii(s):
    return all(ord(c) < 128 for c in s)


'''
@brief      Performs pre-processing on scraped web data
@param      topicDict       Dictionary of topic attributes
@return     topicFeatures   List of pre-processed strings that represent each topic
@return     labels          List of each topic's ground truth category
'''
def cleanData(topicDict):
    # Get list of topics
    topics = list(topicDict.keys())

    # Create empty lists to store outputs
    topicFeatures = []
    labels = []

    count = 1
    for topic in topics:
        # Hardcoded line to omit category written in Chinese
        category = topicDict[topic]['Category']
        if (is_ascii(category) == False):
            continue

        # Combine topic title and comments into one string
        title = topicDict[topic]['Topic Title']
        leadingComment = topicDict[topic]['Leading Comment']
        #otherComments = topicDict[topic]['Other Comments']
        
        featureList = [title] + [leadingComment] 
        featureString = ' '.join(featureList)

        # Replace newline and tab characters with spaces
        featureString = featureString.replace('\n', ' ')
        featureString = featureString.replace('\t', ' ')

        # Convert all letters to lowercase
        featureString = featureString.lower()
        
        # Strip all punctuation
        #table = str.maketrans('', '', string.punctuation)
        #featureString = featureString.translate(table)

        # Remove all non-ASCII characters
        #featureString = featureString.encode(encoding='ascii', errors='ignore').decode('ascii')

        # Split feature string into a list to perform processing on each word
        wordList = featureString.split()

        # Remove all stop words
        stop_words = set(stopwords.words('english'))
        wordList = [word for word in wordList if not word in stop_words]

        # Remove all words to contain non-ASCII characters
        wordList = [word for word in wordList if is_ascii(word)]

        # Remove all leading/training punctuation, except for '$'
        punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        wordList = [word.strip(punctuation) for word in wordList]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if (word.replace('.','').isdigit()) \
                    else '$$$$$$$$' if (word.replace('.','').replace('$','').isdigit()) \
                    else word \
                    for word in wordList]
        #wordList = ['########' if (word.replace('.','').isdigit()) else word for word in wordList]
        #wordList = ['########' if (word.translate(table).isdigit()) else word for word in wordList]

        # Reconstruct featureString
        # If it is empty, do not add this sample to the final output
        featureString = ' '.join(wordList)
        if (featureString.strip() == ''):
            continue

        # Print sample number and featureString
        #print(count)
        #count += 1
        #print(featureString)

        # Append featureString and the topic category to the output lists
        topicFeatures.append(featureString)
        labels.append(topicDict[topic]['Category'])

    return topicFeatures, labels



if __name__ == '__main__':
    # Extract topic attribute dictionary from JSON file
    f = open('/content/drive/My Drive/Github/mlteam4/sandbox/Amazon_Topic_Attributes_20200617061621.json')
    topicDict = json.load(f)
    f.close()

    # Pre-process data
    topicFeatures, labels = cleanData(topicDict)

    # Initialize text vectorizer
    vectorizer = CountVectorizer()
    #vectorizer = TfidfVectorizer()

    # Vectorize topic texts
    X = vectorizer.fit_transform(topicFeatures)

    # It was found that if TF-IDF is used, accuracy can be improved by multiplying the vectors by a large number
    #X = 1000 * X

    #print(labels)

    # Split topic vectors and labels into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1)
    #X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1, random_state=8)

    # Printouts for debugging
    #print(X)
    #print(X.toarray())
    #print(X.shape)
    #print(y_train)
    #print(vectorizer.get_feature_names())

    # Create 5-Fold object to perform cross-validation split on the training set
    kf = KFold(n_splits=5, shuffle=True)
    kf.get_n_splits(X_train)

    # Create multinominal Naive Bayes classifier
    multinominalClassifier = MultinomialNB()

    # Train classifier and compute validation accuracy for each fold
    n = 1
    for train_index, test_index in kf.split(X_train):
        print('Fold Number: ', n)
        n += 1

        # Further split X_train and y_train using fold indices
        X_train_train, X_train_test = \
            X_train[train_index], X_train[test_index]
        y_train_train, y_train_test = \
            [y_train[i] for i in train_index], [y_train[i] for i in test_index]

        # Train classifier
        multinominalClassifier.fit(X_train_train, y_train_train)

        # Calculate validation accuracy
        numCorrect = (multinominalClassifier.predict(X_train_test) == y_train_test).sum()
        accuracy = numCorrect / len(y_train_test)
        print('Multinominal Classifier Accuracy: ', accuracy)
    
    # Perform final trainining on the full training set
    multinominalClassifier.fit(X_train, y_train)

    # Perform final test set prediction and generate classification report
    y_predicted = multinominalClassifier.predict(X_test)

    print()
    print('Classification Report')
    print(classification_report(y_test, y_predicted, target_names=multinominalClassifier.classes_))
    

    # Create Gaussian Naive Bayes classifier
    gaussianClassifier = GaussianNB()

    # Train classifier and compute validation accuracy for each fold
    n = 1
    for train_index, test_index in kf.split(X_train):
        print('Fold Number: ', n)
        n += 1

        # Further split X_train and y_train using fold indices
        X_train_train, X_train_test = \
            X_train[train_index], X_train[test_index]
        y_train_train, y_train_test = \
            [y_train[i] for i in train_index], [y_train[i] for i in test_index]

        # Train classifier
        gaussianClassifier.fit(X_train_train.toarray(), y_train_train)

        # Calculate validation accuracy
        numCorrect = (gaussianClassifier.predict(X_train_test.toarray()) == y_train_test).sum()
        accuracy = numCorrect / len(y_train_test)
        print('Gaussian Classifier Accuracy: ', accuracy)
    
    # Perform final trainining on the full training set
    gaussianClassifier.fit(X_train.toarray(), y_train)

    # Perform final test set prediction and generate classification report
    y_predicted = gaussianClassifier.predict(X_test.toarray())

    print()
    print('Classification Report')
    print(classification_report(y_test, y_predicted, target_names=gaussianClassifier.classes_))



Fold Number:  1
Multinominal Classifier Accuracy:  0.6655701754385965
Fold Number:  2
Multinominal Classifier Accuracy:  0.6480263157894737
Fold Number:  3
Multinominal Classifier Accuracy:  0.6716008771929824
Fold Number:  4
Multinominal Classifier Accuracy:  0.6792763157894737
Fold Number:  5
Multinominal Classifier Accuracy:  0.6604498080087767

Classification Report
                                                    precision    recall  f1-score   support

                                    Account Health       0.65      0.84      0.73       170
                                     Amazon Custom       0.00      0.00      0.00         4
              Amazon Marketplace Web Service (MWS)       0.88      0.86      0.87       147
                                        Amazon Pay       0.78      0.84      0.81       173
                         Amazon Sponsored Products       0.83      0.33      0.48        15
                             Fulfillment By Amazon       0.62      0.56   

  _warn_prf(average, modifier, msg_start, len(result))


Gaussian Classifier Accuracy:  0.42214912280701755
Fold Number:  2
Gaussian Classifier Accuracy:  0.3980263157894737
Fold Number:  3
Gaussian Classifier Accuracy:  0.39473684210526316
Fold Number:  4
Gaussian Classifier Accuracy:  0.41502192982456143
Fold Number:  5
Gaussian Classifier Accuracy:  0.4355458036204059

Classification Report
                                                    precision    recall  f1-score   support

                                    Account Health       0.42      0.57      0.48       170
                                     Amazon Custom       0.00      0.00      0.00         4
              Amazon Marketplace Web Service (MWS)       0.81      0.56      0.67       147
                                        Amazon Pay       0.53      0.31      0.39       173
                         Amazon Sponsored Products       0.22      0.13      0.17        15
                             Fulfillment By Amazon       0.28      0.34      0.31        62
               

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
s = ['asdf', 'asdf', 'wrth']
print(s[[1, 2]])

TypeError: ignored