In [None]:
from google.colab import drive
from os.path import join

# Project workspace, relative to the root of the mounted GDrive
PROJECT_PATH = 'My Drive/Github'

# Directory on the runtime at which GDrive gets mounted
ROOT = '/content/drive'

# Attach GDrive to the runtime
drive.mount(ROOT)

# Full runtime path of the project workspace (the path is only computed here,
# no directory is created)
WORKING_PATH = join(ROOT, PROJECT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers



In [None]:
import math

import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup

# Pandas display configuration, applied in one call as (option, value) pairs
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 20,
    'display.width', None,
    'display.expand_frame_repr', False,   # Disable wrapping
)

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler

# Pick the device PyTorch should run on, preferring a GPU when one is present
if not torch.cuda.is_available():
    # CPU fallback
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

  import pandas.util.testing as tm


No GPU available, using the CPU instead.


In [None]:
# Load merged data for Amazon and Flowster forums
df = pd.read_csv('/content/drive/My Drive/Github/mlteam4/datasets/final_merged_data.csv')

# Flatten the stringified list of reply comments into plain text.
# This has to run BEFORE any combined column is built; otherwise the combined text
# would still contain the raw "['...', '...']" list representation.
df['Reply Comments'] = df['Reply Comments'].apply(lambda x : ' '.join(x.split("', '")).replace("'", "’").strip('[]’'))

# Create new column that combines other columns of interest into text sequences
df['Combined Sequence'] = df['Leading Comment'] + ' ' + df['Reply Comments']

# Create new column that also includes everything
df['Extended Combined Sequence'] =  df['Title'] + ' ' + df['Leading Comment'] + ' ' + df['Post Author'] + ' ' + df['Reply Comments']

# Extract the sequence columns and the Category column as sample data and labels,
# dropping every row that has a NaN in any of them
filteredDF = df[['Combined Sequence', 'Extended Combined Sequence', 'Category']].dropna()

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Generate encodings and attention masks for every sequence in one batched call.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encodedBatch = tokenizer(
    filteredDF['Combined Sequence'].tolist(),   # Sentences to encode
    add_special_tokens = True,                  # Add '[CLS]' and '[SEP]'
    truncation = True,
    max_length = 512,                           # Pad & truncate all sentences
    padding = 'max_length',
    return_attention_mask = True,               # Construct attention masks
    return_tensors = 'pt',                      # Return PyTorch tensors
    )

# One row per sequence
inputIDs = encodedBatch['input_ids']

# The attention mask simply differentiates padding from non-padding
attentionMasks = encodedBatch['attention_mask']

# Encode category text into numerical labels
labelEncoder = LabelEncoder()
labels = labelEncoder.fit_transform(filteredDF['Category'])
labels = torch.tensor(labels)

print(inputIDs)
print(attentionMasks)
print(labels)

# Combine the training inputs into a TensorDataset
# NOTE(review): attentionMasks is not part of the dataset, so any consumer of these
# samples has to rebuild the padding mask from the input IDs itself.
dataset = TensorDataset(inputIDs, labels)

# Create a 90-10 train-test split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
# Seed the generator first so the split is the same on every run.
SPLIT_SEED = 42
torch.manual_seed(SPLIT_SEED)
trainDataset, valDataset = random_split(dataset, [train_size, val_size])





tensor([[ 101, 7483, 1045,  ..., 9733, 2015,  102],
        [ 101, 1045, 2288,  ...,    0,    0,    0],
        [ 101, 1045, 2741,  ...,    0,    0,    0],
        ...,
        [ 101, 2031, 3980,  ...,    0,    0,    0],
        [ 101, 2065, 1045,  ...,    0,    0,    0],
        [ 101, 2031, 3980,  ...,    0,    0,    0]])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([ 9,  9,  9,  ..., 24, 24, 21])


In [None]:
batchSize = 32

# Number of batches needed to cover every encoded sequence
numBatches = math.ceil(inputIDs.shape[0]/batchSize)

outputBatches = []

# Training batches are drawn in random order.
# The sampler must be built from the same dataset the loader wraps: sampling indices
# from the full dataset would produce positions that do not exist in the training split.
trainDataloader = DataLoader(
    trainDataset,
    sampler = RandomSampler(trainDataset),  #Select batches randomly
    batch_size = batchSize
    )

# For validation the order doesn't matter, so we'll just read them sequentially.
validationDataloader = DataLoader(
    valDataset,
    batch_size = batchSize
    )

In [None]:
class EncodedDataset(torch.utils.data.Dataset):
    """Map-style dataset whose samples are the integers in ``[low, high)``.

    The base class is referenced through ``torch.utils.data`` because the bare
    name ``Dataset`` is not imported in this notebook.
    """

    def __init__(self, low, high):
        """Store every integer from ``low`` (inclusive) to ``high`` (exclusive)."""
        self.samples = list(range(low, high))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.samples[idx]

In [None]:
# Size the classification head to the number of encoded categories; without
# `num_labels` the head only has two outputs, fewer than the label ids in the data.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labelEncoder.classes_),
)


def collateTensorBatch(samples):
    """Turn a list of TensorDataset samples into the keyword arguments of the model.

    Each sample is a tuple whose first element is the input-ID tensor and whose last
    element is the label tensor. The attention mask is rebuilt from the input IDs by
    marking every position that is not the tokenizer's padding token.
    """
    inputIDsBatch = torch.stack([sample[0] for sample in samples])
    labelsBatch = torch.stack([sample[-1] for sample in samples])
    return {
        'input_ids': inputIDsBatch,
        'attention_mask': (inputIDsBatch != tokenizer.pad_token_id).long(),
        'labels': labelsBatch,
    }


trainingArgs = TrainingArguments(
    output_dir='/content/drive/My Drive/Github',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/My Drive/Github/logs',            # directory for storing logs
)

# The datasets yield plain tuples, so an explicit collator is supplied to build the
# dict batches the model is called with (the recorded run of this cell, which had
# no collator, ended in a TypeError).
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=trainingArgs,                  # training arguments, defined above
    data_collator=collateTensorBatch,   # converts tuple samples into model inputs
    train_dataset=trainDataset,         # training dataset
    eval_dataset=valDataset           # evaluation dataset
)

trainer.train()

# Disabled feature-extraction loop, kept for reference (it is the only place where
# `features` is computed).
'''
i = 1
for batch in dataloader:
    print('Batch ', i, ' of ', numBatches)
    i += 1

    inputIDsBatch = batch[0].cuda()
    attentionMasksBatch = batch[1].cuda()
    
    with torch.no_grad():
        finalHiddenStates = model(inputIDsBatch, attention_mask=attentionMasksBatch)
    
    #output = finalHiddenStates[0][:,0,:].cpu()
    outputBatches.append(finalHiddenStates[0][:,0,:].cpu())

finalHiddenStates = torch.cat(outputBatches)

features = finalHiddenStates.cpu().numpy()

print(features.shape)
print(features)
'''

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=249.0, style=ProgressStyle(description_wi…

TypeError: ignored

In [None]:
# NOTE(review): `features` is not defined by any active code in this notebook; it is
# only produced by the disabled feature-extraction loop — confirm where it comes from.

# Take the categories of the first `features.shape[0]` rows BY POSITION.
# A label-based `.loc[0:n]` slice is inclusive on both ends and follows the index
# labels, which have gaps after `dropna()`, so it does not return exactly n rows.
labels = filteredDF['Category'].iloc[:features.shape[0]]

X_train, X_test, y_train, y_test = train_test_split(features, labels)

logisticClassifier = LogisticRegression()

# Disabled cross-validation experiment, kept for reference
'''
# Train classifier and compute validation accuracy for each fold
CV = 5
#cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []

model_name = logisticClassifier.__class__.__name__
accuracies = cross_val_score(logisticClassifier, X_train, y_train, scoring='accuracy', cv=CV)
for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#Caculating the mean of all models
print(cv_df.groupby('model_name').accuracy.mean())

seaborn.boxplot(x='model_name', y='accuracy', data=cv_df)
seaborn.stripplot(x='model_name', y='accuracy', data=cv_df, 
                size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
'''

# Perform final training on the full training set
logisticClassifier.fit(X_train, y_train)

# Perform final test set prediction and generate classification report
y_predicted = logisticClassifier.predict(X_test)

# List the distinct categories present in the test labels ...
for i in set(y_test):
    print(i)

# ... and the distinct categories the classifier actually predicted
print()
for i in set(y_predicted):
    print(i)

print()
print('Classification Report')
print(classification_report(y_test, y_predicted))


In [None]:
def is_ascii(s):
    '''
    @brief      Determine if a string consists only of ASCII characters
    @param      s           Input string
    @return     boolean     True when every character has a code point below 128
                            (an empty string yields True), False otherwise
    '''
    for character in s:
        # Stop at the first character outside the ASCII range
        if ord(character) >= 128:
            return False
    return True
    

def cleanData(topicDict):
    '''
    @brief      Performs pre-processing on scraped web data
    @param      topicDict       Dictionary of topic attributes; every value is indexed
                                with the 'Category' and 'Leading Comment' keys
    @return     topicFeatures   List of pre-processed strings that represent each topic
    @return     labels          List of each topic's ground truth category
    '''
    # Characters stripped from both ends of every word; '$' is deliberately absent
    # so that costs can still be recognised afterwards
    punctuation = '!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    # Build the stop word set once instead of once per topic
    # NOTE(review): `stopwords` is not imported anywhere in the visible notebook;
    # presumably it is nltk.corpus.stopwords — confirm and add the import.
    stop_words = set(stopwords.words('english'))

    # Create empty lists to store outputs
    topicFeatures = []
    labels = []

    for attributes in topicDict.values():
        # Hardcoded line to omit category written in Chinese
        category = attributes['Category']
        if not is_ascii(category) or category == 'Store & Website Management':
            continue

        # Only the leading comment is used as the feature text.
        # Convert all letters to lowercase and split into words; split() without
        # arguments already treats newlines and tabs as separators.
        wordList = attributes['Leading Comment'].lower().split()

        # Remove all stop words and all words that contain non-ASCII characters
        wordList = [word for word in wordList
                    if word not in stop_words and is_ascii(word)]

        # Remove all leading/trailing punctuation, except for '$', and drop words
        # that consisted of punctuation only (they would leave stray spaces behind)
        wordList = [word.strip(punctuation) for word in wordList]
        wordList = [word for word in wordList if word]

        # Replace all numbers with ######## identifier
        # Replace all costs with $$$$$$$$ identifier
        wordList = ['########' if word.replace('.','').isdigit()
                    else '$$$$$$$$' if word.replace('.','').replace('$','').isdigit()
                    else word
                    for word in wordList]

        # If nothing is left, do not add this sample to the final output
        if not wordList:
            continue

        # Append the reconstructed feature string and the topic category to the output lists
        topicFeatures.append(' '.join(wordList))
        labels.append(category)

    return topicFeatures, labels



#if __name__ == '__main__':