# **Part0 : Dataset**

# **[Preprocess Dataset]**

In [None]:
import pandas as pd
import numpy as np
import csv
import sklearn

test = open("amazon_pet.tsv", encoding='UTF8')
read_tsv = csv.reader(test, delimiter = '\t')
df_temp = []
for row in read_tsv:
    if len(row) == 15:
        df_temp.append(row)
        
df = pd.DataFrame(df_temp[1:], columns = df_temp[0])

df['document'] = df[['review_headline', 'review_body']].agg(' '.join, axis=1)
df['document'] = df['document'].apply(lambda x: x.lower())
df = df[['document','star_rating']]

# Unbalanced Dataset
print(df['star_rating'].value_counts())

# Multiclass Dataset 
# Use 150000 data for each label. Total 750000.

df1 = df[df['star_rating']=='1']
df2 = df[df['star_rating']=='2']
df3 = df[df['star_rating']=='3']
df4 = df[df['star_rating']=='4']
df5 = df[df['star_rating']=='5']

df1_s = df1.sample(n=150000, random_state = 1)
df2_s = df2.sample(n=150000, random_state = 1)
df3_s = df3.sample(n=150000, random_state = 1)
df4_s = df4.sample(n=150000, random_state = 1)
df5_s = df5.sample(n=150000, random_state = 1)

df_final = df1_s.append(df2_s).append(df3_s).append(df4_s).append(df5_s)

import re
df_final['document'] = df_final['document'].apply(lambda x: ' '.join(re.split(r"\W+", x)))
df_final.to_csv('pet.csv', index = False)

# Binary Dataset
df_bad = df1.append(df2).append(df3)
df_good = df4.append(df5)

df_bad['star_rating'] = 0 
df_good['star_rating'] = 1

df_bs = df_bad.sample(n=500000, random_state = 1)
df_gs = df_good.sample(n=500000, random_state = 1)

new_df = df_bs.append(df_gs)

new_df['document'] = new_df['document'].apply(lambda x: ' '.join(re.split(r"\W+", x)))
new_df.to_csv('pet2.csv', index = False)

# **Part1A : ML Multiclass**


# **[Count Vector]**

In [4]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 64kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 48.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=686b422a013c1384aab11b9e4382a130caf2222a8ab804fbf2cbb51d61b7e4e6
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('csv').options(header='true', inferschema='true').load('pet.csv')
data.groupBy("star_rating").count().show()

+-----------+------+
|star_rating| count|
+-----------+------+
|          1|150000|
|          3|150000|
|          5|150000|
|          4|150000|
|          2|150000|
+-----------+------+



In [10]:
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

regexTokenizer = RegexTokenizer(inputCol="document", outputCol="words", pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(sw)
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "star_rating", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+--------------------+-----------+--------------------+--------------------+--------------------+-----+
|            document|star_rating|               words|            filtered|            features|label|
+--------------------+-----------+--------------------+--------------------+--------------------+-----+
|your cat will go ...|          1|[your, cat, will,...|[cat, go, hunter,...|(10000,[3,7,12,13...|  0.0|
|two stars stopped...|          1|[two, stars, stop...|[two, stars, stop...|(10000,[14,17,27,...|  0.0|
| worked for a tot...|          1|[worked, for, a, ...|[worked, total, t...|(10000,[1,16,17,5...|  0.0|
|not the best qual...|          1|[not, the, best, ...|[best, quality, p...|(10000,[0,1,50,51...|  0.0|
|one star cats hat...|          1|[one, star, cats,...|[one, star, cats,...|(10000,[2,5,12,38...|  0.0|
+--------------------+-----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



0.5790502527468073

In [12]:
predictions.select("label","prediction") \
    .groupBy("label","prediction") \
    .count() \
    .show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0| 3848|
|  1.0|       1.0|20856|
|  3.0|       2.0| 6501|
|  4.0|       2.0| 1595|
|  0.0|       1.0| 8531|
|  0.0|       4.0| 1428|
|  1.0|       0.0|10768|
|  2.0|       2.0|20981|
|  3.0|       1.0| 2723|
|  2.0|       3.0| 7613|
|  1.0|       4.0| 2230|
|  4.0|       4.0|35124|
|  2.0|       4.0| 3609|
|  3.0|       4.0|11040|
|  2.0|       1.0| 8931|
|  1.0|       2.0| 8374|
|  0.0|       0.0|31625|
|  1.0|       3.0| 2899|
|  4.0|       3.0| 6330|
|  0.0|       2.0| 2473|
+-----+----------+-----+
only showing top 20 rows



# **[TF-IDF]**

In [13]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+-----+----------+
|                   probability|label|prediction|
+------------------------------+-----+----------+
|[0.9999999985436254,1.44708...|  0.0|       0.0|
|[0.9999997422210548,1.85806...|  0.0|       0.0|
|[0.9999990732413198,9.23504...|  0.0|       0.0|
|[0.9999958147123547,4.08301...|  0.0|       0.0|
|[0.999990144378054,9.680507...|  0.0|       0.0|
|[0.9999839093117626,4.11381...|  0.0|       0.0|
|[0.9999813933323186,1.82980...|  0.0|       0.0|
|[0.9999563452526603,1.91689...|  0.0|       0.0|
|[0.9999550744570272,4.40600...|  0.0|       0.0|
|[0.9999214457712443,4.05804...|  0.0|       0.0|
+------------------------------+-----+----------+
only showing top 10 rows



0.5668025763990974

In [14]:
predictions.select("label","prediction") \
    .groupBy("label","prediction") \
    .count() \
    .show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0| 4099|
|  1.0|       1.0|20154|
|  3.0|       2.0| 6485|
|  4.0|       2.0| 1754|
|  0.0|       1.0| 8320|
|  0.0|       4.0| 1797|
|  1.0|       0.0|11008|
|  2.0|       2.0|20525|
|  3.0|       1.0| 2865|
|  2.0|       3.0| 7380|
|  1.0|       4.0| 2858|
|  4.0|       4.0|34787|
|  2.0|       4.0| 4141|
|  3.0|       4.0|11409|
|  2.0|       1.0| 8837|
|  1.0|       2.0| 8180|
|  0.0|       0.0|31281|
|  1.0|       3.0| 2927|
|  4.0|       3.0| 6116|
|  0.0|       2.0| 2570|
+-----+----------+-----+
only showing top 20 rows



# **[Naive Bayes]**

In [15]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

+------------------------------+-----+----------+
|                   probability|label|prediction|
+------------------------------+-----+----------+
|[1.0,1.1006000566979454E-16...|  0.0|       0.0|
|[1.0,1.0904563018103447E-16...|  0.0|       0.0|
|[1.0,1.0824173581298883E-16...|  0.0|       0.0|
|[1.0,1.0765016333084843E-16...|  0.0|       0.0|
|[1.0,1.06385762059194E-16,9...|  0.0|       0.0|
|[1.0,1.0607994902226649E-16...|  0.0|       0.0|
|[1.0,1.0559923722345248E-16...|  0.0|       0.0|
|[1.0,1.0552316304675009E-16...|  0.0|       0.0|
|[1.0,1.0548079285995357E-16...|  2.0|       0.0|
|[1.0,1.0408637469940159E-16...|  0.0|       0.0|
+------------------------------+-----+----------+
only showing top 10 rows



0.5260304663587719

In [16]:
predictions.select("label","prediction") \
    .groupBy("label","prediction") \
    .count() \
    .show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0| 4307|
|  1.0|       1.0|17953|
|  3.0|       2.0| 7424|
|  4.0|       2.0| 2089|
|  0.0|       1.0| 9905|
|  0.0|       4.0| 2136|
|  1.0|       0.0| 9456|
|  2.0|       2.0|20096|
|  3.0|       1.0| 2852|
|  2.0|       3.0| 7568|
|  1.0|       4.0| 4072|
|  4.0|       4.0|32850|
|  2.0|       4.0| 5140|
|  3.0|       4.0|11717|
|  2.0|       1.0| 7871|
|  1.0|       2.0| 9873|
|  0.0|       0.0|27954|
|  1.0|       3.0| 3773|
|  4.0|       3.0| 7223|
|  0.0|       2.0| 3429|
+-----+----------+-----+
only showing top 20 rows



# **Part1B : ML Binary**

# **[Count Vector]**

In [None]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('csv').options(header='true', inferschema='true').load('pet2.csv')
data.groupBy("star_rating").count().show()

In [None]:
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

regexTokenizer = RegexTokenizer(inputCol="document", outputCol="words", pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(sw)
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "star_rating", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

In [None]:
predictions.select("label","prediction") \
    .groupBy("label","prediction") \
    .count() \
    .show()

# **[TF-IDF]**

In [None]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

In [None]:
predictions.select("label","prediction") \
    .groupBy("label","prediction") \
    .count() \
    .show()

# **[Naive Bayes]**

In [None]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

In [None]:
predictions.select("label","prediction") \
    .groupBy("label","prediction") \
    .count() \
    .show()

# **Part2 : DeepLearning**

# **[BERT Binary]**

In [None]:
import torch

In [None]:
if torch.cuda.is_available():   
   device = torch.device("cuda")
   print('Using GPU ', torch.cuda.get_device_name(0))
else:
   device = torch.device("cpu")
   print('Using CPU')


Using GPU  Tesla V100-SXM2-16GB


In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, AdamW, BertForSequenceClassification
 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 2, return_dict=True)

In [None]:
model.cuda()

RuntimeError: ignored

In [None]:
import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support

In [None]:
import pandas as pd
df_pet = pd.read_csv('pet2.csv')

In [None]:
print(df_pet)

                                                 document  star_rating
0       your cat will go from being the hunter to the ...            1
1                    two stars stopped working in 2 weeks            1
2        worked for a total of two weeks what a waste ...            1
3       not the best quality pros very easy to assembl...            1
4       one star cats hated it it was noisy they did n...            1
...                                                   ...          ...
749995  quick and easy to use i got this based on all ...            5
749996  perfect i bought the large for my pug and fren...            5
749997  perfect well made and just what i needed perfe...            5
749998  cats meow great toy that the cat loves playing...            5
749999  perfect for a small dog i have a 5lb maltese a...            5

[750000 rows x 2 columns]


In [None]:
from sklearn.model_selection import train_test_split
#Use 25% of Dataset
df_dataset, df_75 = train_test_split(df_pet, test_size=0.75, shuffle=True)

#Train/Test
df_train, df_test = train_test_split(df_dataset, test_size=0.25, shuffle=True)

In [None]:
# Training - print F1 score every epoch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split

softmax_fn = torch.nn.Softmax(dim=1) # this returns a function that does softmax
from transformers import AdamW
from torch.nn import functional as F

epochs = 4
early_stop_epochs = 4
k = 1
each_len = len(df_train)//5
x_df = list(df_train['document'])
y_df = list(df_train['star_rating'])

# Test Set
x_test = list(df_test['document'])
y_test = list(df_test['star_rating'])

sentences = x_test
labels = y_test

input_ids = []
attention_masks = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
batch_size = 32  
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


# Main loop
lr = 0.00001
for i in range(k):
    print("k : " , i)
    x_test = x_df[each_len*i:each_len*(i+1)]
    x_train = x_df[:each_len*i]+x_df[each_len*(i+1):]
    y_test = y_df[each_len*i:each_len*(i+1)]
    y_train = y_df[:each_len*i]+y_df[each_len*(i+1):]

    # DataLoader
    batch_size = 32
    train_dataset = [[x_train[i], y_train[i]] for i in range(len(x_train))]
    test_dataset = [[x_test[i], y_test[i]] for i in range(len(x_test))]
    train_loader = DataLoader(dataset=train_dataset,sampler = RandomSampler(train_dataset), batch_size=batch_size)
    test_loader = DataLoader(dataset=test_dataset,sampler = SequentialSampler(test_dataset), batch_size=batch_size)

    # Model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 2, return_dict=True)
    model.cuda()

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=lr)

    #scheduler
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

    nondecreasing = 0
    prev_loss = 9999999
    for epoch in range(epochs):  
        print("epoch : " , epoch)   
        total_loss = 0
        model.train()
        for [batch_x, batch_y] in train_loader:
            encoding = tokenizer(list(batch_x), return_tensors='pt',max_length = 64, padding=True, truncation=True)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            labels = torch.tensor(batch_y).to(device)
            model.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            softmax_fn = torch.nn.Softmax(dim=1)
            softmaxed_output = softmax_fn(outputs.logits)
            loss = F.cross_entropy(softmaxed_output, labels)
            total_loss += loss
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_loader)
        print("  avg_train_loss : {0:.2f}".format(avg_train_loss))
        model.eval()
        total_eval_loss = 0


        # Evaluate data for one epoch
        for [batch_x,batch_y] in test_loader:            
            encoding = tokenizer(list(batch_x), return_tensors='pt',max_length = 64, padding=True, truncation=True)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            labels = torch.tensor(batch_y).to(device)

            with torch.no_grad():        
                outputs = model(input_ids, 
                                      token_type_ids=None, 
                                      attention_mask=attention_mask,
                                      labels=labels)
                softmax_fn = torch.nn.Softmax(dim=1)
                softmaxed_output = softmax_fn(outputs.logits)
                loss = F.cross_entropy(softmaxed_output, labels)
                total_eval_loss += loss
                
            # Accumulate the validation loss.
            total_eval_loss += loss.item()

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(test_loader)
        
        print("  avg_val_loss : {0:.2f}".format(avg_val_loss))

        # check total loss
        if prev_loss - total_eval_loss <= 1e-5:
            nondecreasing += 1
        else:
            nondecreasing = 0
        
        if nondecreasing >= early_stop_epochs:
            break
            
        prev_loss = total_eval_loss

    #Test
    model.eval()
    predictions , true_labels = [], []
    for batch in prediction_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch
      with torch.no_grad():
          outputs = model(b_input_ids, token_type_ids=None, 
                          attention_mask=b_input_mask)
      logits = outputs[0]
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      for [el1, el2] in logits:
        if el1 < el2:
          predictions.append(1)
        else:
          predictions.append(0)
      for element in label_ids:
        true_labels.append(element)
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)

    print(precision_recall_fscore_support(true_labels,predictions, average= 'micro'))


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


k :  0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

epoch :  0




  avg_train_loss : 0.39




  avg_val_loss : 0.75
epoch :  1


In [None]:
# Training - print F1 score every epoch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split

softmax_fn = torch.nn.Softmax(dim=1) # this returns a function that does softmax
from transformers import AdamW
from torch.nn import functional as F

epochs = 4
early_stop_epochs = 4


k = 1
each_len = len(df_train)//5
x_df = list(df_train['document'])
y_df = list(df_train['star_rating'])

x_test = list(df_test['document'])
y_test = list(df_test['star_rating'])

sentences = x_test
labels = y_test

input_ids = []
attention_masks = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
batch_size = 32  
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


# Main loop
lr = 0.00001
print("k : " , i)
#Validation Set
x_test = x_df[each_len*i:each_len*(i+1)]
x_train = x_df[:each_len*i]+x_df[each_len*(i+1):]
y_test = y_df[each_len*i:each_len*(i+1)]
y_train = y_df[:each_len*i]+y_df[each_len*(i+1):]

# DataLoader
batch_size = 32
train_dataset = [[x_train[i], y_train[i]] for i in range(len(x_train))]
test_dataset = [[x_test[i], y_test[i]] for i in range(len(x_test))]
train_loader = DataLoader(dataset=train_dataset,sampler = RandomSampler(train_dataset), batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset,sampler = SequentialSampler(test_dataset), batch_size=batch_size)

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 5, return_dict=True)
model.cuda()

# Optimizer
optimizer = AdamW(model.parameters(), lr=lr)

#scheduler
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                        num_warmup_steps = 0,
                                        num_training_steps = total_steps)

nondecreasing = 0
prev_loss = 9999999
for epoch in range(epochs):  
    print("epoch : " , epoch)   
    total_loss = 0
    model.train()
    for [batch_x, batch_y] in train_loader:
        encoding = tokenizer(list(batch_x), return_tensors='pt',max_length = 64, padding=True, truncation=True)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        labels = torch.tensor(batch_y).to(device)
        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        softmax_fn = torch.nn.Softmax(dim=1)
        softmaxed_output = softmax_fn(outputs.logits)
        loss = F.cross_entropy(softmaxed_output, labels)
        total_loss += loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print("  avg_train_loss : {0:.2f}".format(avg_train_loss))
    model.eval()
    total_eval_loss = 0


    # Evaluate data for one epoch
    for [batch_x,batch_y] in test_loader:            
        encoding = tokenizer(list(batch_x), return_tensors='pt',max_length = 64, padding=True, truncation=True)
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        labels = torch.tensor(batch_y).to(device)

        with torch.no_grad():        
            outputs = model(input_ids, 
                                  token_type_ids=None, 
                                  attention_mask=attention_mask,
                                  labels=labels)
            softmax_fn = torch.nn.Softmax(dim=1)
            softmaxed_output = softmax_fn(outputs.logits)
            loss = F.cross_entropy(softmaxed_output, labels)
            total_eval_loss += loss
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(test_loader)
    
    print("  avg_val_loss : {0:.2f}".format(avg_val_loss))

    # check total loss
    if prev_loss - total_eval_loss <= 1e-5:
        nondecreasing += 1
    else:
        nondecreasing = 0
    
    if nondecreasing >= early_stop_epochs:
        break
        
    prev_loss = total_eval_loss

#Test
model.eval()
predictions , true_labels = [], []
for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  for [el1, el2] in logits:
    if el1 < el2:
      predictions.append(1)
    else:
      predictions.append(0)
  for element in label_ids:
    true_labels.append(element)
predictions = np.array(predictions)
true_labels = np.array(true_labels)

print(precision_recall_fscore_support(true_labels,predictions, average= 'micro'))


# **[BERT Mulitclass]**

In [17]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

ModuleNotFoundError: ignored

In [None]:
import pandas as pd
import numpy as np
df = pd.read_csv('pet.csv')

In [None]:
possible_labels = df.star_rating.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}

In [None]:
df['label'] = df.star_rating.replace(label_dict)

In [None]:
from sklearn.model_selection import train_test_split
df, df_75 = train_test_split(df, test_size=0.75, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

In [None]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].document.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=64, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].document.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=64, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)

In [None]:
epochs = 4

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [None]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [None]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), 'checkpoint.pth')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=4981.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 0.8609638920487995
Validation loss: 0.7809061852195835
F1 Score (Weighted): 0.654710136231064


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=4981.0, style=ProgressStyle(description_wid…


Epoch 2
Training loss: 0.7415518051552212
Validation loss: 0.7807751192167758
F1 Score (Weighted): 0.6600894124244732


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=4981.0, style=ProgressStyle(description_wid…


Epoch 3
Training loss: 0.6788264232224142
Validation loss: 0.788721758723937
F1 Score (Weighted): 0.6608015313999952


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=4981.0, style=ProgressStyle(description_wid…


Epoch 4
Training loss: 0.6333755746714156
Validation loss: 0.8135544130826566
F1 Score (Weighted): 0.6599681782349686



In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
model.load_state_dict(torch.load('checkpoint.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [None]:
accuracy_per_class(predictions, true_vals)

Class: 1
Accuracy: 3958/5640

Class: 2
Accuracy: 3227/5614

Class: 3
Accuracy: 3417/5619

Class: 4
Accuracy: 3385/5592

Class: 5
Accuracy: 4583/5660



# **Part3 : Important words**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(stop_words = 'english', min_df = 100)
tfs = tv.fit_transform(df['document'])
feature_names = tv.get_feature_names()
test = " ".join(feature_names)
response = tv.transform([test])
feature_array = np.array(feature_names)
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]
n = 100
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

In [None]:
for i in range(5):
  tv = TfidfVectorizer(stop_words = 'english', min_df = 100)
  df_label = df[df['star_rating']==i+1]
  tfs = tv.fit_transform(df_label['document'])
  feature_names = tv.get_feature_names()
  test = " ".join(feature_names)
  response = tv.transform([test])
  feature_array = np.array(feature_names)
  tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]
  n = 100
  top_n = feature_array[tfidf_sorting][:n]
  print(top_n)

['glued' 'dimensions' 'crack' 'corners' 'stronger' 'raw' 'passed'
 'willing' 'obvious' 'beef' 'assembled' 'manual' 'video' 'removing'
 'refill' 'measured' 'wild' 'beautiful' 'ad' 'lowest' 'stitching' 'stiff'
 'stain' 'bully' 'harder' '2014' 'blade' 'wrap' 'bills' 'tall' 'kids'
 'reviewer' 'oz' 'print' 'temp' 'arrive' 'cleaner' 'wrapped' 'son'
 'ruined' 'rice' 'clasp' 'posted' 'numerous' 'shop' 'page' 'pig' 'hook'
 'tanks' 'current' 'fat' 'sealed' 'burn' 'lightweight' 'bend' 'durability'
 'follow' 'outdoor' 'kibble' 'string' 'types' 'grain' 'vibrate' 'english'
 'duck' 'tug' 'guinea' 'heart' 'shelf' 'indestructible' 'asking' 'choke'
 'refuse' 'tossed' 'directed' 'advise' 'instantly' 'amazing' 'worry'
 'feature' 'sat' 'applying' 'colors' 'vibration' 'crappy' 'danger' '17'
 'lethargic' 'kitchen' 'covers' 'butter' 'talking' 'refunded' 'terribly'
 'beds' 'smelling' 'duty' 'vets' 'writing' 'recent']
['miss' 'inexpensive' 'supplement' 'lick' 'appropriate' 'attachment'
 'mice' 'fetch' 'outer' '