In [1]:
import os
from collections import defaultdict
from textwrap import wrap

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup


# Fix the global NumPy and PyTorch seeds from a single constant.
# RANDOM_SEED is also passed as random_state to the train/validation/test split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1fe26bc6bf0>

In [2]:
# Load the user-review CSV and display five randomly sampled rows.
df = pd.read_csv('./data/googleplaystore_user_reviews.csv')
df.sample(5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
19597,"CBS Sports App - Scores, News, Stats & Watch Live",Another thumbs latest update. Bring back My Sc...,Positive,0.25,0.45
5476,"AliExpress - Smarter Shopping, Better Living",,,,
40979,"Eve Period Tracker - Love, Sex & Relationships...",,,,
9639,Avakin Life - 3D virtual world,,,,
19803,CBS Sports Fantasy,I like helps fantasy football needs,Neutral,0.0,0.0


In [3]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(64295, 5)

In [4]:
# Drop every row that has a missing value in any column, then re-check the size.
df = df.dropna()
df.shape

(37427, 5)

In [5]:
def to_sentiment(rating):
    """Convert a sentiment label into its integer class id.

    Args:
        rating: the label to encode (compared with ``==``).

    Returns:
        2 for 'Positive', 1 for 'Neutral', and 0 for any other value.
    """
    for label, class_id in (('Positive', 2), ('Neutral', 1)):
        if rating == label:
            return class_id
    # Everything that is not an exact 'Positive'/'Neutral' match falls through to 0.
    return 0

In [6]:
# Display names for the classes; list position matches the integer id that
# to_sentiment assigns (0, 1, 2).
class_names = ['Negative', 'Neutral', 'Positive']

# Encode the textual Sentiment column as an integer target column.
df['sentiment'] = df['Sentiment'].map(to_sentiment)

In [7]:
# Number of rows per encoded sentiment class.
df['sentiment'].value_counts()

2    23998
0     8271
1     5158
Name: sentiment, dtype: int64

In [8]:
# Hold out 10% of the rows, then split that hold-out in half into the
# validation and test sets. Both splits use the same RANDOM_SEED.
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [9]:
class GPReviewDataset(Dataset):
    """Dataset that pairs review texts with targets and tokenises each review on access."""

    def __init__(self, reviews, targets, tokenizer, max_len):
        """Keep references to the inputs.

        Args:
            reviews: indexable collection of review texts.
            targets: indexable collection of targets, aligned with ``reviews``.
            tokenizer: object exposing ``encode_plus``.
            max_len: sequence length passed to the tokenizer as ``max_length``.
        """
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Build the sample at position ``idx``.

        Returns:
            dict with 'review_text' (str), 'input_ids' and 'attention_mask'
            (flattened tokenizer tensors) and 'targets' (long tensor).
        """
        text = str(self.reviews[idx])
        label = self.targets[idx]

        # Ask the tokenizer to truncate and pad to max_len and to return
        # PyTorch tensors with an attention mask but no token type ids.
        tokenizer_options = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {'review_text': text}
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

In [10]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap a review DataFrame in a GPReviewDataset and return a DataLoader over it.

    Args:
        df: DataFrame with a ``Translated_Review`` column (texts) and a
            ``sentiment`` column (targets).
        tokenizer: tokenizer handed to GPReviewDataset.
        max_len: sequence length handed to GPReviewDataset.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data on every pass. Defaults
            to True, which is the behaviour existing callers get. Pass False
            for validation/test loaders when a stable sample order is needed
            (e.g. to line predictions up with the source rows).

    Returns:
        A torch DataLoader yielding the dict batches produced by GPReviewDataset.
    """
    ds = GPReviewDataset(
        # Plain NumPy arrays so the dataset can index by position.
        reviews=df.Translated_Review.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

In [11]:
# Run configuration shared by the tokenizer, the data loaders and the model.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 160
BATCH_SIZE = 32
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# One loader per split, built in train / validation / test order.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_df in (df_train, df_val, df_test)
)

In [12]:
class SentimentClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear output layer."""

    def __init__(self, n_classes):
        """Load the encoder named by PRE_TRAINED_MODEL_NAME and build the head.

        Args:
            n_classes: number of output units of the final linear layer.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        # The head's input width follows the encoder's configured hidden size.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one score per class for each item in the batch.

        Args:
            input_ids: token id tensor passed to the encoder.
            attention_mask: attention mask tensor passed to the encoder.
        """
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder output's pooler_output field
        # (the other field inspected during development was last_hidden_state).
        pooled = encoder_output.pooler_output
        return self.out(self.drop(pooled))

In [13]:
EPOCHS = 10

# Build the classifier and move it to the target device before the optimizer
# collects its parameters.
model = SentimentClassifier(len(class_names)).to(DEVICE)
loss_fn = nn.CrossEntropyLoss().to(DEVICE)

# NOTE(review): AdamW here is the transformers implementation imported at the
# top of the notebook; newer transformers releases steer users to
# torch.optim.AdamW instead -- confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)

# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with 'input_ids',
            'attention_mask' and 'targets' tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        (accuracy, mean_loss): accuracy is a 0-dim double tensor equal to the
        number of correct predictions divided by ``n_examples``; mean_loss is
        the NumPy mean of the per-batch losses (NaN if the loader is empty).
    """
    model = model.train()
    losses = []
    # Tensor accumulator so the final .double() also works when the loader
    # yields no batches (a plain int 0 has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        # Use the `device` argument like the other tensors, not the
        # notebook-level DEVICE global.
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)

        # Read the scalar once and reuse it for both bookkeeping and logging.
        batch_loss = loss.item()
        losses.append(batch_loss)

        print(f'loss: {batch_loss}')

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [15]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with "input_ids",
            "attention_mask" and "targets" tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: device every batch tensor is moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        (accuracy, mean_loss): correct predictions divided by ``n_examples``
        as a double tensor, and the NumPy mean of the per-batch losses.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            # Move the three tensors the loop needs onto the target device.
            on_device = {
                key: batch[key].to(device)
                for key in ("input_ids", "attention_mask", "targets")
            }
            labels = on_device["targets"]

            logits = model(
                input_ids=on_device["input_ids"],
                attention_mask=on_device["attention_mask"],
            )
            # Predicted class = index of the largest score along dim 1.
            predicted = torch.max(logits, dim=1).indices

            batch_loss = loss_fn(logits, labels)
            n_correct += (predicted == labels).sum()
            batch_losses.append(batch_loss.item())

    return n_correct.double() / n_examples, np.mean(batch_losses)

In [16]:
# Where the best checkpoint (highest validation accuracy so far) is written.
BEST_MODEL_PATH = './output/best_model_state_sentiment_analysis.bin'
# torch.save does not create missing directories. Make sure the target folder
# exists up front so a finished epoch is not lost to a failed save.
os.makedirs(os.path.dirname(BEST_MODEL_PATH), exist_ok=True)

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1} / {EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        DEVICE,
        scheduler,
        len(df_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        DEVICE,
        len(df_val)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Keep only the weights of the epoch with the best validation accuracy.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        best_accuracy = val_acc

Epoch 1 / 10
----------
loss: 1.0998482704162598
loss: 1.0257536172866821
loss: 0.8555214405059814
loss: 1.0794246196746826
loss: 0.8290035724639893
loss: 1.1954704523086548
loss: 0.9228551983833313
loss: 0.932999312877655
loss: 1.2336902618408203
loss: 1.0865261554718018
loss: 0.8022121787071228
loss: 0.8710868954658508
loss: 0.8731654286384583
loss: 0.8886054754257202
loss: 0.6363863945007324
loss: 0.8209210634231567
loss: 0.7745426893234253
loss: 0.7188626527786255
loss: 0.623023509979248
loss: 0.514590322971344
loss: 0.9388572573661804
loss: 0.558646023273468
loss: 0.4408569931983948
loss: 0.46329036355018616
loss: 0.5616952776908875
loss: 0.7187432646751404
loss: 0.6439228653907776
loss: 0.6267889738082886
loss: 0.6478189826011658
loss: 0.6870012879371643
loss: 0.776525616645813
loss: 0.8739638328552246
loss: 0.9963911771774292
loss: 0.7910602688789368
loss: 0.7546226382255554
loss: 0.6884870529174805
loss: 0.7469612956047058
loss: 0.9563114643096924
loss: 0.7594798803329468
loss:

loss: 0.37872254848480225
loss: 0.40130415558815
loss: 0.17757554352283478
loss: 0.3654423952102661
loss: 0.2542516887187958
loss: 0.234462171792984
loss: 0.2518489956855774
loss: 0.24082051217556
loss: 0.8012966513633728
loss: 0.37354370951652527
loss: 0.20881789922714233
loss: 0.362024188041687
loss: 0.4565296173095703
loss: 0.1774483174085617
loss: 0.6393144726753235
loss: 0.4533521234989166
loss: 0.4030841290950775
loss: 0.39387187361717224
loss: 0.31110256910324097
loss: 0.49345630407333374
loss: 0.32125985622406006
loss: 0.5770620107650757
loss: 0.3079553544521332
loss: 0.30774471163749695
loss: 0.5273316502571106
loss: 0.4616997539997101
loss: 0.2671037018299103
loss: 0.126154825091362
loss: 0.32338041067123413
loss: 0.345729261636734
loss: 0.565685510635376
loss: 0.46108904480934143
loss: 0.4002176523208618
loss: 0.48269960284233093
loss: 0.2812097668647766
loss: 0.4920036792755127
loss: 0.3341120779514313
loss: 0.3091096878051758
loss: 0.3228503465652466
loss: 0.40493699908256

loss: 0.5118144750595093
loss: 0.28045228123664856
loss: 0.3010563254356384
loss: 0.5638068318367004
loss: 0.3730457127094269
loss: 0.32644888758659363
loss: 0.48367175459861755
loss: 0.26214277744293213
loss: 0.10099417716264725
loss: 0.23013441264629364
loss: 0.24648432433605194
loss: 0.1591026484966278
loss: 0.5429048538208008
loss: 0.46341872215270996
loss: 0.5609012842178345
loss: 0.17978313565254211
loss: 0.3208411931991577
loss: 0.1816805750131607
loss: 0.25484800338745117
loss: 0.5447784662246704
loss: 0.2510116994380951
loss: 0.14974543452262878
loss: 0.311143696308136
loss: 0.26976948976516724
loss: 0.2073831409215927
loss: 0.2715219557285309
loss: 0.2183004468679428
loss: 0.47295045852661133
loss: 0.11962057650089264
loss: 0.3567597568035126
loss: 0.31401410698890686
loss: 0.08785401284694672
loss: 0.9298037886619568
loss: 0.1893470138311386
loss: 0.24222244322299957
loss: 0.6867036819458008
loss: 0.5730746388435364
loss: 0.0911240354180336
loss: 0.39112746715545654
loss: 0.

loss: 0.20863237977027893
loss: 0.46015414595603943
loss: 0.3810824155807495
loss: 0.3067469894886017
loss: 0.4376849830150604
loss: 0.09203136712312698
loss: 0.38371723890304565
loss: 0.3144679367542267
loss: 0.056572262197732925
loss: 0.26599428057670593
loss: 0.22117403149604797
loss: 0.038384970277547836
loss: 0.46516862511634827
loss: 0.23790960013866425
loss: 0.6048957109451294
loss: 0.2064603567123413
loss: 0.06574566662311554
loss: 0.24415719509124756
loss: 0.07812011241912842
loss: 0.07763116806745529
loss: 0.48888421058654785
loss: 0.3533344268798828
loss: 0.23894521594047546
loss: 0.24710267782211304
loss: 0.19971421360969543
loss: 0.23147860169410706
loss: 0.6027262806892395
loss: 0.2758195102214813
loss: 0.15562167763710022
loss: 0.13099297881126404
loss: 0.12305864691734314
loss: 0.15066266059875488
loss: 0.043217603117227554
loss: 0.4766846299171448
loss: 0.6068283915519714
loss: 0.3966781198978424
loss: 0.31823211908340454
loss: 0.06393805146217346
loss: 0.3460142314434

NameError: name 'correct_prredictions' is not defined