In [1]:
from tensorboardX import SummaryWriter

In [2]:
import torch
from torchtext import data

# Fixed seed so that torch's CPU and CUDA random draws repeat across runs.
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT tokenizes raw reviews with spaCy; LABEL stores the target as a float
# tensor (the loss used later, BCEWithLogitsLoss, is fed float targets).
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [3]:
from torchtext import datasets
# Load the IMDB train/test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
# Sanity check: sizes of the two splits loaded above.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [5]:
import random

# Seed Python's RNG and hand its *state* to split(). random.seed() returns
# None, so passing its return value as random_state left the split unseeded
# and therefore different on every run.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [6]:
# Sizes after carving a validation set out of the training split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [7]:
# Build both vocabularies from the training split only. The TEXT vocabulary is
# capped at 25,000 tokens and gets the pre-trained "glove.6B.100d" vectors
# attached; unk_init=torch.Tensor.normal_ is passed so that tokens missing
# from GloVe are presumably drawn from a normal distribution — confirm
# against the torchtext Vocab documentation.
# (Variant without pre-trained vectors: TEXT.build_vocab(train_data, max_size=25000).)
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [8]:
# Vocabulary sizes. The TEXT count printed below is max_size + 2, i.e. two
# extra entries — presumably the <unk> and <pad> specials.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [9]:
BATCH_SIZE = 64

# Prefer the GPU this notebook was developed on (cuda:3), but stay runnable on
# machines with fewer GPUs or none at all instead of failing on first use.
if torch.cuda.is_available():
    gpu_index = 3 if torch.cuda.device_count() > 3 else 0
    device = torch.device('cuda:{}'.format(gpu_index))
else:
    device = torch.device('cpu')

# One iterator per split, all placing their batches on `device`.
# repeat=False makes each iterator stop after a single pass over its split.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    repeat=False)

In [11]:
# Pull a single batch from the training iterator for inspection.
batch = next(iter(train_iterator))

In [12]:
batch.text

tensor([[ 148,   11,   14,  ...,   11, 3136,    0],
        [2289,  170, 7425,  ...,   76,  494,   23],
        [   3,   82,    8,  ...,  617,   25,   65],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:3')

In [13]:
# Vocabulary index of the padding token.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
# Exploratory mask: nonzero where the batch holds padding. This is the
# opposite polarity of the mask built inside train()/evaluate(), which is
# 1. for real tokens and 0. for padding.
mask = batch.text == PAD_IDX
mask

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:3', dtype=torch.uint8)

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class WordAVGModel(nn.Module):
    """Bag-of-embeddings classifier.

    Averages the embeddings of the non-padded tokens of each sequence and
    feeds the averaged vector through a single linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        output_dim: number of outputs of the linear layer.
        pad_idx: index of the padding token (passed to nn.Embedding as padding_idx).
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text, mask):
        """Return one row of `output_dim` logits per sequence.

        Args:
            text: integer token indices, [batch_size, seq_len].
            mask: float tensor of the same shape, 1. for real tokens and 0. for padding.
        """
        embedded = self.embedding(text)  # [batch_size, seq_len, emb_dim]
        summed = torch.sum(embedded * mask.unsqueeze(2), 1)  # [batch_size, emb_dim]
        token_counts = mask.sum(1).unsqueeze(1)  # [batch_size, 1]
        # A row made only of padding has a count of 0; dividing by it would
        # yield NaN logits. Its summed embedding is already all zeros, so
        # dividing by 1 instead gives a well-defined zero sentence vector.
        token_counts = token_counts.masked_fill(token_counts == 0, 1)
        sent_embed = summed / token_counts  # [batch_size, emb_dim]
        return self.fc(sent_embed)

In [15]:
# TensorBoard writer used by the WordAVG training loop, created with the
# path './logss/'.
# NOTE(review): 'logss' may be a typo for 'logs' (the later writer uses './log/') — confirm.
writer = SummaryWriter('./logss/')

In [16]:
# Hyper-parameters for the WordAVG model.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100  # same width as the "glove.6B.100d" vectors loaded into the vocab
OUTPUT_DIM = 1  # a single logit per review

# NOTE(review): dummy_input is not referenced by any other visible cell —
# presumably a leftover from a writer.add_graph experiment; confirm before removing.
dummy_input = torch.rand(13, 1, 28)

model = WordAVGModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [17]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of the current `model`, with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,500,301 trainable parameters


In [18]:
# Overwrite the freshly initialised embedding matrix with the vectors attached
# to the TEXT vocabulary. copy_ is the last expression of the cell, so the
# notebook echoes the tensor it returns.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4205, -0.2041,  0.0900,  ...,  0.5825, -0.5669,  0.2929],
        [ 0.3209,  0.5566,  0.1307,  ...,  0.1149, -0.3990,  0.1511],
        [ 0.8599, -0.3779, -0.1766,  ...,  0.8575,  0.5670, -0.9255]])

In [19]:
# Vocabulary index of the unknown-word token.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the <unk> and <pad> embedding rows to all zeros after the copy above,
# so neither keeps the vector it received from the vocabulary.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [20]:
import torch.optim as optim

# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())
# The loss takes raw logits; the sigmoid is applied separately in
# binary_accuracy() and predict_sentiment().
criterion = nn.BCEWithLogitsLoss()
# Move both the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)


In [21]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in one batch (8 of 10 right gives 0.8, NOT 8).

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # cast the boolean hits to float so they can be summed and divided
    hits = (hard_labels == y).float()
    n_items = len(hits)
    return hits.sum() / n_items

In [22]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch's text is transposed to [batch_size, seq_len], a float mask
    (1. for real tokens, 0. for PAD_IDX) is built, and the model is updated
    with one optimizer step. The loss of every 100th batch is printed.

    Returns:
        (mean batch loss, mean batch accuracy) over the whole iterator.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for step, batch in enumerate(iterator):
        optimizer.zero_grad()

        tokens = batch.text.permute(1, 0)  # [batch_size, seq_len]
        keep = 1. - (tokens == PAD_IDX).float()  # [batch_size, seq_len]
        targets = batch.label.float()

        logits = model(tokens, keep).squeeze(1)
        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print("batch {}, loss {}".format(step, loss.item()))

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [23]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    The model is switched to eval mode and gradients are disabled. Batches
    are prepared as in train(): text transposed to [batch_size, seq_len] plus
    a float mask that is 0. at PAD_IDX positions. The loss of every 100th
    batch is printed.

    Returns:
        (mean batch loss, mean batch accuracy) over the whole iterator.
    """
    model.eval()
    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for step, batch in enumerate(iterator):
            tokens = batch.text.permute(1, 0)  # [batch_size, seq_len]
            keep = 1. - (tokens == PAD_IDX).float()  # [batch_size, seq_len]
            targets = batch.label.float()

            logits = model(tokens, keep).squeeze(1)
            loss = criterion(logits, targets)
            acc = binary_accuracy(logits, targets)

            if step % 100 == 0:
                print("batch {}, loss {}".format(step, loss.item()))

            running_loss += loss.item()
            running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [24]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [25]:
N_EPOCHS = 6

# Lowest validation loss seen so far; drives checkpointing below.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Per-epoch scalars for TensorBoard.
    writer.add_scalar('Evaluate_Accuracy', valid_acc, epoch)
    writer.add_scalar('Train_Accuracy', train_acc, epoch)
    writer.add_scalar('Evaluate_loss', valid_loss, epoch)
    # Fixed: this tag previously received valid_loss, so the training-loss
    # curve was a copy of the validation-loss curve.
    writer.add_scalar('Train_loss', train_loss, epoch)
    # Snapshot of every parameter tensor as a histogram.
    for name, param in model.named_parameters():
        writer.add_histogram(name + "_model", param.clone().cpu().data.numpy(), epoch)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the best epoch by validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'wordavg-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

batch 0, loss 0.6932681202888489
batch 100, loss 0.6702615022659302
batch 200, loss 0.6430895328521729
batch 0, loss 0.6194495558738708
batch 100, loss 0.6296879649162292
Epoch: 01 | Epoch Time: 0m 5s
	Train Loss: 0.663 | Train Acc: 67.02%
	 Val. Loss: 0.619 |  Val. Acc: 73.95%
batch 0, loss 0.6113342046737671
batch 100, loss 0.5600838661193848
batch 200, loss 0.5703293681144714
batch 0, loss 0.4727885127067566
batch 100, loss 0.5243251323699951
Epoch: 02 | Epoch Time: 0m 5s
	Train Loss: 0.545 | Train Acc: 78.95%
	 Val. Loss: 0.487 |  Val. Acc: 81.93%
batch 0, loss 0.3885486423969269
batch 100, loss 0.4778684377670288
batch 200, loss 0.4177061915397644
batch 0, loss 0.3946606516838074
batch 100, loss 0.4502289295196533
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.418 | Train Acc: 84.96%
	 Val. Loss: 0.394 |  Val. Acc: 85.27%
batch 0, loss 0.35466891527175903
batch 100, loss 0.37387752532958984
batch 200, loss 0.2992222011089325
batch 0, loss 0.40034162998199463
batch 100, loss 0.398997

In [None]:
import spacy
# spaCy English pipeline; predict_sentiment() only uses its tokenizer.
nlp = spacy.load('en')

def predict_sentiment(sentence, min_len=5):
    """Return the sigmoid of the current `model`'s logit for one raw sentence.

    Args:
        sentence: raw text; tokenized with the spaCy tokenizer `nlp.tokenizer`.
        min_len: shorter inputs are right-padded with the pad token up to this
            many tokens. The default of 5 equals the largest entry of
            FILTER_SIZES = [3, 4, 5], so the CNN's widest convolution always
            has a full window. Padded positions are masked out.

    Returns:
        A float in [0, 1].

    Raises:
        Whatever the tokenizer raises for non-string input; the batch-scoring
        loop in this notebook catches TypeError from this call.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += [TEXT.pad_token] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # The models expect a batch dimension: [batch_size=1, seq_len]. The
    # previous version built this batched tensor but then passed the 1-D one.
    text = torch.LongTensor(indexed).unsqueeze(0).to(device)
    mask = 1. - (text == PAD_IDX).float()  # [batch_size, seq_len]
    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        prediction = torch.sigmoid(model(text, mask))
    return prediction.item()


In [32]:
# Second TensorBoard writer, used by the CNN training loop.
# NOTE(review): tensorboardX documents `comment` as a suffix for the
# auto-generated default log directory; with an explicit `logdir` it
# presumably has no effect — confirm.
writer4 = SummaryWriter(logdir='./log/',comment='model1')

In [19]:
import torch.nn as nn
import torch.nn.functional as F

In [20]:
class CNN(nn.Module):
    """Convolutional sentence classifier with masked max-over-time pooling.

    One Conv2d per entry of `filter_sizes` slides a (filter_size x
    embedding_dim) window over the embedded sentence. Each feature map is
    max-pooled over time, the pooled vectors are concatenated, passed through
    dropout and then through a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: iterable of window heights (number of tokens per window).
        output_dim: number of outputs of the final linear layer.
        dropout: dropout probability applied to the concatenated features.
        pad_idx: index of the padding token (passed to nn.Embedding as padding_idx).
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        self.filter_sizes = filter_sizes
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, mask):
        """Return one row of `output_dim` logits per sentence.

        Args:
            text: integer token indices, [batch size, sent len]; sent len must
                be at least the largest filter size.
            mask: float tensor of the same shape, 1. for real tokens and 0. for padding.
        """
        embedded = self.embedding(text)  # [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)  # [batch size, 1, sent len, emb dim]

        pooled = []
        for conv in self.convs:
            # feature_map = [batch size, n_filters, sent len - filter size + 1]
            feature_map = F.relu(conv(embedded)).squeeze(3)
            n_windows = feature_map.shape[2]
            # A window is discarded when its first token is padding. Slicing
            # by n_windows (instead of `:-filter_size+1`) stays correct for
            # filter_size == 1, where the old slice `:-0` was empty. Comparing
            # with 0 yields a mask dtype that masked_fill accepts on both old
            # and current PyTorch, unlike an explicit .byte() cast.
            starts_on_pad = mask[:, :n_windows].unsqueeze(1) == 0
            feature_map = feature_map.masked_fill(starts_on_pad, -999999)
            # max over time -> [batch size, n_filters]
            pooled.append(F.max_pool1d(feature_map, n_windows).squeeze(2))

        # cat = [batch size, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)

In [30]:
# Hyper-parameters for the CNN model.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]  # window heights in tokens, one convolution per entry
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]


# Rebinds `model`: from this cell on, train(), evaluate() and
# predict_sentiment() operate on the CNN instead of the WordAVG model.
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
# Same embedding initialisation as for the WordAVG model: copy the vocabulary
# vectors, then zero the <unk> and <pad> rows.
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model = model.to(device)

In [33]:
# A fresh optimizer is required because `model` now refers to the CNN.
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

N_EPOCHS = 6

# Lowest validation loss seen so far; drives checkpointing below.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    # NOTE(review): only the accuracies are logged here (no loss scalars), and
    # the tags use spaces where the WordAVG loop uses underscores.
    writer4.add_scalar('Evaluate Accuracy',valid_acc, epoch)
    writer4.add_scalar('Train Accuracy',train_acc, epoch)
    # Snapshot of every parameter tensor as a histogram.
    for name, param in model.named_parameters():
            writer4.add_histogram(name + "_model", param.clone().cpu().data.numpy(), epoch)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch by validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'CNN-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

batch 0, loss 0.5202465653419495
batch 100, loss 0.36336982250213623
batch 200, loss 0.5160465240478516
batch 0, loss 0.3318127989768982
batch 100, loss 0.3935736417770386
Epoch: 01 | Epoch Time: 2m 14s
	Train Loss: 0.440 | Train Acc: 79.70%
	 Val. Loss: 0.372 |  Val. Acc: 83.90%
batch 0, loss 0.2637452185153961
batch 100, loss 0.22968274354934692
batch 200, loss 0.3143852651119232
batch 0, loss 0.2856375277042389
batch 100, loss 0.321858674287796
Epoch: 02 | Epoch Time: 2m 13s
	Train Loss: 0.318 | Train Acc: 86.66%
	 Val. Loss: 0.332 |  Val. Acc: 86.01%
batch 0, loss 0.2224854975938797
batch 100, loss 0.3282856345176697
batch 200, loss 0.17007586359977722
batch 0, loss 0.2776656150817871
batch 100, loss 0.311362624168396
Epoch: 03 | Epoch Time: 2m 12s
	Train Loss: 0.231 | Train Acc: 90.86%
	 Val. Loss: 0.322 |  Val. Acc: 86.69%
batch 0, loss 0.1589740514755249
batch 100, loss 0.09488784521818161
batch 200, loss 0.19736722111701965
batch 0, loss 0.26881128549575806
batch 100, loss 0.29

In [26]:
predict_sentiment("I love it")

1.0

In [50]:
predict_sentiment("It is bigger than I thought")

0.9999828338623047

In [1]:
import pandas as pd

In [2]:
# Load the reviews to be scored.
# NOTE(review): the frame is called `microwave_dat` although the file is
# hair_dryer_classify.csv — presumably a leftover name; confirm.
microwave_dat= pd.read_csv('hair_dryer_classify.csv')
microwave_dat.head()

Unnamed: 0,review_headline,review_body
0,Works great,Works great!
1,I love travel blow dryers because they are eas...,"This dries my hair faster that bigger, more po..."
2,Five Stars,Love this dryer!
3,Five Stars,styling hair in style
4,I think's great. The cord length is perfect,I just got this last week. I think's great. Th...


In [10]:
# All columns except the first one (the "Unnamed: 0" column shown by head()).
# NOTE(review): `tep` is not used by any other visible cell.
tep=microwave_dat.iloc[:,1:]

In [12]:
# The review_headline column wrapped in pd.Series.
# NOTE(review): apart from the head() preview, `tep2` is not used by any visible cell.
tep2=pd.Series(microwave_dat['review_headline'])

In [13]:
tep2.head()

0                                          Works great
1    I love travel blow dryers because they are eas...
2                                           Five Stars
3                                           Five Stars
4          I think's great. The cord length is perfect
Name: review_headline, dtype: object

In [86]:
microwave_dat.review_body[1]

"This dries my hair faster that bigger, more powerful models. I love travel blow dryers because they are easy to lift and they usually come in onesixzerozero w or less. Bigger dryers are heavy and blow my hair everywhere. This has a surprising amount of power and is very compact. I would give it a five except that the switch is not easy to turn on and off with one hand and it's noisier than I anticipated."

In [87]:
predict_sentiment(microwave_dat.review_body[1])

0.7890385985374451

In [88]:
len(microwave_dat.review_body)

11470

In [89]:
# Placeholder value; this column is overwritten with the predicted body
# scores in a later cell.
microwave_dat['classify_of_body']='11'

In [90]:
# Placeholder value; this column is overwritten with the predicted headline
# scores in a later cell.
microwave_dat['classify_of_title']='12'

In [91]:
microwave_dat.head()

Unnamed: 0,review_headline,review_body,classify_of_body,classify_of_title
0,Works great,Works great!,11,12
1,I love travel blow dryers because they are eas...,"This dries my hair faster that bigger, more po...",11,12
2,Five Stars,Love this dryer!,11,12
3,Five Stars,styling hair in style,11,12
4,I think's great. The cord length is perfect,I just got this last week. I think's great. Th...,11,12


In [96]:
# Accumulators for the per-row sentiment scores of headlines and bodies.
class_for_mw_title=list()
class_for_mw_body=list()

In [97]:
len(class_for_mw_body)

0

In [98]:
# Score every review body and headline with predict_sentiment().
# Both result lists are rebuilt here so that re-running this cell cannot leave
# stale entries from an earlier run (an earlier output in this notebook shows
# 11472 list entries for 11470 rows), which would break the column assignment.
class_for_mw_title = []
class_for_mw_body = []

review_pairs = zip(microwave_dat.review_body, microwave_dat.review_headline)
for i, (body, headline) in enumerate(review_pairs):
    try:
        class_for_mw_body.append(predict_sentiment(body))
    except TypeError:
        # Cell could not be tokenized (presumably a non-string such as a
        # missing value). Record the sentinel 999 to keep rows aligned.
        print("line in body:", i)
        class_for_mw_body.append(999)
    try:
        class_for_mw_title.append(predict_sentiment(headline))
    except TypeError:
        print("line in title:", i)
        class_for_mw_title.append(999)
    # Progress marker every 500 rows.
    if i % 500 == 0:
        print("current:", i)

current: 0
line in title: 273
current: 500
current: 1000
current: 1500
current: 2000
line in title: 2433
current: 2500
current: 3000
current: 3500
current: 4000
current: 4500
current: 5000
current: 5500
current: 6000
current: 6500
current: 7000
current: 7500
current: 8000
current: 8500
current: 9000
current: 9500
current: 10000
current: 10500
current: 11000


In [99]:
# Replace the placeholder columns with the collected scores; each list must
# hold exactly one entry per row of the frame.
microwave_dat['classify_of_body']=class_for_mw_body
microwave_dat['classify_of_title']=class_for_mw_title

In [95]:
len(class_for_mw_body)

11472

In [100]:
microwave_dat.head()

Unnamed: 0,review_headline,review_body,classify_of_body,classify_of_title
0,Works great,Works great!,1.0,1.0
1,I love travel blow dryers because they are eas...,"This dries my hair faster that bigger, more po...",0.789039,0.984481
2,Five Stars,Love this dryer!,0.050988,0.885015
3,Five Stars,styling hair in style,0.077389,0.885015
4,I think's great. The cord length is perfect,I just got this last week. I think's great. Th...,1.0,1.0


In [101]:
# Persist the scored reviews.
# NOTE(review): to_csv writes the index by default, and the frame already
# carries an "Unnamed: 0" column from an earlier round-trip — consider
# index=False if downstream readers do not rely on the extra column.
microwave_dat.to_csv("final_final_hair_dryer_dat.csv")

In [1]:
import pandas as pd

In [3]:
# NOTE(review): this read fails with a UnicodeDecodeError (see the cell
# output): the file is not valid UTF-8. Pass an explicit encoding= to
# read_csv once the file's real encoding has been confirmed.
microwave_dat= pd.read_csv('my_hair_dryer_dat.csv')
microwave_dat.head()

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 8-9: invalid continuation byte