In [216]:
import os
import numpy as np
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

In [217]:
data=pd.read_excel('canibalisation_set2.xlsx')[['str', 'mark']].drop_duplicates(keep='first').reset_index()

In [218]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39919 entries, 0 to 39918
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   39919 non-null  int64 
 1   str     39919 non-null  object
 2   mark    39919 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 935.7+ KB


In [219]:
from string import punctuation

def preprocess(text):
    text = text.lower()
    #text = ",".join([ch for ch in text])
    #text=text.replace("\t", "")
    all_reviews = text
    all_words = text.split(',')
    all_words = [word for word in all_words if word!='']
    return all_reviews, all_words

In [220]:
all_words=[]
for i in range(len(data)):
    all_words=all_words+preprocess(data['str'][i])[1]

In [221]:
all_words[:20]

['-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 'xада',
 '-',
 '-',
 '-']

In [223]:
data0=data[data['mark']==0]
data1=data[data['mark']==1]
data2=data[data['mark']==2]
print(len(data1), len(data2), len(data3))

3543 3351 19131


In [224]:
data_full=data1.append(data0[:3351])
data_full=data_full.append(data2[:3351])
data_full=shuffle(data_full, random_state=42).reset_index().drop('index', axis=1)

In [225]:
def text_labels_separate(train):
    labels=[]
    text=[]
    for i in range(len(train)):
        if train['mark'][i] == 0:
            labels.append(0)#субсид
            text.append(preprocess(train['str'][i])[0])
        elif train['mark'][i] == 1:
            labels.append(1)#
            text.append(preprocess(train['str'][i])[0])
        else: 
            labels.append(2)#
            text.append(preprocess(train['str'][i])[0])
       
    return text, labels
            

In [226]:
train_texts, train_labels= text_labels_separate(data_full)

In [227]:


word_counts = Counter(all_words)
word_list = sorted(word_counts, key=word_counts.get, reverse = True)


In [228]:
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)}


In [229]:
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}


In [230]:
vocab_to_int

{'-': 1,
 'да': 2,
 'др': 3,
 'xа': 4,
 'р': 5,
 'а': 6,
 'ада': 7,
 'xада': 8,
 'рдр': 9,
 'дрда': 10,
 'xадр': 11,
 'xадрда': 12,
 'адр': 13,
 'адрда': 14,
 'рда': 15,
 'рдрда': 16}

In [231]:
train_texts[0]

'-,-,да,ада,рдр,-,-,рдр,да,xада,да,-,-,да,-,да,-,-,да,рдр,-,др'

In [232]:
encoded_train = [[vocab_to_int[word] for word in review.split(',') if word!=''] for review in train_texts]

In [233]:
encoded_train[0]

[1, 1, 2, 7, 9, 1, 1, 9, 2, 8, 2, 1, 1, 2, 1, 2, 1, 1, 2, 9, 1, 3]

In [234]:
length = []
for review in encoded_train:
    length.append(len(review))

print('max:', max(length), 'mean:' , np.mean(length), np.std(length))

max: 106 mean: 23.576476329917032 16.10757919263777


In [235]:
def pad_text(encoded_train, length):
    
    strings = []
    
    for string in encoded_train:
        if len(string) <= length:
            strings.append([0]*(length-len(string)) + string )
        else:
            strings.append(string[:length])
        
    return np.array(strings)


padded_train = pad_text(encoded_train, length = 106)

In [236]:
padded_train[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 7,
       9, 1, 1, 9, 2, 8, 2, 1, 1, 2, 1, 2, 1, 1, 2, 9, 1, 3])

In [237]:
train_labels = np.array( [label for idx, label in enumerate(train_labels) if len(padded_train[idx]) > 0] ).astype(float)
strings_train = [string for string in padded_train if len(string) > 0]

In [238]:
valid_ratio = 0.2
test_ratio=0.2
total = len(strings_train)
train_cutoff = int(total * (1- (test_ratio+valid_ratio)))
valid_cutoff = int(total* (1 - test_ratio))

train_x, train_y = torch.Tensor(strings_train[:train_cutoff]), torch.Tensor(train_labels[:train_cutoff])
valid_x, valid_y = torch.Tensor(strings_train[train_cutoff:valid_cutoff]), torch.Tensor(train_labels[train_cutoff:valid_cutoff])
test_x, test_y = torch.Tensor(strings_train[valid_cutoff:]), torch.Tensor(train_labels[valid_cutoff:])



In [239]:
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)



In [240]:
batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [241]:
from torch import nn

class CNN(nn.Module): 
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_hidden = n_hidden   # number of layers 
          
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.layer1 = nn.Sequential( nn.Conv1d(106, 250, kernel_size=2, stride=1), 
            nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential( nn.Conv1d(250, 100, kernel_size=5, stride=4, padding=2), 
            nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2))
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(n_hidden, n_output)
        #self.sigmoid = nn.Sigmoid()
        self.soft=nn.LogSoftmax()
    
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        out = self.layer1(embedded_words)          # (batch_size, seq_length, n_kernel)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)  # (batch_size*seq_length, n_hidden)
        out = self.dropout(out) 
        fc_out = self.fc(out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.soft(fc_out)              # (batch_size*seq_length, n_output)
        #sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        #sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_out
    
    

In [242]:
n_vocab = len(vocab_to_int)+1
n_embed = 50
n_hidden = 300
n_output = 3   

In [243]:
from torch import optim
net = CNN(n_vocab, n_embed, n_hidden, n_output)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001 )


In [244]:
print_every = 200
step = 0
n_epochs = 100  # validation loss increases from ~ epoch 3 or 4
#clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(n_epochs):
    
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device).long(), labels.to(device)
        
        
        
        
        net.zero_grad()
        output= net(inputs)
        loss = criterion(output, labels.long())
        loss.backward()
        #nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device).long(), labels.to(device)
        
                
                
                v_output = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.long())
                valid_losses.append(v_loss.item())
                

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()





Epoch: 2/100 Step: 200 Training Loss: 0.3851 Validation Loss: 0.3449
Epoch: 4/100 Step: 400 Training Loss: 0.3326 Validation Loss: 0.3190
Epoch: 5/100 Step: 600 Training Loss: 0.1790 Validation Loss: 0.1509
Epoch: 7/100 Step: 800 Training Loss: 0.3452 Validation Loss: 0.2726
Epoch: 9/100 Step: 1000 Training Loss: 0.2941 Validation Loss: 0.2227
Epoch: 10/100 Step: 1200 Training Loss: 0.2820 Validation Loss: 0.2049
Epoch: 12/100 Step: 1400 Training Loss: 0.1584 Validation Loss: 0.0696
Epoch: 14/100 Step: 1600 Training Loss: 0.1896 Validation Loss: 0.1036
Epoch: 15/100 Step: 1800 Training Loss: 0.1637 Validation Loss: 0.0626
Epoch: 17/100 Step: 2000 Training Loss: 0.1447 Validation Loss: 0.1183
Epoch: 18/100 Step: 2200 Training Loss: 0.0811 Validation Loss: 0.0773
Epoch: 20/100 Step: 2400 Training Loss: 0.0843 Validation Loss: 0.0461
Epoch: 22/100 Step: 2600 Training Loss: 0.0792 Validation Loss: 0.0173
Epoch: 23/100 Step: 2800 Training Loss: 0.0554 Validation Loss: 0.0291
Epoch: 25/100 S

In [245]:
net.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for test_in, labels in test_loader:
        outputs = net(test_in.long())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the', total, 'test inputs: {} %'.format((correct / total) * 100))




Test Accuracy of the model on the 2049 test inputs: 95.55880917520741 %


In [246]:
data3=data[data['mark']==3].reset_index()
for i in range(len(data0)):
    zero=preprocess(data0['str'][i])[0]
    encoded_zero = [vocab_to_int[word] for word in zero.split(',') if word!=''] 
    length = 106
    padded_zero =np.array(( [0]*(length-len(encoded_zero)) + encoded_zero))
    net.eval()
    with torch.no_grad():
        output = net(torch.from_numpy(padded_zero).unsqueeze(0).long()).data.view(3)
        
    print('out probabilities: {:.2f} %, {:.2f} %, {:.2f} % '.format(np.exp(output)[0], np.exp(output)[1], np.exp(output)[2]))

out probabilities: 1.00 %, 0.00 %, 0.00 % 
out probabilities: 1.00 %, 0.00 %, 0.00 % 




KeyError: 2

In [247]:
#torch.save(net.state_dict(), 'canibalisation_cnn.pth')

In [99]:
zero=preprocess(data0['str'][66488])[0]

In [100]:
encoded_zero = [vocab_to_int[word] for word in zero.split(',,')] 

In [101]:
length = 106
padded_zero =np.array(( [0]*(length-len(encoded_zero)) + encoded_zero))


In [102]:
net.eval()
with torch.no_grad():
    output = net(torch.from_numpy(padded_zero).unsqueeze(0).long()).data
print(np.exp(output))

tensor([[1.0000e+00, 1.7423e-20, 1.9974e-16]])


