In [85]:
import os

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

### Предсказание модели для тестовой выборки

In [12]:
import re

def clean_text(x):
    """Normalise one raw message before it is tokenised.

    Steps, in order: lower-case the text, expand contractions with
    ``replace_contractions``, delete every character rejected by the
    whitelist regex below, then mask digit runs with ``#`` characters
    (5 or more digits -> ``#####``, 4 -> ``####``, 3 -> ``###``, 2 -> ``##``;
    a lone digit is kept as is).

    Args:
        x: the raw text (a ``str``).

    Returns:
        The cleaned text.
    """
    x = replace_contractions(x.lower())

    # NOTE(review): the range `A-z` covers ASCII 65-122, so besides letters it
    # also keeps [ \ ] ^ _ and the backtick. `A-Z` was probably intended, but
    # changing it alters the cleaned text, so anything fitted on the current
    # output (tokenizer, saved model) would have to be rebuilt - confirm first.
    x = re.sub(r'[^a-zA-z0-9\s]', '', x)

    # Nothing left to mask when the text holds no digit at all.
    if re.search(r'\d', x) is None:
        return x

    # Order matters: longer runs must be masked before shorter ones.
    digit_masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for run_pattern, mask in digit_masks:
        x = re.sub(run_pattern, mask, x)
    return x

# English contractions mapped to their expanded forms; the keys become the
# alternatives of the regex built by _get_contractions.
# NOTE(review): clean_text lower-cases its input before expanding contractions,
# so the capitalised keys ("I'd", "I'm", ...) cannot match on that path.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re
# Lookup table and compiled pattern shared by replace_contractions.
contractions, contractions_re = _get_contractions(contraction_dict)


def replace_contractions(text):
    """Return ``text`` with every match of ``contractions_re`` replaced by its
    expansion from ``contractions``."""
    return contractions_re.sub(lambda found: contractions[found.group(0)], text)

In [87]:
# Vocabulary size handed to the tokenizer. It is assigned here as well because
# this cell runs before the model-configuration cell that also sets it; without
# it a fresh "Restart & Run All" stops with a NameError. The value has to equal
# the number of rows of the saved model's embedding table (16958).
max_features = 16958

# Load the two training files and join them on 'id'. The merged frame is
# expected to provide the 'message' and 'category' columns used below.
train_solution = pd.read_csv("train_solution.csv")
train_data = pd.read_csv("train_data.csv")
train = train_data.merge(train_solution, on='id')

# Cleaned text of every message (see clean_text).
train['tokenized'] = train['message'].apply(clean_text)

# Stratified 75/25 hold-out split with a fixed seed so it is reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    train['tokenized'], train['category'],
    stratify=train['category'],
    test_size=0.25, random_state=42)

# The tokenizer is fitted on the training part only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

# Encode the category labels as integers; `le.classes_` maps them back.
le = LabelEncoder()
train_y = le.fit_transform(train_y.values)
test_y = le.transform(test_y.values)

In [97]:
# Vocabulary size: passed to Tokenizer(num_words=...) and used as the number of
# rows of the model's embedding table.
max_features = 16958
# Length of one embedding vector; also the width of every convolution kernel.
embed_size = 300
# Passed as `maxlen` to pad_sequences before a text is fed to the model.
maxlen = 3160

class CNN_Text(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions of
    several heights -> max-over-time pooling -> dropout -> linear layer.

    All constructor arguments are optional; the defaults reproduce the
    original hard-coded configuration, so ``CNN_Text()`` builds the same
    modules, in the same order and under the same attribute names, as before.

    Args:
        vocab_size: rows of the embedding table; defaults to the module-level
            ``max_features``.
        embedding_dim: length of an embedding vector; defaults to the
            module-level ``embed_size``.
        filter_sizes: heights (in tokens) of the convolution kernels.
        num_filters: output channels of each convolution.
        n_classes: number of output logits.
        dropout: dropout probability applied before the linear layer.
    """

    def __init__(self, vocab_size=None, embedding_dim=None,
                 filter_sizes=(1, 2, 3, 5), num_filters=36, n_classes=3,
                 dropout=0.15):
        super().__init__()
        # Globals are looked up only when no explicit size is given.
        if vocab_size is None:
            vocab_size = max_features
        if embedding_dim is None:
            embedding_dim = embed_size
        filter_sizes = list(filter_sizes)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Each kernel spans the full embedding width and K consecutive tokens.
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (K, embedding_dim)) for K in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, n_classes)

    def forward(self, x):
        """Return the class logits for a batch of token-id sequences.

        Args:
            x: integer tensor; assumed to be (batch, seq_len) with seq_len at
                least as large as the biggest filter size.

        Returns:
            Tensor of shape (batch, n_classes) with unnormalised scores.
        """
        features = self.get_embedding(x)
        features = self.dropout(features)
        return self.fc1(features)

    # text embedding
    def get_embedding(self, x):
        """Return the pooled convolutional features of each sequence, i.e. the
        representation that ``forward`` feeds (after dropout) to the linear
        layer.

        Args:
            x: integer tensor; assumed to be (batch, seq_len).

        Returns:
            Tensor of shape (batch, len(filter_sizes) * num_filters).
        """
        x = self.embedding(x)   # (batch, seq_len, embedding_dim)
        x = x.unsqueeze(1)      # add the single input channel Conv2d expects
        # Each convolution collapses the embedding axis, which is then squeezed.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        # Max over all positions: one value per filter.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        return torch.cat(x, 1)

In [99]:
# Run the model on the CPU.
model = CNN_Text()
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source. The absolute "/content/..." path is environment-specific.
model.load_state_dict(torch.load("/content/model_new1.pt", map_location=torch.device('cpu')))  # map_location puts the saved tensors on the CPU
# Switch to evaluation mode (dropout off); as the last expression of the cell
# this also displays the module structure.
model.eval()

CNN_Text(
  (embedding): Embedding(16958, 300)
  (convs1): ModuleList(
    (0): Conv2d(1, 36, kernel_size=(1, 300), stride=(1, 1))
    (1): Conv2d(1, 36, kernel_size=(2, 300), stride=(1, 1))
    (2): Conv2d(1, 36, kernel_size=(3, 300), stride=(1, 1))
    (3): Conv2d(1, 36, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.15, inplace=False)
  (fc1): Linear(in_features=144, out_features=3, bias=True)
)

In [95]:
def predict(x):
    """Predict the category of a single raw message.

    The text is cleaned, converted to token ids with the fitted ``tokenizer``,
    padded to ``maxlen`` and passed through ``model``.

    Args:
        x: the raw message text (a ``str``).

    Returns:
        The predicted class label, taken from ``le.classes_``.
    """
    x = clean_text(x)
    x = tokenizer.texts_to_sequences([x])
    x = pad_sequences(x, maxlen=maxlen)
    x = torch.tensor(x, dtype=torch.long)
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        logits = model(x)
    # dim is given explicitly; relying on the implicit dimension is deprecated
    # and emits a warning on every call. Rows are samples, columns are classes.
    probabilities = F.softmax(logits, dim=1).cpu().numpy()
    pred = probabilities.argmax(axis=1)
    pred = le.classes_[pred]
    # A single text was passed in, so there is exactly one prediction.
    return pred[0]

In [103]:
# Load the test set; its 'message' column is classified in the next cell.
test_data_new = pd.read_csv("test_data.csv")

In [None]:
# Classify every test message; the predicted label is stored in 'category'.
test_data_new['category'] = test_data_new['message'].apply(predict)

In [None]:
# The submission holds every column except the raw text.
# errors="ignore" keeps the cell re-runnable: on a second run "message" has
# already been dropped and a plain drop would raise KeyError.
test_data_new = test_data_new.drop(columns=["message"], errors="ignore")

# Write the frame as test_data.csv inside the archive test_data2.zip.
compression_opts = dict(method='zip', archive_name='test_data.csv')
test_data_new.to_csv('test_data2.zip', index=False, compression=compression_opts)

### Получение эмбеддингов текстов из обучающей выборки и эмбеддинга для фразы "My future"

In [92]:
# Text embedding
def get_emb(x, is_cleaned=False):
  """Return the model's feature vector for one text.

  Args:
    x: the text (a ``str``).
    is_cleaned: pass True when ``x`` has already been through ``clean_text``;
      cleaning is then skipped.

  Returns:
    The tensor produced by ``model.get_embedding`` for this single text
    (2-D, one row). It is computed without gradient tracking, so it carries no
    autograd graph and can be stored or converted to NumPy directly.
  """
  if not is_cleaned:
    x = clean_text(x)
  x = tokenizer.texts_to_sequences([x])
  x = pad_sequences(x, maxlen=maxlen)
  x = torch.tensor(x, dtype=torch.long)
  # Feature extraction only: without no_grad every returned tensor would keep
  # its whole computation graph alive.
  with torch.no_grad():
    return model.get_embedding(x)

In [83]:
# Feature vector of the query phrase (raw text, so get_emb cleans it first).
my_future_emb = get_emb("My future")

Признаковое представление для фразы "My future"

In [84]:
# Display the feature vector of the phrase.
my_future_emb

tensor([[0.5005, 0.7324, 0.0200, 0.0000, 0.4328, 0.0874, 0.6357, 0.0000, 0.4468,
         0.0000, 0.5672, 0.6594, 2.5821, 0.3509, 0.5934, 0.0000, 0.6595, 3.3656,
         0.0000, 1.1884, 0.1779, 0.5806, 1.1113, 0.0000, 0.8545, 0.4795, 0.0000,
         3.9702, 0.2404, 0.6566, 0.0745, 0.6073, 0.3120, 0.1291, 0.6322, 0.0000,
         3.3630, 1.5500, 0.5827, 0.1330, 0.0844, 0.7993, 0.3649, 2.4431, 0.0544,
         0.0000, 1.3672, 0.0000, 0.8623, 0.0000, 0.2996, 0.0000, 0.1910, 2.3301,
         2.8495, 1.8663, 0.1881, 0.3238, 0.0535, 0.0000, 1.3020, 0.5230, 0.0000,
         0.8869, 0.5742, 0.8364, 1.8859, 2.1713, 0.0692, 2.4143, 2.0217, 0.0310,
         2.1670, 1.3856, 1.0410, 2.3380, 0.2244, 0.0000, 0.0000, 0.0000, 3.3635,
         0.0000, 2.6294, 0.0000, 0.0000, 1.9361, 0.3242, 0.0000, 1.6005, 0.9955,
         0.0000, 0.0000, 1.8530, 0.3508, 1.8784, 0.8346, 0.0000, 0.0186, 3.8435,
         0.0000, 0.6887, 0.9910, 0.0000, 0.0000, 0.3850, 0.0000, 0.0000, 0.0000,
         0.0000, 2.0138, 0.0

In [69]:
# Compute the embedding of every text in the training set, keyed by the
# text's id. The 'tokenized' column already holds cleaned text, so get_emb is
# told to skip cleaning.
emb_of_texts = {
  row['id']: get_emb(row['tokenized'], is_cleaned=True)
  for _, row in train.iterrows()
}

In [77]:
# The nearest text to a query is the one whose cosine similarity with the
# query is the highest - an analogue of most_similar in word2vec.
def get_most_similar(word):
  """Find the training message whose embedding is closest to that of ``word``.

  Args:
    word: the query text (raw; ``get_emb`` cleans it).

  Returns:
    ``(nearest_text, max_similarity)``: the 'message' of the best-matching row
    of ``train`` and its cosine similarity. The first row wins a tie. When no
    training text has a similarity above 0, or ``train`` is empty, the result
    is ``("", 0)``.
  """
  if len(train) == 0:
    return "", 0

  # detach(): sklearn converts its inputs to NumPy arrays, which fails for
  # tensors that still require grad.
  query = get_emb(word).detach().cpu().numpy()
  # One matrix with a row per training text, in the row order of `train`.
  candidates = torch.cat(
      [emb_of_texts[text_id].detach().cpu() for text_id in train['id']]).numpy()

  # A single batched call instead of one sklearn call per training row.
  similarities = cosine_similarity(query, candidates)[0]
  best = int(similarities.argmax())  # argmax returns the first maximum
  max_similarity = similarities[best]
  if not max_similarity > 0:
    return "", 0
  return train['message'].iloc[best], max_similarity

In [78]:
# Look up the training message closest to the phrase "My future".
nearest_text, max_similarity = get_most_similar("My future")

In [79]:
# Display the closest training message.
nearest_text

'my bad'

In [80]:
# Display its cosine similarity to the query.
max_similarity

0.9678832

Надеюсь, модель врет))