In [1]:
from gensim.models.keyedvectors import KeyedVectors
from tqdm import tqdm
import numpy as  np
import os, random
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
from nltk.stem import SnowballStemmer

model = r"SBW-vectors-300-min5.txt"
train=r"./data/haha_2019_train_preprocessed_lemmatized.csv"
test=r"./data/haha_2019_test_preprocessed_lemmatized.csv"

[nltk_data] Downloading package punkt to /home/ors/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#text preprocessing
stemmer = SnowballStemmer('spanish')

def clean_text(text):
    text = text.replace("\\", " ").replace(u"╚", " ").replace(u"╩", " ")
    text = text.lower()
    text = re.sub('\-\s\r\n\s{1,}|\-\s\r\n|\r\n', '', text) 
    text = re.sub('[¡¿.,:;_%©?*,!@#$%^&()\d]|[+=]|[[]|[]]|[/]|"|\s{2,}|-', ' ', text)
    words = text.split()
    words = [w for w in words if len(w)>=3]
    stop_words = set(stopwords.words('spanish'))
    words = [w for w in words if not w in stop_words]
    text=' '.join(words)
    tokens = word_tokenize(text)
    stemmed = [stemmer.stem(i) for i in tokens]
    return stemmed

In [3]:
#read csv

values= pd.read_csv(train, sep=',', header=None, encoding = 'utf-8-sig').values
np.random.seed(42)
np.random.shuffle(values)
#df=pd.DataFrame(values)

m = len(values)

train_length = int(0.9 * m)
train_data, test_data = values[:train_length], values[train_length:]

df=pd.DataFrame(train_data)

texts_train=df[1].tolist()
scores_train=df[9].tolist()
categories_train=[1 if str(s)!='nan' else 0 for s in scores_train]

df=pd.DataFrame(test_data)

texts_test=df[1].tolist()
scores_test=df[9].tolist()
categories_test=[1 if str(s)!='nan' else 0 for s in scores_test]
'''
texts=df[1].tolist()
scores=df[9].tolist()

categories=[1 if str(s)!='nan' else 0 for s in scores]

df = pd.read_csv(test, sep=',', header=None)
texts_test=df[1].tolist()
'''

"\ntexts=df[1].tolist()\nscores=df[9].tolist()\n\ncategories=[1 if str(s)!='nan' else 0 for s in scores]\n\ndf = pd.read_csv(test, sep=',', header=None)\ntexts_test=df[1].tolist()\n"

In [4]:
words=set()#set of all words
for text in texts_train:
    words_text=clean_text(text)
    words.update(words_text)
print("number of words: {0}".format(len(words)))

number of words: 15613


In [5]:
embdict=dict()#dictionary for words and emb
index=0

with open(model,'r',encoding = 'utf-8-sig')as f:
    header = f.readline()
    vocab_size, layer1_size = map(int, header.split())
    binary_len = np.dtype('float32').itemsize * layer1_size
    for line in range(vocab_size):
        word=str(f.readline()).replace('b','').replace('\'','').replace('\\n','').lower().split()
        w = stemmer.stem(word[0])
        if w in words:
            word.remove(word[0])
            emb = [float(x) for x in word]
            embdict[str(w)]=emb
        index+=1
        if index%100000==0:
            print("iteration "+str(index))

print("size of dictionary: {0}".format(len(embdict)))

iteration 100000
iteration 200000
iteration 300000
iteration 400000
iteration 500000
iteration 600000
iteration 700000
iteration 800000
iteration 900000
iteration 1000000
size of dictionary: 11689


<h1>сеть

In [10]:
import tensorflow.keras as keras
import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, GRU, SimpleRNN
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import copy

In [17]:
num_classes = 2

prep_texts_train =[]
prep_texts_test =[]

print("предобработка текста")
for t in texts_train:
    prep_texts_train.append(' '.join(clean_text(t)))
for t in texts_test:
    prep_texts_test.append(' '.join(clean_text(t)))
    
descriptions = prep_texts_train
    
x_train = prep_texts_train
y_train = categories_train
    
x_test = prep_texts_test
y_test = categories_test


y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

max_words = 0
for desc in descriptions:
    try:
        words = len(desc)
        if words > max_words:
            max_words = words
    except:
        pass
print('Максимальное количество слов в самом длинном тексте: {} слов'.format(max_words))

maxSequenceLength = max_words

t = Tokenizer()
    
t.fit_on_texts(descriptions)
vocab_size = len(t.word_index) + 1

encoded_docs_train = t.texts_to_sequences(x_train)
encoded_docs_test = t.texts_to_sequences(x_test)
padded_docs_train = sequence.pad_sequences(encoded_docs_train, maxlen=maxSequenceLength)
padded_docs_test = sequence.pad_sequences(encoded_docs_test, maxlen=maxSequenceLength)

total_unique_words = len(t.word_counts)
print('Всего уникальных слов в словаре: {}'.format(total_unique_words))

предобработка текста
Максимальное количество слов в самом длинном тексте: 654 слов
Всего уникальных слов в словаре: 15611


In [18]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    try:
        embedding_vector = embdict[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except:
        pass
        #print(i)
        #print(word)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, GRU, SimpleRNN
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=maxSequenceLength, trainable=False))
#model.add(e)
#e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=maxSequenceLength, trainable=False)
#model.add(e)
#model.add(Flatten())
#model.add(Dense(20, activation='sigmoid'))
#model.add(Dropout=0.5)
#model.add(Embedding(300, maxSequenceLength))
#model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)))
#model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))
# compile the model
#rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-6)
#model.compile(optimizer = rmsprop, loss = 'mean_squared_error', metrics=['mean_squared_error', 'mae'])
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 654, 300)          300000    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 128)               186880    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 487,138
Trainable params: 187,138
Non-trainable params: 300,000
_________________________________________________________________
None


In [23]:
history = model.fit(padded_docs_train, y_train, epochs = 20, verbose=2, validation_data=(padded_docs_test, y_test))

Train on 19199 samples, validate on 4800 samples
Epoch 1/10
 - 1309s - loss: 0.5541 - acc: 0.7099 - val_loss: 0.5440 - val_acc: 0.7171
Epoch 2/10
 - 1297s - loss: 0.5482 - acc: 0.7114 - val_loss: 0.5393 - val_acc: 0.7223
Epoch 3/10
 - 1297s - loss: 0.5447 - acc: 0.7171 - val_loss: 0.5367 - val_acc: 0.7167
Epoch 4/10
 - 1294s - loss: 0.5391 - acc: 0.7198 - val_loss: 0.5318 - val_acc: 0.7302
Epoch 5/10
 - 1299s - loss: 0.5377 - acc: 0.7236 - val_loss: 0.5390 - val_acc: 0.7108
Epoch 6/10
 - 1296s - loss: 0.5347 - acc: 0.7242 - val_loss: 0.5315 - val_acc: 0.7294
Epoch 7/10
 - 1302s - loss: 0.5303 - acc: 0.7288 - val_loss: 0.5270 - val_acc: 0.7265
Epoch 8/10
 - 1300s - loss: 0.5274 - acc: 0.7313 - val_loss: 0.5244 - val_acc: 0.7269
Epoch 9/10
 - 1298s - loss: 0.5244 - acc: 0.7332 - val_loss: 0.5232 - val_acc: 0.7333
Epoch 10/10
 - 1300s - loss: 0.5212 - acc: 0.7388 - val_loss: 0.5233 - val_acc: 0.7277


In [None]:
predict = np.argmax(model.predict(x_test), axis=1)
answer = np.argmax(y_test, axis=1)
print('Accuracy: %f' % (accuracy_score(predict, answer)*100))
print('F1-score: %f' % (f1_score(predict, answer, average="macro")*100))
print('Precision: %f' % (precision_score(predict, answer, average="macro")*100))
print('Recall: %f' % (recall_score(predict, answer, average="macro")*100))  