In [1]:
from gensim.models.keyedvectors import KeyedVectors
from tqdm import tqdm
import numpy as  np
import os, random
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
from nltk.stem import SnowballStemmer

model = r"SBW-vectors-300-min5.txt"
train=r"./data/haha_2019_train_preprocessed_lemmatized.csv"
test=r"./data/haha_2019_test_preprocessed_lemmatized.csv"

[nltk_data] Downloading package punkt to /home/ors/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#text preprocessing
stemmer = SnowballStemmer('spanish')

def clean_text(text):
    text = text.replace("\\", " ").replace(u"╚", " ").replace(u"╩", " ")
    text = text.lower()
    text = re.sub('\-\s\r\n\s{1,}|\-\s\r\n|\r\n', '', text) 
    text = re.sub('[¡¿.,:;_%©?*,!@#$%^&()\d]|[+=]|[[]|[]]|[/]|"|\s{2,}|-', ' ', text)
    words = text.split()
    words = [w for w in words if len(w)>=3]
    stop_words = set(stopwords.words('spanish'))
    words = [w for w in words if not w in stop_words]
    text=' '.join(words)
    tokens = word_tokenize(text)
    stemmed = [stemmer.stem(i) for i in tokens]
    return stemmed

In [3]:
#read csv

values= pd.read_csv(train, sep=',', header=None, encoding = 'utf-8-sig').values
np.random.seed(42)
np.random.shuffle(values)
#df=pd.DataFrame(values)

m = len(values)

train_length = int(0.9 * m)
train_data, test_data = values[:train_length], values[train_length:]

df=pd.DataFrame(train_data)

texts_train=df[1].tolist()
scores_train=df[9].tolist()
categories_train=[1 if str(s)!='nan' else 0 for s in scores_train]

df=pd.DataFrame(test_data)

texts_test=df[1].tolist()
scores_test=df[9].tolist()
categories_test=[1 if str(s)!='nan' else 0 for s in scores_test]
'''
texts=df[1].tolist()
scores=df[9].tolist()

categories=[1 if str(s)!='nan' else 0 for s in scores]

df = pd.read_csv(test, sep=',', header=None)
texts_test=df[1].tolist()
'''

"\ntexts=df[1].tolist()\nscores=df[9].tolist()\n\ncategories=[1 if str(s)!='nan' else 0 for s in scores]\n\ndf = pd.read_csv(test, sep=',', header=None)\ntexts_test=df[1].tolist()\n"

In [4]:
words=set()#set of all words
for text in texts_train:
    words_text=clean_text(text)
    words.update(words_text)
print("number of words: {0}".format(len(words)))

number of words: 16734


In [5]:
embdict=dict()#dictionary for words and emb
index=0

with open(model,'r',encoding = 'utf-8-sig')as f:
    header = f.readline()
    vocab_size, layer1_size = map(int, header.split())
    binary_len = np.dtype('float32').itemsize * layer1_size
    for line in range(vocab_size):
        word=str(f.readline()).replace('b','').replace('\'','').replace('\\n','').lower().split()
        w = stemmer.stem(word[0])
        if w in words:
            word.remove(word[0])
            emb = [float(x) for x in word]
            embdict[str(w)]=emb
        index+=1
        if index%100000==0:
            print("iteration "+str(index))

print("size of dictionary: {0}".format(len(embdict)))

iteration 100000
iteration 200000
iteration 300000
iteration 400000
iteration 500000
iteration 600000
iteration 700000
iteration 800000
iteration 900000
iteration 1000000
size of dictionary: 12357


<h1>сеть

In [6]:
import tensorflow.keras as keras
import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, GRU, SimpleRNN
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import copy

In [7]:
num_classes = 2

prep_texts_train =[]
prep_texts_test =[]

print("предобработка текста")
            
for t in texts_test:            
    prep_texts_train.append(' '.join(clean_text(t)))
for t in texts_test:
    prep_texts_test.append(' '.join(clean_text(t)))
    
print(len(prep_texts_train))
print(len(prep_texts_test))

prep_texts_train1 =[]
prep_texts_test1 =[]  
cats_train=[]
cats_test=[]
i=0

for t in texts_train:
    boo=False
    temp=t.split()
    for t1 in temp:
        if t1.lower() in embdict:
            boo=True
            break
    if boo:
        prep_texts_train1.append(t)
        cats_train.append(categories_train[i])
    i+=1

print(len(prep_texts_train1))

i=0

for t in texts_test:
    boo=False
    temp=t.split()
    for t1 in temp:
        if t1.lower() in embdict:
            boo=True
            break
    if boo:
        prep_texts_test1.append(t)
        cats_test.append(categories_test[i])
    i+=1
    
print(len(prep_texts_test1))

descriptions = prep_texts_train1
    
x_train = prep_texts_train1
y_train = cats_train
    
x_test = prep_texts_test1
y_test = cats_test


y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

max_words = 0
for desc in descriptions:
    try:
        words = len(desc)
        if words > max_words:
            max_words = words
    except:
        pass
print('Максимальное количество слов в самом длинном тексте: {} слов'.format(max_words))

maxSequenceLength = max_words

t = Tokenizer()
    
t.fit_on_texts(descriptions)
vocab_size = len(t.word_index) + 1

encoded_docs_train = t.texts_to_sequences(x_train)
encoded_docs_test = t.texts_to_sequences(x_test)
padded_docs_train = sequence.pad_sequences(encoded_docs_train, maxlen=maxSequenceLength)
padded_docs_test = sequence.pad_sequences(encoded_docs_test, maxlen=maxSequenceLength)

total_unique_words = len(t.word_counts)
print('Всего уникальных слов в словаре: {}'.format(total_unique_words))

предобработка текста
2400
2400
20482
2256
Максимальное количество слов в самом длинном тексте: 3257 слов
Всего уникальных слов в словаре: 23586


In [8]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    try:
        embedding_vector = embdict[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except:
        pass
        #print(i)
        #print(word)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, GRU, SimpleRNN
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Bidirectional

In [33]:
model = Sequential()
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=maxSequenceLength, trainable=False))
#model.add(e)
#e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=maxSequenceLength, trainable=False)
#model.add(e)
#model.add(Flatten())
#model.add(Dense(20, activation='sigmoid'))
#model.add(Dropout=0.5)
#model.add(Embedding(300, maxSequenceLength))
#model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)))
#model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)))
#model.add(Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(64, dropout=0.4, recurrent_dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))
# compile the model
#rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-6)
#model.compile(optimizer = rmsprop, loss = 'mean_squared_error', metrics=['mean_squared_error', 'mae'])
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 3257, 300)         7076100   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               186880    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 7,263,238
Trainable params: 187,138
Non-trainable params: 7,076,100
_________________________________________________________________
None


In [20]:
history = model.fit(padded_docs_train, y_train, epochs = 5, verbose=2, validation_data=(padded_docs_test, y_test))

Train on 20482 samples, validate on 2256 samples
Epoch 1/5
 - 7948s - loss: 0.5779 - acc: 0.6920 - val_loss: 0.5808 - val_acc: 0.6946
Epoch 2/5
 - 7942s - loss: 0.5765 - acc: 0.6892 - val_loss: 0.5863 - val_acc: 0.6724
Epoch 3/5
 - 7947s - loss: 0.5746 - acc: 0.6916 - val_loss: 0.5752 - val_acc: 0.6835
Epoch 4/5
 - 7952s - loss: 0.5717 - acc: 0.6961 - val_loss: 0.5738 - val_acc: 0.6924
Epoch 5/5
 - 7959s - loss: 0.5701 - acc: 0.6960 - val_loss: 0.5724 - val_acc: 0.6977


In [21]:
model_name = '5ep_1l_69n_full_vocab5-5'

In [22]:
model.save(model_name)

In [12]:
model = keras.models.load_model(model_name)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [None]:
predict = np.argmax(model.predict(x_test), axis=1)
answer = np.argmax(y_test, axis=1)
print('Accuracy: %f' % (accuracy_score(predict, answer)*100))
print('F1-score: %f' % (f1_score(predict, answer, average="macro")*100))
print('Precision: %f' % (precision_score(predict, answer, average="macro")*100))
print('Recall: %f' % (recall_score(predict, answer, average="macro")*100))  