In [95]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers import CuDNNLSTM

In [96]:
# CUDA_VISIBLE_DEVICES=0

In [97]:
max_features = 100000
maxlen = 26 
batch_size = 512

In [98]:
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(CuDNNLSTM(64, return_sequences=True))

model.add(CuDNNLSTM(64))
# model.add(CuDNNLSTM(units, return_sequences=True))
model.add(Dropout(0.3))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [99]:
model.compile(loss='binary_crossentropy',
              optimizer='adam')

In [100]:
import pandas as pd
import numpy as np

n = ['id', 'date', 'name', 'text', 'typr', 'rep', 'rtw', 'faw', 'stcount', 'foll', 'frien', 'listcount']
data_positive = pd.read_csv('positive.csv', sep=';', error_bad_lines=False, names=n, usecols=['text'])
data_negative = pd.read_csv('negative.csv', sep=';', error_bad_lines=False, names=n, usecols=['text'])

sample_size = min(data_positive.shape[0], data_negative.shape[0])
raw_data = np.concatenate((data_positive['text'].values[:sample_size],
                           data_negative['text'].values[:sample_size]), axis=0)
labels = [1] * sample_size + [0] * sample_size

In [101]:
import re

def preprocess_text(text):
    text = text.lower().replace("ё", "е")
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub('@[^\s]+', 'USER', text)
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()


data = [preprocess_text(t) for t in raw_data]

In [102]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=2)

In [103]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

SENTENCE_LENGTH = 26
NUM = 100000

def get_sequences(tokenizer, x):
    sequences = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=SENTENCE_LENGTH)

tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(x_train)

x_train_seq = get_sequences(tokenizer, x_train)
x_test_seq = get_sequences(tokenizer, x_test)

In [104]:
model.fit(
    x_train_seq, y_train, 
    batch_size=batch_size, 
    epochs=1
    
)

# result = model.predict_proba(x)

Epoch 1/1


<keras.callbacks.History at 0x2795b450e10>

In [105]:
result = model.predict_proba(x_test_seq)

In [106]:
from sklearn.metrics import classification_report

predicted = np.round(model.predict(x_test_seq))
print(classification_report(y_test, predicted, digits=5))

              precision    recall  f1-score   support

           0    0.75428   0.83052   0.79056     22457
           1    0.81011   0.72769   0.76669     22313

    accuracy                        0.77927     44770
   macro avg    0.78219   0.77911   0.77863     44770
weighted avg    0.78210   0.77927   0.77867     44770

