# Russian News Sentiment Analysis

### Packages

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# sci-kit learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# tf, keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import Sequential, Model
from keras.layers import Input, Embedding, Bidirectional, MaxPooling1D
from keras.layers import Dense, SpatialDropout1D, LSTM, Conv1D

import matplotlib.pyplot as plt

2022-04-23 16:41:29.337844: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-23 16:41:29.337862: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### File management and directory creation

In [5]:
# define Path object for data directory
data_dir = Path('./data')

# print data files
for data_file in data_dir.glob('*'):
    print(data_file)
    
# create directory for plots
plots_dir = Path('./plots').mkdir(exist_ok=True)

data/train.json
data/test.json


In [6]:
train = pd.read_json(data_dir / 'train.json')
test = pd.read_json(data_dir / 'test.json')

print(train.shape)
train.head(5)

(8263, 3)


Unnamed: 0,text,id,sentiment
0,Досудебное расследование по факту покупки ЕНПФ...,1945,negative
1,Медики рассказали о состоянии пострадавшего му...,1957,negative
2,"Прошел почти год, как железнодорожным оператор...",1969,negative
3,По итогам 12 месяцев 2016 года на территории р...,1973,negative
4,Астана. 21 ноября. Kazakhstan Today - Агентств...,1975,negative


In [7]:
# splitting train dataset into X and y
X, y = train['text'], train['sentiment']

# encode y
encoder = LabelEncoder()
y = encoder.fit_transform(y)

y

array([0, 0, 0, ..., 1, 0, 1])

## RNN Base Model

In [8]:
X_train = X
X_test = test['text']
y_train = y

max_words = 20000
max_len = 5000

tokenizer = Tokenizer(num_words=max_words, split=' ',
                      lower=True, filters='\n\t')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')

print("\n\nAfter tokenizing:\n")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train, test_size=0.2)

print("Before tokenizing:\n")
print(f"\nX_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")



After tokenizing:

X_train shape: (8263, 5000)
y_train shape: (8263,)
Before tokenizing:


X_train shape: (6610, 5000)
y_train shape: (6610,)
X_val shape: (1653, 5000)
y_val shape: (1653,)
X_test shape: (2056,)


In [None]:
embedded_dimensions = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_words, embedded_dimensions, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

In [None]:
X_train.dtype
y_train.dtype

In [None]:
# batch_size = 128
# history = model.fit(X_train,
#                     y_train,
#                     epochs=10,
#                     batch_size=batch_size,
#                     verbose=1)

In [None]:
max_words = 20000
max_len = 5000
embedding_dim = 128
class_num = 1
lstm_out = 196


def classifier(max_len, max_words, embedding_dim, class_num):
    inputs = Input(shape=(max_len,))
    embeddings = Embedding(max_words,
                           embedding_dim,
                           input_length=max_len)(inputs)
    conv_1 = Conv1D(32, 9,
                    activation='relu')(embeddings)
    maxpool_1 = MaxPooling1D(16, name='maxpool1d_1')(conv_1)
    bilstm = Bidirectional(LSTM(32, dropout=0.2,
                                recurrent_dropout=0.2,
                                name='lstm_1'))(maxpool_1)
    prediction = Dense(class_num, activation='sigmoid')(bilstm)
    
    model = Model(inputs=inputs, outputs=prediction)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

model = classifier(max_len, max_words, embedding_dim, class_num)

In [None]:
batch_size = 128
history = model.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_data=(X_val, y_val),
                    verbose=1)

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
history.history

In [None]:
max_words = 5000
max_len = 2000
embedding_dim = 128
class_num = 1
lstm_out = 196

def one_input_classifier(max_length, max_features, embedding_dim, class_num):
    inputs = Input(shape=(max_length,), name='input_1')
    embeddings = Embedding(max_features, embedding_dim, input_length=max_length, name='embedding_1')(inputs)

    conv_1 = Conv1D(32, 9, activation='relu', name='conv1d_1')(embeddings)
    maxpool_1 = MaxPooling1D(16, name='maxpool1d_1')(conv_1)
    dropout_1 = Dropout(0.2, name='dropout_1')(maxpool_1)

    conv_2 = Conv1D(32, 7, activation='relu', name='conv1d_2')(dropout_1)
    maxpool_2 = MaxPooling1D(8, name='maxpool1d_2')(conv_2)
    dropout_2 = Dropout(0.2, name='dropout_2')(maxpool_2)

    bilstm = Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2, name='lstm_1'),
        name='bidirectional_1')(dropout_2)
    preds = Dense(class_num, activation='softmax', name='preds')(bilstm)

    model = Model(inputs=inputs, outputs=preds)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

model = classifier(max_len, max_words, embedding_dim, class_num)

batch_size = 32
history = model.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_data=(X_val, y_val),
                    verbose=1)

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# new

In [None]:
import keras
from keras import layers

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import sys

max_words = 5000
max_len = 2000
embedding_dim = 128
class_num = 1
lstm_out = 196


X_train, y_train = train['text'], train['sentiment']
X_test = test['text']

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

encoder = Tokenizer()
encoder.fit_on_texts(X_train)

X_train = encoder.texts_to_sequences(X_train)
X_test = encoder.texts_to_sequences(X_test)

max_length = max(map(len, X_train))

x_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_length)
x_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_length)

x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
#y_test = np.array(y_test)

embedding_dim = 4
# print(x_train.shape[1], x_train.shape[2])
print(x_train.shape)
print(y_train.shape)

# Create the model
model = keras.Sequential()

model.add(layers.Embedding(len(encoder.index_word) + 1, embedding_dim))

model.add(layers.LSTM(8, activation="tanh",
                      return_sequences=True, dropout=.2))

model.add(layers.LSTM(8, activation="tanh",
                      return_sequences=False, dropout=.2))

model.add(layers.Dense(3, activation="softmax"))

optimizer = keras.optimizers.Adam(lr=0.01)

model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.build(input_shape=x_train.shape)
model.summary()

history = model.fit(x=x_train, y=y_train, epochs=25, shuffle=True,
          batch_size=25, validation_split=0.2)

In [None]:
np.unique(y_train)