In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tqdm.auto import tqdm

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['glove840b300dtxt', 'ift3395-ift6390-reddit-comments', 'glove6b100dtxt']


In [2]:
datasets = ['train', 'test']
input_path = '/kaggle/input/ift3395-ift6390-reddit-comments/'
data_train, data_test = [np.load(os.path.join(input_path, f'data_{dataset}.pkl'), allow_pickle=True) for dataset in datasets]

In [3]:
GLOVE_DIR = '../input/glove6b100dtxt'

In [4]:
train_models = False

In [5]:
def to_dataframe(data):
    if len(data) == 2:
        comment, label = data
        result = pd.DataFrame({'comment': comment, 'label': label})
    else:
        result = pd.DataFrame({'comment': data})
    return result

In [6]:
train_df, test_df = (to_dataframe(data) 
                     for data in [data_train, data_test])

In [7]:
train_df = train_df.sample(frac=1)

**Created by Peter Nagy | 2018 May**<br/>
 [Github](https://github.com/nagypeterjob)  <br/>
 [Linkedin](https://www.linkedin.com/in/peternagyjob/)<br/>
In this kernel I do perform a multi-class classification with LSTM (Keras).

In [8]:
#M class has way less data than the orthers, thus the classes are unbalanced.
train_df.label.value_counts()

soccer             3500
europe             3500
movies             3500
Overwatch          3500
canada             3500
worldnews          3500
Music              3500
funny              3500
conspiracy         3500
nba                3500
hockey             3500
trees              3500
nfl                3500
gameofthrones      3500
wow                3500
leagueoflegends    3500
anime              3500
baseball           3500
AskReddit          3500
GlobalOffensive    3500
Name: label, dtype: int64

In [9]:
train_df.comment.str.len().mean()

240.55585714285715

In [10]:
encoder = OneHotEncoder()
labels = encoder.fit_transform(train_df.label[:, np.newaxis]).todense()
#labels = to_categorical(concated['LABEL'], num_classes=4)
print(labels[:10])

[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


In [11]:
n_most_common_words = 20000
max_len = 300
tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df.comment.values)
sequences = tokenizer.texts_to_sequences(tqdm(train_df.comment.values))
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=max_len)

HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))


Found 90816 unique tokens.


In [12]:
len(word_index)

90816

In [13]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except Exception as e:
        print(f"Could not convert {line}: {e}")
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [14]:
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [15]:
X_train, y_train = X, labels

In [16]:
epochs = 10
emb_dim = 256
batch_size = 512
labels[:2]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1.]])

In [17]:
label_count = y_train.shape[-1]

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=X.shape[1]))
model.add(SpatialDropout1D(0.6))
model.add(LSTM(128, dropout=0.6, recurrent_dropout=0.6))
model.add(Dense(label_count, activation='softmax'))
optimizer = Adam(lr=0.003)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          9081700   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 20)                2580      
Total params: 9,201,528
Trainable params: 9,201,528
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
if train_models:
    epochs = 100
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

In [19]:
if train_models:
    accr = model.evaluate(X_test,y_test)
    print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [20]:
if train_models:
    import matplotlib.pyplot as plt

    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    plt.figure()

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()

In [21]:
train_models = True