In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import csv
import gensim
from tqdm.auto import tqdm
import tensorflow.keras.layers as L
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Model, Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.backend import one_hot, clear_session
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer as KerasTokenizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import xgboost as xgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
WORD2VEC_FILE = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'

Using TensorFlow backend.


/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin
/kaggle/input/universal-sentence-encoder/use/use/tfhub_module.pb
/kaggle/input/universal-sentence-encoder/use/use/saved_model.pb
/kaggle/input/universal-sentence-encoder/use/use/variables/variables.index
/kaggle/input/universal-sentence-encoder/use/use/variables/variables.data-00000-of-00001
/kaggle/input/ift3395-ift6390-reddit-comments/data_test.pkl
/kaggle/input/ift3395-ift6390-reddit-comments/sample_submission.csv
/kaggle/input/ift3395-ift6390-reddit-comments/data_train.pkl


In [2]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() # Needed for TensorFlow Hub

In [3]:
# Names of the two pickled splits and the directory they live in.
datasets = ['train', 'test']
input_path = '/kaggle/input/ift3395-ift6390-reddit-comments/'

# NOTE(security): allow_pickle=True unpickles arbitrary Python objects; only use it
# on files from a trusted source such as the competition dataset itself.
data_train, data_test = (
    np.load(os.path.join(input_path, f'data_{dataset}.pkl'), allow_pickle=True)
    for dataset in datasets
)

## Raw Data Preparation

Prior to attempting the use of machine learning models, the following operations will be performed on the raw data
* Convert the data to a Pandas dataframe
* Split the data into a training set and a validation set, with 10% of the data held out for validation. The validation set will be used to tune hyperparameters and/or to compare the relative performance of the different algorithms attempted.

In [4]:
# Set this to true to allow for model training!
train_models = False
submit_predictions = True

In [5]:
VAL_FRACTION = 0.1

In [6]:
def to_dataframe(data):
    """Wrap a raw data split in a pandas DataFrame.

    A two-element input is always unpacked as ``(comments, labels)`` and yields
    the columns ``comment`` and ``label``; any other input is treated as a
    plain sequence of comments and yields a single ``comment`` column.
    """
    columns = {'comment': data}
    if len(data) == 2:
        comments, labels = data
        columns = {'comment': comments, 'label': labels}
    return pd.DataFrame(columns)

In [7]:
# Labelled data becomes a (comment, label) frame; the test split is passed through
# the same helper.
train_val_df = to_dataframe(data_train)
test_df = to_dataframe(data_test)

In [8]:
# Hold out VAL_FRACTION of the labelled data for validation. The fixed seed keeps the
# split identical across kernel restarts, so validation scores of the different
# models below are measured on the same rows and stay comparable between runs.
train_df, val_df = train_test_split(
    train_val_df, test_size=VAL_FRACTION, random_state=42)

In [9]:
RE_WORD = re.compile(r'^[a-zA-Z]+')
RE_URL = re.compile(r'\w+://\S+')
STOPWORDS = set(stopwords.words('english'))

def filter_no_stopwords(token):
    """Return True when the lower-cased ``token`` is not in ``STOPWORDS``."""
    lowered = token.lower()
    return lowered not in STOPWORDS

def filter_words_only(token):
    """Match ``token`` against ``RE_WORD``.

    Returns the match object (truthy) when the token matches, otherwise ``None``.
    """
    word_match = RE_WORD.match(token)
    return word_match

def transform_drop_urls(text):
    """Return ``text`` with every substring matched by ``RE_URL`` removed."""
    without_urls = RE_URL.sub('', text)
    return without_urls

def transform_lowercase(value):
    """Return ``value`` converted to lower case."""
    lowered = value.lower()
    return lowered

def transform_stem(value):
    """Return the stem of the token ``value``.

    A notebook-level ``STEMMER`` object (anything exposing ``.stem(word)``) is
    used when one has been defined. The notebook never defines it, which made
    every call raise ``NameError``; in that case an NLTK ``PorterStemmer`` is
    created once and reused for later calls.
    """
    try:
        stemmer = STEMMER  # honour an explicitly configured stemmer
    except NameError:
        stemmer = getattr(transform_stem, '_fallback_stemmer', None)
        if stemmer is None:
            # Imported lazily: only needed when no STEMMER was configured.
            from nltk.stem import PorterStemmer
            stemmer = transform_stem._fallback_stemmer = PorterStemmer()
    return stemmer.stem(value)

class Tokenizer:
    """Turn raw comment text into a filtered, normalised list of tokens.

    Processing order for one text:
      1. every text transform is applied to the whole string,
      2. the string is split with NLTK's ``word_tokenize``,
      3. tokens rejected by any token filter are dropped,
      4. every token transform is applied to the surviving tokens.
    """
    DEFAULT_TEXT_TRANSFORMS = [transform_drop_urls]
    DEFAULT_TOKEN_FILTERS = [filter_words_only, filter_no_stopwords]
    DEFAULT_TOKEN_TRANSFORMS = [transform_lowercase]

    def __init__(self):
        # Copy the class-level defaults so that customising one instance's pipeline
        # cannot mutate the lists shared by every other Tokenizer.
        self.text_transforms = list(self.DEFAULT_TEXT_TRANSFORMS)
        self.token_filters = list(self.DEFAULT_TOKEN_FILTERS)
        self.token_transforms = list(self.DEFAULT_TOKEN_TRANSFORMS)

    def __call__(self, data):
        """Lazily tokenize every item of ``data``, showing a progress bar.

        Returns a generator of token lists, one per input item.
        """
        # The original called an undefined `show_progress`; tqdm (imported at the
        # top of the notebook) provides the intended progress display.
        return (self.process_item(item) for item in tqdm(data, desc='Tokenization'))

    def transform(self, token):
        """Apply every token transform, in order, to a single token."""
        for transform in self.token_transforms:
            token = transform(token)
        return token

    def process_item(self, text):
        """Tokenize one text and return the list of kept, transformed tokens."""
        for text_transform in self.text_transforms:
            text = text_transform(text)
        tokens = word_tokenize(text)
        # Filters see the raw token; transforms run only on tokens that pass them all.
        return [
            self.transform(token) for token in tokens
            if all(
                token_filter(token)
                for token_filter in self.token_filters)]

In [10]:
class TfidfWord2VecTransformer:
    """Embed tokenized documents as TF-IDF-weighted means of word vectors.

    Calling the instance on a list of token lists fits a fresh TF-IDF model on
    exactly those documents and returns one row per document. Documents without
    any word known to the embedding model map to the all-zero vector.
    """

    def __init__(self, model=None):
        """Store ``model`` (a gensim KeyedVectors); load ``WORD2VEC_FILE`` if omitted."""
        if model is None:
            model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_FILE, binary=True)
        self.model = model
        self.tfidf = TfidfVectorizer()
        self.default_shape = self.model.vectors[0].shape
        self.default_vector = np.zeros(self.default_shape)

    def get_tfidf_weight(self, word, weights):
        """Return the TF-IDF weight of ``word`` in the document row ``weights``.

        Words missing from the TF-IDF vocabulary get weight 0.
        """
        idx = self.tfidf.vocabulary_.get(word)
        # Compare against None: index 0 is a valid vocabulary entry and must not be
        # treated as "missing" (the original truthiness test zeroed its weight).
        return weights[0, idx] if idx is not None else 0.

    def mean_word_vector_tfidf(self, words, weights):
        """Return the mean of the TF-IDF-scaled word vectors of ``words``.

        Only words known to the embedding model contribute; when there are none,
        the zero vector is returned.
        """
        # `word in model` is supported by both gensim 3 and gensim 4, whereas the
        # `.vocab` attribute was removed in gensim 4.
        words_in_vocab = [word for word in words if word in self.model]
        if not words_in_vocab:
            return self.default_vector
        w2v_vectors = np.array([
            self.model.get_vector(word) for word in words_in_vocab])
        word_weights = np.array([
            self.get_tfidf_weight(word, weights) for word in words_in_vocab])
        return np.mean(w2v_vectors * word_weights[:, np.newaxis], axis=0)

    def to_strings(self, data):
        """Join each token list back into a single space-separated string."""
        return [' '.join(words) for words in data]

    def __call__(self, data):
        """Fit TF-IDF on ``data`` and return the stacked document vectors."""
        data_as_strings = self.to_strings(data)
        # NOTE: this refits the TF-IDF model on every call, so weights are always
        # relative to the documents passed in this call.
        tfidf_weights = self.tfidf.fit_transform(data_as_strings)
        return np.vstack([
            self.mean_word_vector_tfidf(words, weights)
            for words, weights in zip(data, tfidf_weights)
        ])

In [11]:
# Only load the pretrained word2vec vectors when models will actually be trained;
# otherwise leave a None placeholder.
word2vec = (
    gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_FILE, binary=True)
    if train_models
    else None
)

In [12]:
if train_models: # This one takes a while to initialize
    tfidf_transformer = TfidfWord2VecTransformer(word2vec)
    fake_data = [
        ['i', 'failed','theory', 'midterm'],
        ['this', 'is', 'really', 'bad']
    ]
    values = tfidf_transformer(fake_data)

In [13]:
def vectorize_labels(labels):
    """Return ``labels`` as an array with a new axis inserted at position 1.

    A flat sequence of n labels becomes an ``(n, 1)`` column.
    """
    label_array = np.array(labels)
    return np.expand_dims(label_array, axis=1)

def to_dense(*args):
    """Return a list of the arguments with ``todense()`` applied where available.

    Arguments exposing a ``todense`` attribute are replaced by the result of
    calling it; all other arguments are passed through unchanged.
    """
    dense_items = []
    for item in args:
        if hasattr(item, 'todense'):
            item = item.todense()
        dense_items.append(item)
    return dense_items

In [14]:
class BasicNeuralNetworkModel:
    """Dense softmax classifier over TF-IDF-weighted mean word2vec vectors.

    Usage: ``prepare(train, val)`` builds features and the network, ``train``
    fits it, ``predict`` maps feature rows to class labels.
    """

    def __init__(self, classes):
        """Fit the one-hot label encoder on the iterable of class labels."""
        self.class_count = len(classes)
        self.one_hot_encoder = OneHotEncoder()
        self.one_hot_encoder.fit(vectorize_labels(classes))
        # Encoder order (categories_) is the order of the softmax outputs.
        self.classes = self.one_hot_encoder.categories_[0]
        self.X_train, self.y_train, self.val = None, None, None
        self.model = None

    def build_model(self, input_shape):
        """Build and compile the network; ``input_shape`` is the feature count or a shape tuple."""
        if isinstance(input_shape, int):
            input_shape = (input_shape,)
        inputs = layer = L.Input(shape=input_shape, dtype=float)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dropout(.2)(layer)
        # Connect the output layer to the hidden stack. The original fed `inputs`
        # here, which silently bypassed every hidden layer.
        layer = L.Dense(self.class_count, activation='softmax')(layer)
        self.model = Model(inputs=inputs, outputs=layer)
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['acc'])

    def preprocess_inputs(self, X):
        """Tokenize the raw comments in ``X`` and embed them as document vectors."""
        # NOTE(review): a new transformer is created per call, so its TF-IDF weights
        # are fitted separately on whatever data is passed (train and val differ).
        transformer = TfidfWord2VecTransformer(word2vec)
        tokenizer = Tokenizer()
        tokens = [tokenizer.process_item(item) for item in X]
        return transformer(tokens)

    def preprocess_labels(self, y):
        """One-hot encode the labels ``y``."""
        return self.one_hot_encoder.transform(vectorize_labels(y))

    def preprocess(self, X, y):
        """Return ``(features, one_hot_labels)`` for raw comments and labels."""
        return self.preprocess_inputs(X), self.preprocess_labels(y)

    def prepare(self, train, val):
        """Preprocess the ``(X, y)`` train and validation pairs and build the network."""
        X, y = train
        self.X_train, self.y_train = to_dense(*self.preprocess(X, y))
        self.build_model(self.X_train.shape[1])
        X_val, Y_val = val
        self.val = to_dense(*self.preprocess(X_val, Y_val))

    def train(self, epochs=1):
        """Fit the network on the prepared training data for ``epochs`` epochs."""
        self.model.fit(self.X_train, self.y_train, validation_data=self.val, epochs=epochs)

    def predict(self, X):
        """Return the predicted class label for each row of the feature matrix ``X``.

        ``X`` must already be in the feature space the network was built for.
        """
        # Use this instance's network (the original referenced the notebook-global
        # `model`) and take the argmax per row rather than over the flattened array.
        results = self.model.predict(X)
        idx = np.argmax(results, axis=1)
        return self.classes[idx]

In [15]:
model = BasicNeuralNetworkModel(train_df.label.unique())

In [16]:
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))

In [17]:
if train_models:
    model.train(100)

In [18]:
class BasicNeuralNetworkModelNoWord2Vec:
    """Dense softmax classifier over plain TF-IDF features (no word embeddings).

    The TF-IDF vectorizer is fitted on the first data passed to
    ``preprocess_inputs`` (the training set when used through ``prepare``) and
    only applied to later data.
    """

    def __init__(self, classes, max_features=10000):
        """Fit the label encoder; ``max_features`` caps the TF-IDF vocabulary size."""
        self.class_count = len(classes)
        self.one_hot_encoder = OneHotEncoder()
        self.one_hot_encoder.fit(vectorize_labels(classes))
        # Encoder order (categories_) is the order of the softmax outputs.
        self.classes = self.one_hot_encoder.categories_[0]
        self.X_train, self.y_train, self.val = None, None, None
        self.vectorizer = None
        self.max_features = max_features
        self.model = None

    def build_model(self, input_shape):
        """Build and compile the network; ``input_shape`` is the feature count or a shape tuple."""
        if isinstance(input_shape, int):
            input_shape = (input_shape,)
        inputs = layer = L.Input(shape=input_shape, dtype=float)
        layer = L.BatchNormalization()(inputs)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dropout(.2)(layer)
        # Connect the output layer to the hidden stack. The original fed `inputs`
        # here, which silently bypassed normalisation, the hidden layer and dropout.
        layer = L.Dense(self.class_count, activation='softmax')(layer)
        self.model = Model(inputs=inputs, outputs=layer)
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['acc'])

    def preprocess_inputs(self, X):
        """Tokenize the raw comments in ``X`` and return their TF-IDF matrix."""
        tokenizer = Tokenizer()
        tokens = [tokenizer.process_item(item) for item in X]
        detokenized_comments = [' '.join(item) for item in tokens]
        if self.vectorizer is None:
            # First call: learn the vocabulary and IDF weights.
            self.vectorizer = TfidfVectorizer(max_features=self.max_features)
            result = self.vectorizer.fit_transform(detokenized_comments)
        else:
            result = self.vectorizer.transform(detokenized_comments)
        return result

    def preprocess_labels(self, y):
        """One-hot encode the labels ``y``."""
        return self.one_hot_encoder.transform(vectorize_labels(y))

    def preprocess(self, X, y):
        """Return ``(features, one_hot_labels)`` for raw comments and labels."""
        return self.preprocess_inputs(X), self.preprocess_labels(y)

    def prepare(self, train, val):
        """Preprocess the ``(X, y)`` train and validation pairs and build the network."""
        X, y = train
        self.X_train, self.y_train = to_dense(*self.preprocess(X, y))
        self.build_model(self.X_train.shape[1])
        X_val, Y_val = val
        self.val = to_dense(*self.preprocess(X_val, Y_val))

    def train(self, epochs=1):
        """Fit the network on the prepared training data for ``epochs`` epochs."""
        self.model.fit(self.X_train, self.y_train, validation_data=self.val, epochs=epochs)

    def predict(self, X):
        """Return the predicted class label for each row of the feature matrix ``X``.

        ``X`` must already be in the feature space the network was built for.
        """
        # Use this instance's network (the original referenced the notebook-global
        # `model`) and take the argmax per row rather than over the flattened array.
        results = self.model.predict(X)
        idx = np.argmax(results, axis=1)
        return self.classes[idx]

In [19]:
model = BasicNeuralNetworkModelNoWord2Vec(train_df.label.unique())

In [20]:
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))

In [21]:
if train_models:
    model.train(100)

In [22]:
class XGBoostModel:
    """Multi-class XGBoost classifier over TF-IDF 1- to 3-gram features.

    The TF-IDF vectorizer and the optional TruncatedSVD are fitted on the first
    data passed to ``preprocess`` (the training set when used through
    ``prepare``) and only applied to any later data.
    """

    def __init__(self, classes, min_df=2, max_features=None, dr_components=None):
        """Configure the model.

        classes: iterable of all class labels.
        min_df / max_features: forwarded to ``TfidfVectorizer``.
        dr_components: when truthy, reduce features to this many SVD components.
        """
        self.vectorizer = None
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)
        self.classes = classes
        self.max_features = max_features
        self.svd = TruncatedSVD(n_components=dr_components) if dr_components else None
        self.bst = None
        self.min_df = min_df

    def _vectorize(self, X):
        """Return the feature matrix for raw texts ``X`` (fits on first use only)."""
        fitting = self.vectorizer is None
        if fitting:
            self.vectorizer = TfidfVectorizer(
                max_features=self.max_features, min_df=self.min_df,
                stop_words='english', analyzer='word',
                # Real booleans: recent scikit-learn rejects ints for these flags.
                ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                sublinear_tf=True)
            X_vec = self.vectorizer.fit_transform(tqdm(X))
        else:
            X_vec = self.vectorizer.transform(tqdm(X))
        if self.svd is not None:
            # Fit the projection once; re-fitting on validation/test data (as the
            # original did) would put those rows in a different feature space.
            X_vec = self.svd.fit_transform(X_vec) if fitting else self.svd.transform(X_vec)
        return X_vec

    def preprocess(self, data):
        """Convert an ``(X, y)`` pair of raw texts and labels into an ``xgb.DMatrix``."""
        X, y = data
        X_vec = self._vectorize(X)
        y_le = self.label_encoder.transform(y)
        return xgb.DMatrix(X_vec, label=y_le)

    def prepare(self, train, val):
        """Preprocess the training pair first (fits the features), then validation."""
        self.train_data = self.preprocess(train)
        self.val_data = self.preprocess(val)

    def train(self, rounds=1):
        """Run ``rounds`` boosting rounds, continuing from any earlier booster."""
        param = {
            'max_depth': 8,
            'gamma': 0.1,
            'eta':0.3,
            'objective':'multi:softmax',
            'num_class': len(self.classes)}
        self.bst = xgb.train(
            param,
            self.train_data,
            num_boost_round=rounds,
            xgb_model=self.bst,
            evals=[(self.train_data, 'train'),
                   (self.val_data, 'validation')])

    def predict(self, X):
        """Return the predicted class label for each raw text in ``X``.

        Raises RuntimeError when called before ``prepare()`` and ``train()``.
        """
        if self.bst is None or self.vectorizer is None:
            raise RuntimeError("XGBoostModel.predict() called before prepare()/train()")
        features = self._vectorize(X)
        # 'multi:softmax' yields encoded class ids as floats; map them back to labels.
        class_ids = self.bst.predict(xgb.DMatrix(features))
        return self.label_encoder.inverse_transform(class_ids.astype(int))

In [23]:
model = XGBoostModel(train_df.label.unique())
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))

In [24]:
if train_models:
    model.train(100)

In [25]:
class LSTMModel:
    """LSTM classifier over padded word-index sequences with a learned embedding.

    The Keras tokenizer vocabulary is fitted on the first data passed to
    ``preprocess`` (the training set when used through ``prepare``) and reused
    unchanged for later data.
    """

    def __init__(self, classes, vocabulary_size=10000, max_words_per_comment=250, embedding_dim=100):
        """Configure the model.

        classes: iterable of all class labels.
        vocabulary_size: number of most frequent words kept by the sequencer.
        max_words_per_comment: length every sequence is padded/truncated to.
        embedding_dim: size of the learned word embedding.
        """
        # `\\]` produces the same characters the original `\]` did, without relying
        # on an invalid escape sequence.
        self.sequencer = KerasTokenizer(num_words=vocabulary_size,
                                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                                   lower=True)
        self.max_words_per_comment = max_words_per_comment
        self.vocabulary_size = vocabulary_size
        self.tokenizer = Tokenizer()
        self.classes = classes
        self.one_hot_encoder = OneHotEncoder()
        self.one_hot_encoder.fit(vectorize_labels(classes))
        # Honour the constructor argument (the original always stored 100).
        self.embedding_dim = embedding_dim
        self._sequencer_fitted = False

    def process_item(self, item):
        """Tokenize one comment and join the kept tokens with single spaces."""
        words = self.tokenizer.process_item(item)
        return ' '.join(words)

    def preprocess(self, data):
        """Convert an ``(X, y)`` pair into padded index sequences and one-hot labels."""
        X, y = data
        texts = [self.process_item(item) for item in X]
        if not self._sequencer_fitted:
            # Fit the word index once. Re-fitting on validation data (as the original
            # did) updates the word counts and can change the indices assigned to
            # words after the training sequences were already encoded.
            self.sequencer.fit_on_texts(texts)
            self._sequencer_fitted = True
        X_seq = self.sequencer.texts_to_sequences(texts)
        X_seq = pad_sequences(X_seq, self.max_words_per_comment)
        # toarray() gives a plain ndarray instead of the legacy np.matrix.
        y_onehot = self.one_hot_encoder.transform(vectorize_labels(y)).toarray()
        return X_seq, y_onehot

    def build_model(self, input_shape):
        """Build and compile the network; ``input_shape`` is the sequence length."""
        model = Sequential()
        model.add(L.Embedding(self.vocabulary_size, self.embedding_dim, input_length=input_shape))
        model.add(L.SpatialDropout1D(0.4))
        model.add(L.LSTM(100, dropout=0.4, recurrent_dropout=0.4))
        model.add(L.Dense(len(self.classes), activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def prepare(self, train, val):
        """Preprocess the training pair first (fits the vocabulary), then validation."""
        self.train_data = self.preprocess(train)
        self.val_data = self.preprocess(val)
        X, _ = self.train_data
        self.model = self.build_model(X.shape[1])

    def train(self, epochs=1):
        """Fit the network on the prepared training data for ``epochs`` epochs."""
        X, y = self.train_data
        self.model.fit(X, y, validation_data=self.val_data, epochs=epochs, batch_size=1000)

In [26]:
model = LSTMModel(train_df.label.unique())
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))

In [27]:
if train_models:
    model.train(10)

In [28]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression on features scaled by a naive-Bayes log-count ratio.

    ``fit`` computes, per feature, the log of the ratio between the smoothed
    feature rate among rows labelled 1 and among rows labelled 0, multiplies
    the inputs element-wise by that ratio and fits a ``LogisticRegression`` on
    the result. Note that only labels 1 and 0 enter the ratio, whatever other
    label values ``y`` contains.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        # Hyper-parameters are stored verbatim and forwarded to LogisticRegression.
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    @staticmethod
    def _smoothed_rate(features, label, targets):
        """Add-one smoothed per-feature sum over the rows whose target equals ``label``."""
        selected = targets == label
        feature_sums = features[selected].sum(0)
        return (feature_sums + 1) / (selected.sum() + 1)

    def _scaled(self, x):
        """Check the estimator is fitted, then scale ``x`` by the learned ratio."""
        check_is_fitted(self, ['_r', '_clf'])
        return x.multiply(self._r)

    def predict(self, x):
        """Return the predicted label for each row of the sparse matrix ``x``."""
        return self._clf.predict(self._scaled(x))

    def predict_proba(self, x):
        """Return class probabilities for each row of the sparse matrix ``x``."""
        return self._clf.predict_proba(self._scaled(x))

    def fit(self, x, y):
        """Learn the log-count ratio and fit the logistic regression; returns ``self``."""
        # Validate shapes; sparse feature matrices are accepted.
        x, y = check_X_y(x, y, accept_sparse=True)

        rate_positive = self._smoothed_rate(x, 1, y)
        rate_negative = self._smoothed_rate(x, 0, y)
        self._r = sparse.csr_matrix(np.log(rate_positive / rate_negative))

        scaled_features = x.multiply(self._r)
        logistic = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs)
        self._clf = logistic.fit(scaled_features, y)
        return self

In [29]:
class NbSvmModel:
    """TF-IDF features fed to an ``NbSvmClassifier``.

    The TF-IDF vectorizer and the optional TruncatedSVD are fitted on the first
    data passed to ``preprocess`` (the training set when used through
    ``prepare``) and only applied to any later data.
    """

    def __init__(self, classes, C=1.0, dual=False,
                 n_jobs=1, max_features=10000, min_df=2,
                 dr_components=None):
        """Configure the model.

        classes: iterable of all class labels.
        C, dual, n_jobs: forwarded to ``NbSvmClassifier``.
        max_features, min_df: forwarded to ``TfidfVectorizer``.
        dr_components: when truthy, reduce features to this many SVD components.
        """
        self.classifier = NbSvmClassifier(C, dual, n_jobs)
        self.classes = classes
        self.vectorizer = None
        self.max_features = max_features
        self.min_df = min_df
        self.svd = TruncatedSVD(n_components=dr_components) if dr_components else None
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)

    def preprocess(self, data):
        """Convert an ``(X, y)`` pair of raw texts and labels into ``(features, encoded_labels)``."""
        X, y = data
        fitting = self.vectorizer is None
        if fitting:
            self.vectorizer = TfidfVectorizer(
                max_features=self.max_features, min_df=self.min_df,
                # Real booleans: recent scikit-learn rejects ints for these flags.
                stop_words='english', analyzer='word', use_idf=True, smooth_idf=True)
            X_vec = self.vectorizer.fit_transform(tqdm(X))
        else:
            X_vec = self.vectorizer.transform(tqdm(X))
        if self.svd is not None:
            # Fit the projection once; re-fitting on validation data (as the original
            # did) would put those rows in a different feature space.
            X_vec = self.svd.fit_transform(X_vec) if fitting else self.svd.transform(X_vec)
        y_le = self.label_encoder.transform(y)
        return X_vec, y_le

    def prepare(self, train, val):
        """Preprocess the training pair first (fits the features), then validation."""
        self.train_data = self.preprocess(train)
        self.val_data = self.preprocess(val)

    def train(self):
        """Fit the classifier and print its accuracy on the validation data."""
        X, y = self.train_data
        self.classifier.fit(X, y)
        X_val, y_val = self.val_data
        y_pred = self.classifier.predict(X_val)
        correct = np.sum(y_pred == y_val)
        acc = correct / len(y_val)
        print(f"Accuracy: {acc}")

    def predict(self, data):
        """Return the list of predicted class labels for the feature matrix ``data``.

        ``data`` must already be vectorized the same way as the training features.
        """
        y_pred = self.classifier.predict(data)
        # Decode through the LabelEncoder: its ids follow the sorted class order,
        # whereas `self.classes` keeps whatever order the caller supplied, so
        # indexing it with encoded ids (as the original did) can mislabel rows.
        return list(self.label_encoder.inverse_transform(y_pred))

In [30]:
model = NbSvmModel(train_df.label.unique())
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))

In [31]:
if train_models:
    model.train()

In [32]:
class SVMModel:
    """TF-IDF features (optionally min-max scaled / SVD-reduced) fed to an ``SVC``.

    The vectorizer, scaler and optional TruncatedSVD are fitted on the first
    data passed to ``preprocess`` (the training set when used through
    ``prepare``) and only applied to any later data.
    """

    def __init__(self, classes, max_features=10000, min_df=2,
                 dr_components=None, scale=False, **kwargs):
        """Configure the model.

        classes: iterable of all class labels.
        max_features, min_df: forwarded to ``TfidfVectorizer``.
        dr_components: when truthy, reduce features to this many SVD components.
        scale: when true, min-max scale the (densified) TF-IDF features.
        **kwargs: forwarded to ``SVC``.
        """
        self.classifier = SVC(**kwargs)
        self.classes = classes
        self.vectorizer = None
        self.max_features = max_features
        self.min_df = min_df
        self.svd = TruncatedSVD(n_components=dr_components) if dr_components else None
        # Store the flag: preprocess() reads self.scale, which the original never
        # set, so every call raised AttributeError.
        self.scale = scale
        self.scaler = MinMaxScaler()
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)

    def preprocess(self, data):
        """Convert an ``(X, y)`` pair of raw texts and labels into ``(features, encoded_labels)``."""
        X, y = data
        fitting = self.vectorizer is None
        if fitting:
            self.vectorizer = TfidfVectorizer(
                max_features=self.max_features, min_df=self.min_df,
                # Real booleans: recent scikit-learn rejects ints for these flags.
                stop_words='english', analyzer='word', use_idf=True, smooth_idf=True)
            X_vec = self.vectorizer.fit_transform(tqdm(X))
        else:
            X_vec = self.vectorizer.transform(tqdm(X))
        if self.scale:
            # toarray() yields a plain ndarray; todense() returns the legacy
            # np.matrix type.
            dense = X_vec.toarray()
            X_vec = self.scaler.fit_transform(dense) if fitting else self.scaler.transform(dense)
        if self.svd is not None:
            # Fit the projection once; re-fitting on validation data (as the original
            # did) would put those rows in a different feature space.
            X_vec = self.svd.fit_transform(X_vec) if fitting else self.svd.transform(X_vec)
        y_le = self.label_encoder.transform(y)
        return X_vec, y_le

    def prepare(self, train, val):
        """Preprocess the training pair first (fits the features), then validation."""
        self.train_data = self.preprocess(train)
        self.val_data = self.preprocess(val)

    def train(self):
        """Fit the classifier and print its accuracy on the validation data."""
        X, y = self.train_data
        self.classifier.fit(X, y)
        X_val, y_val = self.val_data
        y_pred = self.classifier.predict(X_val)
        correct = np.sum(y_pred == y_val)
        acc = correct / len(y_val)
        print(f"Accuracy: {acc}")

    def predict(self, data):
        """Return the list of predicted class labels for the feature matrix ``data``.

        ``data`` must already be vectorized the same way as the training features.
        """
        y_pred = self.classifier.predict(data)
        # Decode through the LabelEncoder: its ids follow the sorted class order,
        # whereas `self.classes` keeps whatever order the caller supplied, so
        # indexing it with encoded ids (as the original did) can mislabel rows.
        return list(self.label_encoder.inverse_transform(y_pred))

In [33]:
model = SVMModel(train_df.label.unique(), max_iter=100, verbose=True, cache_size=7000, kernel='sigmoid')
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))

In [34]:
sess = tf.InteractiveSession()

In [35]:
use = hub.Module('../input/universal-sentence-encoder/use/use')
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [36]:
class USEModel:
    """Dense softmax classifier over Universal Sentence Encoder embeddings.

    Embeddings are computed by running the TF-Hub module in the notebook-global
    session ``sess``.
    """

    def __init__(self, classes, use=None):
        """Store the class labels and the TF-Hub module (loaded from disk if omitted)."""
        if not use:
            use = hub.Module('../input/universal-sentence-encoder/use/use')
        self.classes = classes
        self.class_count = len(classes)
        self.use = use
        self.one_hot_encoder = OneHotEncoder()
        self.one_hot_encoder.fit(vectorize_labels(classes))
        self.model = None

    def preprocess(self, data):
        """Convert an ``(X, y)`` pair of raw texts and labels into ``(embeddings, one_hot_labels)``."""
        X, y = data
        X_emb = sess.run(self.use(X))
        # Use the shared helper: it converts to a NumPy array first, so it works for
        # lists and pandas Series alike (multi-dimensional indexing directly on a
        # Series is not supported by recent pandas).
        y_onehot = self.one_hot_encoder.transform(vectorize_labels(y)).toarray()
        return X_emb, y_onehot

    def prepare(self, train, val):
        """Embed the train and validation pairs, then build the network."""
        self.train_data = self.preprocess(train)
        self.val_data = self.preprocess(val)
        self.build_model(self.train_data[0].shape[1])

    def build_model(self, input_shape):
        """Build and compile the network; ``input_shape`` is the embedding size or a shape tuple."""
        if isinstance(input_shape, int):
            input_shape = (input_shape,)
        inputs = layer = L.Input(shape=input_shape, dtype=float)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dropout(.2)(layer)
        # Connect the output layer to the hidden stack. The original fed `inputs`
        # here, which silently bypassed every hidden layer.
        layer = L.Dense(self.class_count, activation='softmax')(layer)
        self.model = Model(inputs=inputs, outputs=layer)
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['acc'])

    def train(self, epochs=1):
        """Fit the network on the prepared training data for ``epochs`` epochs."""
        print("Training")
        X_train, y_train = self.train_data
        self.model.fit(X_train, y_train, validation_data=self.val_data, epochs=epochs)

    def predict(self, X):
        """Return the predicted class label for each raw text in ``X``."""
        X_emb = sess.run(self.use(X))
        results = self.model.predict(X_emb)
        # The encoder maps each row of class scores back to a label.
        return np.squeeze(self.one_hot_encoder.inverse_transform(results))

In [37]:
model = USEModel(train_val_df.label.unique(), use)
if train_models:
    model.prepare((train_df.comment, train_df.label), (val_df.comment, val_df.label))    

In [38]:
if train_models:
    model.train(100)

In [39]:
if submit_predictions:
    # Final fit on all labelled data. val_df is a subset of train_val_df, so the
    # validation metrics printed during this run are not a held-out estimate.
    full_training_pair = (train_val_df.comment, train_val_df.label)
    validation_pair = (val_df.comment, val_df.label)
    model.prepare(full_training_pair, validation_pair)
    model.train(40)
    predictions = model.predict(data_test)
    # One row per test comment: its position in data_test and the predicted label.
    with open("predictions.csv", 'w', newline='') as submission_file:
        submission_writer = csv.writer(submission_file)
        submission_writer.writerow(["Id", "Category"])
        submission_writer.writerows(enumerate(predictions))

Training
Train on 70000 samples, validate on 7000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [40]:
import gc
gc.collect()
clear_session()

In [41]:
train_models = True