In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import csv
import gensim
from tqdm.auto import tqdm
import tensorflow.keras.layers as L
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Model, Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.backend import one_hot, clear_session
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer as KerasTokenizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


/kaggle/input/universal-sentence-encoder/use/use/tfhub_module.pb
/kaggle/input/universal-sentence-encoder/use/use/saved_model.pb
/kaggle/input/universal-sentence-encoder/use/use/variables/variables.data-00000-of-00001
/kaggle/input/universal-sentence-encoder/use/use/variables/variables.index
/kaggle/input/ift3395-ift6390-reddit-comments/sample_submission.csv
/kaggle/input/ift3395-ift6390-reddit-comments/data_test.pkl
/kaggle/input/ift3395-ift6390-reddit-comments/data_train.pkl


In [2]:
submit_predictions = True

In [3]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() # Needed for TensorFlow Hub

In [4]:
datasets = ['train', 'test']
input_path = '/kaggle/input/ift3395-ift6390-reddit-comments/'
data_train, data_test = [np.load(os.path.join(input_path, f'data_{dataset}.pkl'), allow_pickle=True) for dataset in datasets]

In [5]:
def to_dataframe(data):
    if len(data) == 2:
        comment, label = data
        result = pd.DataFrame({'comment': comment, 'label': label})
    else:
        result = pd.DataFrame({'comment': data})
    return result

In [6]:
VAL_FRACTION = 0.1
train_val_df, test_df = (to_dataframe(data) 
                     for data in [data_train, data_test])
train_df, val_df = train_test_split(train_val_df, test_size=VAL_FRACTION)

In [7]:
def vectorize_labels(labels):
    return np.array(labels)[:, np.newaxis]

def to_dense(*args):
    return [item.todense() if hasattr(item, 'todense') else item for item in args]

In [8]:
class USEModel(BaseEstimator, ClassifierMixin):
    def __init__(self, classes, use=None):
        if not use:
            use = hub.Module('../input/universal-sentence-encoder/use/use')
        self.classes = classes
        self.class_count = len(classes)
        self.use = use
        self.one_hot_encoder = OneHotEncoder()
        self.one_hot_encoder.fit(vectorize_labels(classes))
        
    def preprocess(self, data):
        X, y = data
        X_emb = sess.run(self.use(X))
        y_onehot = self.one_hot_encoder.transform(y[:, np.newaxis]).todense()
        return X_emb, y_onehot
    
    def fit(self, x, y):
        self.train_data = self.preprocess((x, y))
        self.idxmap = np.squeeze(
            np.array([np.where(self.one_hot_encoder.categories_[0] == class_label)
                      for class_label in self.classes]))        
        self.build_model(self.train_data[0].shape[1])
        self.train()
        
    def build_model(self, input_shape):
        inputs = layer = L.Input(shape=input_shape, dtype=float)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dense(2048, activation='relu')(layer)
        layer = L.Dropout(.2)(layer)
        layer = L.Dense(self.class_count, activation='softmax')(inputs)
        self.model = Model(inputs=inputs, outputs=layer)
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['acc'])        

    def train(self, epochs=40):
        print("Training")
        X_train, y_train = self.train_data
        self.model.fit(X_train, y_train, epochs=epochs)

    def predict_proba(self, X):
        X_emb = sess.run(self.use(X))        
        probs = self.model.predict(X_emb)
        return probs[:, self.idxmap]

    def predict(self, x):
        probs = self.predict_proba(x)
        idx = np.argmax(probs, axis=1)
        return self.classes[idx]    

In [9]:
class TfidfNaiveBayesModel(BaseEstimator, ClassifierMixin):
    def __init__(self, classes, alpha=.42):
        self.classes = np.array(classes)
        self.tfidf = TfidfVectorizer()
        self.naive = MultinomialNB(alpha=alpha)        
    def fit(self, x, y):
        vectors = self.tfidf.fit_transform(x)
        self.naive.fit(vectors, y)
        self.idxmap = np.squeeze(
            np.array([np.where(self.naive.classes_ == class_label)
                      for class_label in self.classes]))

    def predict_proba(self, x):
        vectors = self.tfidf.transform(x)
        probabilities = self.naive.predict_proba(vectors)
        return probabilities[:, self.idxmap]
    
    def predict(self, x):
        probs = self.predict_proba(x)
        idx = np.argmax(probs, axis=1)
        return self.classes[idx]

In [10]:
class SimpleVotingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators, classes):
        self.estimators = estimators
        self.classes = classes
        
    def fit(self, x, y):
        for estimator in self.estimators:
            estimator.fit(x, y)
            
    def predict_proba(self, x):
        probs = np.array(
            [estimator.predict_proba(x) for estimator in self.estimators])
        probs = np.sum(probs, axis=0) / len(self.estimators)
        return probs
    
    def predict(self, x):
        probs = self.predict_proba(x)
        idx = np.argmax(probs, axis=1)
        return self.classes[idx]        

In [11]:
sess = tf.InteractiveSession()
use = hub.Module('../input/universal-sentence-encoder/use/use')
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [12]:
classes = train_val_df.label.unique()
use_model = USEModel(classes, use)
naive_model = TfidfNaiveBayesModel(classes)
classifiers = [use_model, naive_model]
voting = SimpleVotingClassifier(estimators=classifiers, classes=classes)

In [13]:
if not submit_predictions:
    voting.fit(train_df.comment, train_df.label)

In [14]:
self = voting.estimators[0]
self.idxmap = np.squeeze([np.where(self.one_hot_encoder.categories_[0] == class_label)
                      for class_label in self.classes])

In [15]:
if not submit_predictions:
    predictions = voting.predict(val_df.comment)
    accuracy = np.sum(predictions == val_df.label) / len(val_df)
    print(f"Accuracy: {accuracy}")

In [16]:
if not submit_predictions:
    predictions = voting.estimators[0].predict(val_df.comment)
    accuracy = np.sum(predictions == val_df.label) / len(val_df)
    print(f"Accuracy: {accuracy}")

In [17]:
if submit_predictions:
    voting.fit(train_val_df.comment, train_val_df.label)
    sanity_check_predictions = voting.predict(val_df.comment)
    accuracy = np.sum(sanity_check_predictions == val_df.label) / len(val_df)
    print(f"Sanity check accuracy: {accuracy} (not a true validation accuracy)")
    
    predictions = voting.predict(data_test)
    with open("predictions.csv", 'w', newline='') as f:
        wr = csv.writer(f)
        wr.writerow(["Id", "Category"])
        for i, prediction in enumerate(predictions):
            wr.writerow((i,prediction))   

Training
Train on 70000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Sanity check accuracy: 0.6812857142857143 (not a true validation accuracy)
