# Initialize globals

In [0]:
import pandas as pd
import numpy as np
import os, glob, re

# Root of the project checkout. Defaults to the original author's local path;
# set the REDDIT_PROJECT_PATH environment variable to run the notebook on
# another machine without editing this cell.
project_path = os.environ.get(
    'REDDIT_PROJECT_PATH',
    'C:/Users/vap43/Documents/GitHub/reddit_comment_classification/',
)
# os.path.join leaves the default paths unchanged (the default root already ends
# in "/") while tolerating an override that lacks a trailing separator.
TRAIN_DATA_PATH = os.path.join(project_path, "data/data_train.pkl")
TEST_DATA_PATH = os.path.join(project_path, "data/data_test.pkl")

# Import the text and classes

In [0]:
# Load the pickled train and test sets.
# NOTE(review): unpickling can execute arbitrary code — only point these paths
# at files you trust.
train_data = pd.read_pickle(TRAIN_DATA_PATH)
test_data = pd.read_pickle(TEST_DATA_PATH)

# train_data[0] carries the training texts and train_data[1] their classes.
train_texts = np.array(train_data[0])
y = np.array(train_data[1])

# Keep the training row count so the combined array can be split again later.
nb_X_Train = len(train_texts)

# Train and test texts are stacked so both go through the same pre-processing.
All_X = np.concatenate((train_texts, np.array(test_data)))

# Pre-process the data

Remove stop words and stem the remaining tokens

In [0]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stop_words_list = stopwords.words('english')
# Set membership is O(1) per token; testing against the list rescans every
# stop word for every token of every sentence.
stop_words = frozenset(stop_words_list)
# A token is a run of two or more word characters, so one-character words are dropped.
pattern = re.compile(r'\b\w\w+\b')

# Lower-case each sentence, tokenize it, drop stop words, stem what is left and
# re-join the stems with single spaces.
# NOTE(review): results are written back into All_X in place, so running this
# cell a second time stems the already-stemmed text again.
for idx, sentence in enumerate(All_X):
  tokens = pattern.findall(sentence.lower())
  All_X[idx] = " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

Count the terms

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words counts over the combined train + test texts; min_df=2
# discards terms that occur in fewer than two documents.
count_options = dict(
  ngram_range=(1, 1),
  min_df=2,
  max_df=1.0,
  max_features=None,
)
vectorizer = CountVectorizer(**count_options)
All_X_Counts = vectorizer.fit_transform(All_X)

Weight the terms

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the raw counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(All_X_Counts)
All_X_ifidf = tfidf_transformer.transform(All_X_Counts)

Split out the train/test data

In [0]:
# The first nb_X_Train rows are the training samples; every row after them
# belongs to the Kaggle test set.
X = All_X_ifidf[:nb_X_Train]
Kaggle_Test_X = All_X_ifidf[nb_X_Train:]

One-hot encode the classes

In [0]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense array. scikit-learn 1.2 renamed the
# `sparse` keyword to `sparse_output` and later releases dropped the old name,
# so try the new spelling first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D input, hence the reshape of the 1-D label vector
# into a single column.
y_onehot = onehot_encoder.fit_transform(y.reshape(-1,1))


# Create NN using Keras

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, GaussianNoise
from keras.regularizers import l2
from keras.optimizers import Adagrad
from keras.constraints import max_norm

model = Sequential()

# Hidden layer: 1024 units fed by the TF-IDF features, with a max-norm
# constraint of 3 on the kernel. The bias is switched off — presumably because
# the BatchNormalization that follows supplies its own offset.
model.add(Dense(
  1024,
  use_bias=False,
  input_shape=(X.shape[1],),
  kernel_constraint=max_norm(3.),
))
model.add(BatchNormalization())
model.add(Activation('relu'))

# Output layer: 20 units (presumably one per class), batch-normalized and then
# passed through softmax.
model.add(Dense(20, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('softmax'))

model.compile(
  optimizer=Adagrad(learning_rate=0.003),
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

# Train for a single epoch on the full training split; left as the last
# expression so the History object is displayed as the cell output.
model.fit(X, y_onehot, epochs=1, batch_size=64)

Using TensorFlow backend.


Epoch 1/1


<keras.callbacks.callbacks.History at 0x1807fa04d08>

In [0]:
# Score the Kaggle test rows; the model ends in a 20-unit softmax, so each row
# of y_pred holds one score per output unit.
y_pred = model.predict(Kaggle_Test_X)

# Write output

In [10]:
# y_pred holds a row of softmax scores per test sample rather than exact
# one-hot rows, so pick the highest-scoring column explicitly and map it to its
# class label through the encoder's fitted categories (only one feature was
# encoded, hence categories_[0]).
# This always yields a 1-D array; squeeze() on a (1, 1) inverse_transform result
# would have collapsed a single-sample prediction to a 0-d array.
predicted_columns = np.argmax(y_pred, axis=1)
y_kaggle = onehot_encoder.categories_[0][predicted_columns]
print(y_kaggle.shape)

(30000,)


In [0]:
def create_and_save_submission(predictions, file_name="submission.csv"):
    """Write predictions to a two-column CSV submission file.

    Each prediction is paired with its zero-based position, which becomes the
    "Id" column; the prediction itself goes into the "Category" column. The
    file is written without the DataFrame index.

    Args:
        predictions: Sized, ordered collection of predicted labels.
        file_name: Destination path of the CSV file.
    """
    rows = list(zip(range(len(predictions)), predictions))
    submission = pd.DataFrame(data=rows, columns=["Id","Category"])
    submission.to_csv(file_name, index=False)

# Write the decoded test-set predictions to the default "submission.csv".
create_and_save_submission(y_kaggle)