In [None]:
# part 1
# Install hazm into the notebook runtime; part 5 imports it for Persian text processing.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release — consider pinning.

!pip install hazm

In [None]:
# part 2

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import re
import nltk
nltk.download('punkt')

import matplotlib.pyplot as plt

In [None]:
# part 3
# Mount Google Drive in the Colab runtime, then read the training CSV and
# keep only the label column and the text column.

from google.colab import drive

drive.mount('/content/drive', force_remount=True)

col = ['Category', 'Text']
df = pd.read_csv('/content/drive/My Drive/train.csv')[col]

In [4]:
# part 4
# Clean the training text: drop rows that have no text, tokenize each entry
# with NLTK, and re-join the tokens separated by single spaces.

from nltk.tokenize import word_tokenize

# Remove rows whose text is missing. This has to be done on the DataFrame:
# calling dropna(inplace=True) on the `df['Text']` selection does not remove
# any row from `df`, so NaN entries would still reach word_tokenize below.
df = df.dropna(subset=['Text']).reset_index(drop=True)

# Tokenization: each entry is broken into a list of tokens and immediately
# re-joined into one space-separated string.
df['Text'] = [' '.join(word_tokenize(entry)) for entry in df['Text']]

# Collapse every run of whitespace (spaces, tabs, newlines, carriage returns)
# into a single space and trim both ends. Splitting on whitespace handles runs
# of any length, unlike a single replace('  ', ' ') pass.
df['Text'] = [' '.join(t.split()) for t in df['Text']]

In [5]:
# part 5
# Persian-specific processing of the training text with hazm:
# normalize, then stem, then lemmatize. The three objects created here
# (normalizer, stemmer, lemmatizer) are reused for the test set in part 12.

# Import only the names that are used. A star import from hazm would also bring
# in hazm's own `word_tokenize`, rebinding the NLTK function imported in part 4
# so that later cells would tokenize with a different tokenizer than training.
from hazm import Normalizer, Stemmer, Lemmatizer

normalizer = Normalizer()
df['Text'] = df['Text'].map(normalizer.normalize)

# NOTE(review): stem() and lemmatize() are called once per document string, not
# once per token. They look like word-level APIs, in which case only the tail of
# each document is affected — confirm against the hazm docs. If this is changed
# to per-token processing, part 12 must be changed the same way so training and
# test text go through identical steps.
stemmer = Stemmer()
df['Text'] = df['Text'].map(stemmer.stem)

lemmatizer = Lemmatizer()
df['Text'] = df['Text'].map(lemmatizer.lemmatize)

In [6]:
# part 6

texts = df["Text"].values
labels = df[["Category"]].values
X_train, y_train, X_test, y_test = train_test_split(texts, labels, test_size = 0.2, random_state = 42)

In [7]:
# part 7

# the maximum number of words to be used(most frequent)
vocab_size = 50000

# dimension of the dense embedding
embedding_dim = 130

# truncate and padding options
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [8]:
# part 8

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

max_length = max([len(s.split()) for s in X_train])

train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_seq = tokenizer.texts_to_sequences(y_train)
validation_padded = pad_sequences(validation_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [9]:
# part 9

encode = OneHotEncoder()

training_labels = encode.fit_transform(X_test)
validation_labels = encode.transform(y_test)

training_labels = training_labels.toarray()
validation_labels = validation_labels.toarray()

In [None]:
# part 10
# Build and train a 1-D convolutional text classifier:
# embedding -> Conv1D -> global max pooling -> dropout -> softmax.

# Size the output layer from the one-hot label matrix instead of hard-coding
# the number of categories, so the two can never disagree.
num_classes = training_labels.shape[1]

model = Sequential()

model.add(Embedding(vocab_size, embedding_dim, input_length=train_padded.shape[1]))
model.add(Conv1D(128, 8, activation='relu', padding='valid'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))

# NOTE(review): Flatten directly after global pooling looks redundant, and the
# two consecutive Dropout(0.5) layers stack — confirm both are intentional.
model.add(Flatten())
model.add(Dropout(0.5))

model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 45

# NOTE(review): with epochs = 5, a patience of 5 cannot elapse, so only the
# val_loss EarlyStopping (patience=2) can actually fire.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001),
    EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=1),
    EarlyStopping(monitor='val_accuracy', mode='max', patience=5, verbose=1),
]

# Validate on the held-out split prepared in parts 8-9 rather than carving a
# second 20% out of the training data with validation_split.
history = model.fit(train_padded, training_labels, shuffle=True,
                    epochs=epochs, batch_size=batch_size,
                    validation_data=(validation_padded, validation_labels),
                    callbacks=callbacks)


In [None]:
# part 11
# Plot the per-epoch accuracy curves recorded in the training history.

fig, ax = plt.subplots()
for metric_key, curve_label in (('accuracy', 'train'), ('val_accuracy', 'test')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title('Accuracy')
ax.legend()
plt.show();

In [None]:
# part 12
# Run the trained model on the unlabeled test set and write one predicted
# category per Id to Google Drive.

# Bind NLTK's tokenizer explicitly so this cell tokenizes exactly like the
# training clean-up, even if a later import rebound the name `word_tokenize`
# (hazm exports a function with the same name).
from nltk.tokenize import word_tokenize

test = pd.read_csv('/content/drive/My Drive/test.csv')
col2 = ['Id', 'Text']
test = test[col2].copy()

# Every Id needs a prediction, so rows are kept and a missing text becomes an
# empty string. (dropna(inplace=True) on the `test['Text']` selection would
# not remove anything from `test`, and NaN would then break word_tokenize.)
test['Text'] = test['Text'].fillna('').astype(str)

# Same text pipeline as training: tokenize, re-join, collapse whitespace.
test['Text'] = [' '.join(word_tokenize(entry)) for entry in test['Text']]
test['Text'] = [' '.join(t.split()) for t in test['Text']]

# Same hazm steps as training, reusing the objects created in part 5.
# NOTE(review): these are applied to whole document strings, mirroring part 5;
# keep the two places in sync if that is ever changed.
test['Text'] = test['Text'].map(normalizer.normalize)
test['Text'] = test['Text'].map(stemmer.stem)
test['Text'] = test['Text'].map(lemmatizer.lemmatize)

# Reuse the tokenizer and max_length fitted on the training texts.
test_x_seq = tokenizer.texts_to_sequences(test['Text'])
test_x_padded = pad_sequences(test_x_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Map the model's per-class scores back to category names with the fitted encoder.
pred = model.predict(test_x_padded)
predicted_label = encode.inverse_transform(pred)

# Index the result by Id so the CSV is written with an "Id,Category" header
# and each prediction can be traced back to its test row.
out = pd.DataFrame(data=predicted_label, columns=['Category'],
                   index=pd.Index(test['Id'].values, name='Id'))
out.to_csv('/content/drive/My Drive/out.csv')