In [None]:
import pandas as pd
from nltk.corpus import stopwords
import re
import numpy as np


# Text Processing

In [None]:
# Characters deleted outright by clean_text: anything that is not a digit,
# a lowercase ASCII letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# Stopword vocabulary used by clean_text.
# NOTE(review): 'english_amz' is presumably a custom stopword file added to the
# local NLTK corpus directory — confirm it is available in this environment.
STOPWORDS = set(stopwords.words('english_amz'))
def clean_text(text, stop_words=None, bad_symbols_re=None):
    """Normalise one raw text into lowercase, letters-only, stopword-free tokens.

    Pipeline: lowercase -> drop stopword tokens -> delete characters matched
    by the bad-symbol pattern -> delete apostrophes -> replace every remaining
    non-letter with a space -> collapse whitespace -> drop stopword tokens
    again (stripping symbols can expose new stopwords, e.g. "the!" -> "the").

    Parameters
    ----------
    text : str or scalar
        Raw text. ``None`` and float NaN (what pandas yields for empty cells)
        produce ``''`` instead of raising; any other non-string is passed
        through ``str()`` first.
    stop_words : collection of str, optional
        Tokens to remove. Defaults to the module-level ``STOPWORDS`` as bound
        at call time.
    bad_symbols_re : compiled regex, optional
        Pattern whose matches are deleted. Defaults to the module-level
        ``BAD_SYMBOLS_RE``.

    Returns
    -------
    str
        Remaining tokens joined by single spaces (possibly empty).
    """
    if stop_words is None:
        stop_words = STOPWORDS
    if bad_symbols_re is None:
        bad_symbols_re = BAD_SYMBOLS_RE

    if not isinstance(text, str):
        # NaN is the only float that differs from itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    # First stopword pass works on the raw whitespace-separated tokens.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    text = bad_symbols_re.sub('', text)
    text = text.replace("'", "")  # apostrophes are removed, not turned into spaces
    text = re.sub("[^a-zA-Z]", " ", text)  # digits and leftover symbols become separators
    # split() collapses runs of whitespace; second stopword pass catches tokens
    # that only became stopwords after symbol stripping.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Legacy loader for the review dataset, kept commented out for reference:
# df = pd.read_csv('Dataset/train.csv')
# df_upl = df
# df['Review Text'] = df['Review Text'].apply(clean_text)
# df['Review Title'] = df['Review Title'].apply(clean_text)
# df['review'] = df['Review Title'].map(str)+' ' +df['Review Text'].map(str)
# df = df.drop(['Review Text','Review Title'],axis=1)

# Load the training sheet.
df = pd.read_excel('hasoc_2020_en_train_new.xlsx')
# Keep an independent snapshot of the frame as loaded. A plain `df_upl = df`
# only aliases the same object, so the in-place rewrite of df['text'] below
# would also overwrite df_upl['text'].
df_upl = df.copy()
# Normalise the text column with clean_text.
df['text'] = df['text'].apply(clean_text)
# 'task2' is not used by the rest of the notebook.
df = df.drop(['task2'],axis=1)
df.head(5)

In [None]:
# df = df.groupby(['review'])['topic'].apply(','.join).reset_index()

# Data Analysis

In [None]:
import nltk
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Bar chart of how many rows carry each task1 label.
df_upl['task1'].value_counts().plot.bar()

In [None]:
def freq_words(x, terms = 30):
    """Plot the `terms` most frequent whitespace-separated tokens found in `x`.

    x     -- iterable of strings; the entries are joined with spaces and
             re-split, so tokens are counted across all entries together.
    terms -- how many of the highest-count tokens to draw (default 30).

    Draws a horizontal bar chart (count on the x axis, word on the y axis)
    and shows it; nothing is returned.
    """
    tokens = ' '.join(x).split()
    fdist = nltk.FreqDist(tokens)
    words_df = pd.DataFrame(list(fdist.items()), columns=['word', 'count'])

    # keep only the `terms` rows with the largest counts
    top_words = words_df.nlargest(terms, 'count')

    # one bar per word, longest bar = most frequent
    plt.figure(figsize=(12,15))
    ax = sns.barplot(data=top_words, x="count", y="word")
    ax.set(ylabel = 'Word')
    plt.show()
  
# Plot the 100 most frequent words of the cleaned text column.
freq_words(df['text'], terms=100)

# Model Training and Prediction

LSTM

In [None]:
import tensorflow

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
# NOTE(review): this rebinds the module-level STOPWORDS that clean_text reads
# at call time. df['text'] was cleaned with the 'english_amz' list, so any
# clean_text call made after this cell (e.g. on the test columns) filters with
# NLTK's 'english' list instead — confirm this train/test mismatch is intended.
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
# import plotly.graph_objs as go
# import plotly as py
# import cufflinks
# from IPython.core.interactiveshell import InteractiveShell
# import plotly.figure_factory as ff
# InteractiveShell.ast_node_interactivity = 'all'
# from plotly.offline import iplot
# cufflinks.go_offline()
# cufflinks.set_config_file(world_readable=True, theme='pearl')

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Fixed length every token sequence is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 250
# Width of the Embedding layer's output vectors.
EMBEDDING_DIM = 100

# The filter set is written as a raw string: in a normal literal `\]` is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# The raw form yields exactly the same characters, backslash included.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
# Build the vocabulary from the cleaned training text.
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Turn each cleaned text into its vocabulary-index sequence, then bring every
# sequence to MAX_SEQUENCE_LENGTH entries.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['text'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

In [None]:
# Indicator matrix: one 0/1 column per distinct comma-separated task1 label.
Y = df['task1'].str.get_dummies(',')
print('Shape of label tensor:', Y.shape)

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
)
# Shapes of the training pair, then of the test pair.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Embedding -> spatial dropout -> LSTM -> softmax classifier.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per label column of Y rather than a hard-coded 2, so the
# head always matches the width of the targets passed to fit().
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line.
model.summary()
epochs = 10
batch_size = 64
# 10% of the training rows are held back for validation; training stops once
# val_loss has failed to improve by at least 0.0001 for 3 consecutive epochs.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

In [None]:
# Smoke test: score one hand-written sentence and show the winning label.
new_complaint = ['can be better']
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = Y.columns.tolist()
# argmax over the score array picks the position of the highest score
print(pred, labels[pred.argmax()])

In [None]:
import pickle

# SECURITY: pickle.load can execute arbitrary code embedded in the file —
# only load model files that you created yourself or otherwise trust.
# NOTE(review): this rebinds `model`, so every later predict() call uses the
# unpickled object instead of the network trained above — confirm intended.
# The context manager closes the file handle, which the bare open() leaked.
with open('model2_100.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Read Dataset/test.csv; later cells derive the prediction inputs from it.
test_df_org = pd.read_csv(filepath_or_buffer='Dataset/test.csv')

In [None]:
# Work on an independent copy. A plain `test_df = test_df_org` makes both names
# point at the same object, so the column rewrites applied to test_df in the
# following cell would silently modify test_df_org as well.
test_df = test_df_org.copy()

In [None]:
# Run both free-text columns through clean_text (body first, then title).
for _col in ('Review Text', 'Review Title'):
    test_df[_col] = test_df[_col].apply(clean_text)
# Model input: cleaned title, one space, cleaned body.
test_df['review'] = (
    test_df['Review Title'].map(str)
    + ' '
    + test_df['Review Text'].map(str)
)
# Frame without the two source columns; 'review' replaces them.
test_df_pro = test_df.drop(columns=['Review Text', 'Review Title'])

In [None]:
# Plain Python list of the combined review strings, in row order.
predict_txt_list = test_df_pro['review'].tolist()

In [None]:
def predict_label(txt,count):
    """Return `count` label names for `txt`, best-scoring label first.

    txt   -- a single text; it is tokenised with the module-level `tokenizer`
             and padded to MAX_SEQUENCE_LENGTH before being scored by `model`.
    count -- how many labels to return. It is not validated against the
             number of label columns in `Y`.

    The result is a list of column names of `Y`, ordered from the highest
    predicted score downwards.
    """
    padded = pad_sequences(tokenizer.texts_to_sequences([txt]), maxlen=MAX_SEQUENCE_LENGTH)
    scores = model.predict(padded)
    labels = list(Y.columns)
    # argsort is ascending, so the best label sits at the last position
    ascending = scores.argsort()[0]
    last = len(labels) - 1
    return [labels[ascending[last - rank]] for rank in range(count)]

In [None]:
from itertools import groupby

# Consecutive identical texts form one run. Each run of length n is scored
# once and receives the n best labels, so predicted_labels ends up with one
# entry per element of predict_txt_list, in the same order.
#
# This replaces a hand-rolled index loop that (a) advanced its index before
# predicting, so each run was labelled using the text that FOLLOWS it,
# (b) raised IndexError when the list ended with a run of duplicates, and
# (c) shadowed the builtin `iter`.
predicted_labels = []
for txt, run in groupby(predict_txt_list):
    count = sum(1 for _ in run)  # length of this run of identical texts
    predicted_labels.extend(predict_label(txt, count))

In [None]:
# Number of labels collected in predicted_labels.
len(predicted_labels)

In [None]:
# Re-read the test file and attach the predictions side by side as a new
# 'topic' column (rows are matched by position via the default integer index).
final_df = pd.concat(
    [
        pd.read_csv('Dataset/test.csv'),
        pd.DataFrame({'topic': predicted_labels}),
    ],
    axis='columns',
)

In [None]:
# Display the frame that was loaded from Dataset/test.csv for inspection.
test_df_org

In [None]:
# Write the submission file without the index column.
final_df.to_csv(path_or_buf='Submission.csv', index=False)

# Analytics

In [None]:
# Bar chart of the task1 label counts in the cleaned frame.
df['task1'].value_counts().plot.bar()