## Keras GloVe method

In [None]:
import os
import re
import string
import numpy as np
import pandas as pd
import keras


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.layers import Input
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.models import Model
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import metrics


In [None]:
# English stop-word set from NLTK; create_corpus further down filters tokens against it.
stop=set(stopwords.words('english'))

In [None]:
# Load the train and test CSV files into DataFrames.
train = pd.read_csv('../input/20-newsgroup-dataset/train.csv')
test = pd.read_csv('../input/20-newsgroup-dataset/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

# EDA

## Data Cleaning

In [None]:
# Stack train and test row-wise so the cleaning steps below run on both.
# NOTE(review): the original row labels are kept, so index values may repeat —
# confirm nothing downstream relies on label-based lookup.
df = pd.concat([train, test])
df.shape

In [None]:
# Coerce every entry to str so the string operations below never see a
# non-string value (a missing value becomes the literal text "nan").
df['texts'] = df['texts'].apply(str)
# Replace every non-letter character with a space. regex=True must be explicit:
# since pandas 2.0 Series.str.replace treats the pattern as a literal string by
# default, which would silently leave the text unchanged.
# NOTE(review): this runs before the URL/HTML/emoji/punctuation helpers are
# applied further down, so the characters those helpers look for are already
# gone by then — consider moving this step after them.
df['texts'] = df['texts'].str.replace("[^a-zA-Z]", " ", regex=True)

In [None]:
def remove_URL(text):
    """Return `text` with http(s):// links and www.-prefixed addresses deleted."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
def remove_html(text):
    """Return `text` with every shortest `<...>` span (tag-like markup) removed."""
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

In [None]:
def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    The character class lists emoji-related blocks only. A single wide range
    such as U+24C2..U+1F251 must not be used: it spans CJK ideographs, kana,
    Hangul and many other scripts, so ordinary non-Latin text would be deleted
    together with the emoji.
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
                           u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                           u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                           u"\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
                           u"\U000024C2"             # circled latin capital letter M
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [None]:
def remove_punct(text):
    """Return `text` without any character listed in string.punctuation."""
    punctuation = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

In [None]:
# Run the cleaning helpers over the text column one after another, in the
# order URLs -> HTML tags -> emoji -> punctuation.
for cleaner in (remove_URL, remove_html, remove_emoji, remove_punct):
    df['texts'] = df['texts'].apply(cleaner)

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Return `text` with words unknown to the spell checker replaced by its suggestion.

    The text is split on whitespace and re-joined with single spaces. A word
    for which the checker offers no suggestion is kept unchanged.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may return None when it finds no candidate; fall
            # back to the original word so the join below only sees strings.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [None]:
%%time
# df['texts'] = df['texts'].apply(lambda x: correct_spellings(x))

## Coding straightforwardly using NLP advice

https://stackabuse.com/python-for-nlp-multi-label-text-classification-with-keras/

In [None]:
# Hold out 20% of the combined frame for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(df['texts'], df['label'], test_size=0.20, random_state=42)

In [None]:
# Sanity check: the training texts and labels should have the same length.
X_train.shape, y_train.shape

In [None]:
# Fixed sequence length fed to the network.
maxlen = 200

# Learn the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# +1 because word indices start at 1; row 0 is left for padding.
vocab_size = 1 + len(tokenizer.word_index)
print(f"No of unique words is {vocab_size}")

# Texts -> integer sequences -> fixed-length arrays, zero-padded at the end.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen, padding='post')

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

In [None]:
# word -> 100-d GloVe vector
embedding_dict = dict()

# The GloVe file is UTF-8 text containing non-ASCII tokens, so the encoding is
# passed explicitly instead of relying on the platform default. The with-block
# closes the file on exit; no separate close() call is needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r', encoding='utf-8') as f:
    for line in f:
        records=line.split()
        word=records[0]
        vectors=np.asarray(records[1:],'float32')
        embedding_dict[word]=vectors

# Row i holds the GloVe vector of the word with tokenizer index i; rows of
# words missing from GloVe (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Functional-API model: padded token ids -> GloVe embedding -> LSTM -> one sigmoid unit.
deep_inputs = Input(shape=(maxlen,))
# Embedding weights are initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(64)(embedding_layer)
# NOTE(review): a single sigmoid unit with binary cross-entropy models a
# two-class target — confirm df['label'] is binary.
output1 = Dense(1, activation='sigmoid')(LSTM_Layer_1)

model = Model(inputs=deep_inputs, outputs=[output1])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Print the layer-by-layer summary of the functional model.
model.summary()

In [None]:
# Train for 5 epochs; the held-out split is passed as validation data.
history=model.fit(X_train,y_train,batch_size=128,epochs=5,validation_data=(X_test,y_test),verbose=2)

In [None]:
# Evaluate on the held-out split; the model was compiled with one loss and the 'acc' metric.
score = model.evaluate(X_test, y_test, verbose=1)

In [None]:
# score[0] is the loss, score[1] the 'acc' metric requested at compile time.
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
# Raw sigmoid outputs of the model for the held-out split.
pred = model.predict(X_test)

In [None]:
# Display the raw predictions.
pred

In [None]:
# Display the true labels of the held-out split for comparison with pred.
y_test

## Shahul's approach

In [None]:
def create_corpus(df):
    """Tokenize df['texts'] into lists of lower-cased, alphabetic, non-stop-word tokens.

    Args:
        df: DataFrame with a 'texts' column of strings.

    Returns:
        A list with one token list per row of df, in row order.
    """
    corpus = []
    for news in tqdm(df['texts']):
        # Lower-case before the stop-word test: the NLTK stop-word list is
        # lower-case, so testing the raw token would let capitalised stop
        # words such as "The" through.
        tokens = (word.lower() for word in word_tokenize(news))
        words = [word for word in tokens if word.isalpha() and word not in stop]
        corpus.append(words)
    return corpus

corpus=create_corpus(df)

In [None]:
# word -> 100-d GloVe vector
embedding_dict = {}
# The GloVe file is UTF-8 text containing non-ASCII tokens, so the encoding is
# passed explicitly instead of relying on the platform default. The with-block
# closes the file on exit; no separate close() call is needed.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r', encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors

In [None]:
# Fixed sequence length for this second model.
MAX_LEN = 300

# The vocabulary is fitted on the cleaned text column of the combined frame.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(df['texts'])

# The filtered token lists from create_corpus are mapped to integer ids,
# then cut/zero-padded at the end to MAX_LEN.
sequences = tokenizer_obj.texts_to_sequences(corpus)
news_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
# word -> integer index mapping learned by tokenizer_obj.
word_count = tokenizer_obj.word_index
print("no of unique words {}".format(len(word_count)))

In [None]:
# +1 because tokenizer indices start at 1; row 0 is left for padding.
num_words = len(word_count) + 1
embedding_matrix = np.zeros((num_words, 100))

for word, i in tqdm(word_count.items()):
    # Valid rows are 0..num_words-1, so an index equal to num_words must be
    # skipped as well (the previous `>` test would have let it through and
    # indexed past the end of the matrix).
    if i >= num_words:
        continue
    emb_vec=embedding_dict.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

## Baseline models

In [None]:
class CategoricalTruePositives(keras.metrics.Metric):
    """Running count of samples whose arg-max prediction equals the integer label.

    Tensor operations go through keras.backend, which is reachable from the
    `keras` import this notebook already has; the earlier `tf.` calls referred
    to a name that is never imported here.
    """

    def __init__(self, name='categorical_true_positives', **kwargs):
      super(CategoricalTruePositives, self).__init__(name=name, **kwargs)
      # Scalar accumulator holding the (optionally weighted) number of hits.
      self.true_positives = self.add_weight(name='tp', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
      """Add this batch's correct predictions to the running total.

      NOTE(review): assumes y_true holds integer class ids shaped (batch, 1)
      and y_pred holds per-class scores shaped (batch, classes) — confirm.
      """
      K = keras.backend
      # Predicted class id per sample, as a column to line up with y_true.
      y_pred = K.reshape(K.argmax(y_pred, axis=1), shape=(-1, 1))
      values = K.equal(K.cast(y_true, 'int32'), K.cast(y_pred, 'int32'))
      values = K.cast(values, 'float32')
      if sample_weight is not None:
        sample_weight = K.cast(sample_weight, 'float32')
        values = values * sample_weight
      self.true_positives.assign_add(K.sum(values))

    def result(self):
      """Return the accumulated count."""
      return self.true_positives

    def reset_state(self):
      """Zero the accumulator (name used by newer Keras releases)."""
      self.true_positives.assign(0.)

    def reset_states(self):
      """Zero the accumulator (older Keras name, kept for compatibility)."""
      # The state of the metric will be reset at the start of each epoch.
      self.reset_state()


In [None]:
# Sequential model: GloVe embedding -> spatial dropout -> LSTM -> one sigmoid unit.
model=Sequential()

# Embedding weights start from embedding_matrix and are not trained.
embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# NOTE(review): a single sigmoid unit models a two-class target — confirm
# df['label'] is binary.
model.add(Dense(1, activation='sigmoid'))


# The variable name is misspelled ("optimzer"), but the compile cell further
# down refers to it by this exact name.
optimzer=Adam(learning_rate=1e-5)




In [None]:
# Print the layer-by-layer summary of the Sequential model.
model.summary()

In [None]:
# news_pad was built from the combined frame df (one row per row of df), so
# this slice is the whole padded matrix; its labels are df['label'].
train = news_pad[:df.shape[0]]
# `test` is intentionally not reassigned here: news_pad[df.shape[0]:] is always
# an empty array, and overwriting `test` with it destroyed the test DataFrame
# that later cells still read (test['texts'], test.copy()).

In [None]:
# Hold out 15% for validation. random_state (same seed as the earlier split in
# this notebook) makes the split reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(train,df['label'].values,test_size=0.15,random_state=42)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Compile with the Adam instance created alongside the model, then train for
# 20 epochs with the 15% hold-out passed as validation data.
model.compile(optimizer=optimzer, loss='binary_crossentropy')
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=256,
    verbose=2,
)


In [None]:
# Coerce to str first (as was done for df['texts']) so missing values do not
# break the tokenizer, then map the test texts to integer sequences.
x_test = test['texts'].apply(str)
x_test = tokenizer_obj.texts_to_sequences(x_test)
# Pad/truncate exactly like the training matrix news_pad. The former
# intermediate pad_sequences(..., maxlen=50) used the default 'pre' settings,
# keeping only the last 50 tokens and giving the test inputs a different
# layout from what the model was trained on.
news_test = pad_sequences(x_test,maxlen=MAX_LEN,truncating='post',padding='post')

In [None]:
# Work on a copy so the predicted labels can be attached without touching `test`.
test_pre = test.copy()
test_pre.head()

In [None]:
# Sequential.predict_classes no longer exists in current Keras releases; for a
# single sigmoid output the class is obtained by thresholding the predicted
# probability at 0.5.
predict = (model.predict(news_test) > 0.5).astype("int32")
# predict has one column; flatten it so it is assigned as a plain 1-D column.
test_pre['label'] = predict.ravel()
test_pre.head()

In [None]:
# Which distinct labels were predicted for the test rows.
test_pre['label'].unique()

In [None]:
# Predict on the padded test sequences (news_test); `test` itself is not a
# padded integer matrix the model can consume.
y_pre=model.predict(news_test)


In [None]:
# Display the raw model outputs for the test data.
y_pre


In [None]:
# Display `train`, which an earlier cell rebound to a slice of the padded matrix news_pad.
train