In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

import nltk
# NLTK resources required by the cleaning functions defined later in this notebook.
nltk.download('stopwords')  # stopwords.words('english') in clean_tweets
nltk.download('wordnet')  # WordNetLemmatizer in lemmatize_word
nltk.download('punkt')  # tokenizer models behind word_tokenize in filter_stop_words
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag in lemmatize_word
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
import re,unicodedata
from nltk.corpus import wordnet
from wordcloud import WordCloud
import contractions

from sklearn.model_selection import train_test_split

In [None]:
import tensorflow
from tensorflow.keras.preprocessing.text import  Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.layers import Embedding,Dense,Bidirectional,LSTM, Dropout,SeparableConv1D,Flatten
from tensorflow.keras.utils import  plot_model
from tensorflow.keras.utils import to_categorical

In [None]:
# Input locations (Kaggle dataset mounts). The competition directory is
# defined once in base_path and reused for both CSV files.
base_path = '../input/nlp-getting-started/'
glove_file_path  = '../input/glovedata/glove.6B.200d.txt'
train_data = pd.read_csv(os.path.join(base_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(base_path, 'test.csv'))

# Tag every row with its origin so both frames can be cleaned together and
# split again afterwards. The flag is the *string* 'True'/'False' (not a bool)
# because later cells compare against those strings.
train_data['isTrain'] = 'True'
test_data['isTrain'] = 'False'


In [None]:
# Stack train and test rows; the original per-frame row labels are kept, so
# index values repeat across the two halves.
combined_data = pd.concat([train_data,test_data])

# Share of tweets among the ten most frequent locations.
# The count column is named explicitly: pandas >= 2.0 calls it 'count', older
# versions reused the source column name, so relying on the default breaks.
locaition_value_counts = combined_data['location'].dropna().value_counts().to_frame(name='location')
value_counts_top = locaition_value_counts.iloc[0:10]
plt.pie(x=value_counts_top['location'], labels=value_counts_top.index, autopct='%.0f%%')

In [None]:
# Class balance of the target label (training rows only).
# The series is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=combined_data.target[combined_data.isTrain == 'True'])

In [None]:
# =============================================================================
# cleaning data
# =============================================================================

def remove_urls(line):
    """Strip http/https URLs from ``line``.

    Only the URL token itself (plus the whitespace directly in front of it)
    is removed, so text that follows a link is preserved.

    Args:
        line: raw text.

    Returns:
        ``line`` without its URLs.
    """
    # Raw string avoids invalid-escape warnings; \S+ stops at the first
    # whitespace instead of swallowing the rest of the line.
    url_pattern = r'\s*https?://\S+'
    return re.sub(url_pattern, '', line)

def remove_non_ascii(words):
    """Return ``words`` with every token reduced to plain ASCII.

    Each token is NFKD-normalised and then encoded to ASCII with unencodable
    characters ignored, so accented letters lose their accent and other
    non-ASCII characters disappear. Tokens are never dropped: one that has no
    ASCII content comes back as an empty string.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def remove_tagged_users(words):
    """Delete ``@username`` mentions from every token in ``words``.

    Text surrounding a mention inside a token is kept; tokens that end up
    empty are dropped.

    Args:
        words: iterable of string tokens.

    Returns:
        List of the remaining non-empty tokens, in order.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    mention = re.compile(r'@\w+')
    stripped = (mention.sub('', word) for word in words)
    return [token for token in stripped if token != '']

def filter_stop_words(words,stop_words):
    """Re-tokenise ``words`` and drop every token that is a stop word.

    The tokens are joined into one sentence and split again with NLTK's
    ``word_tokenize``, so the result may be split differently from ``words``.

    Args:
        words: iterable of string tokens.
        stop_words: collection of stop words. Expected to be lower-case,
            because each token is lower-cased before the lookup.

    Returns:
        List of tokens (original casing preserved) that are not stop words.
    """
    # Set membership is O(1); reuse the caller's set when one is passed in.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)
    # Compare case-insensitively: tokens are only lower-cased later (in
    # lemmatize_word), so "The"/"And" would otherwise slip through.
    return [token for token in word_tokenize(' '.join(words))
            if token.lower() not in stop_set]

def word_contractions(words):
    """Drop HTML-entity tokens and expand contractions in the rest.

    A token that consists solely of an entity such as ``&amp;`` is removed;
    every other non-empty token is passed through ``contractions.fix``.

    Args:
        words: iterable of string tokens.

    Returns:
        List of expanded tokens, in order.
    """
    # Raw string: '\w' and '\s' are invalid escapes in a normal literal.
    entity = re.compile(r'^&\w+\s*;$')
    expanded = []
    for word in words:
        remainder = entity.sub('', word)
        if remainder != '':
            expanded.append(contractions.fix(remainder))
    return expanded

def remove_punctuations(words):
    """Keep only ASCII letters and whitespace in each token.

    Despite the name this also strips digits, since everything outside
    ``a-z``, ``A-Z`` and whitespace is removed. Tokens that end up empty are
    dropped.

    Args:
        words: iterable of string tokens.

    Returns:
        List of the remaining non-empty tokens, in order.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    non_letters = re.compile(r'[^a-zA-Z\s]')
    cleaned = (non_letters.sub('', word) for word in words)
    return [token for token in cleaned if token != '']

def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching ``wordnet`` part-of-speech constant.

    Only the first character of the tag decides: ``J`` -> ADJ, ``V`` -> VERB,
    ``N`` -> NOUN, ``R`` -> ADV. Any other tag yields ``None`` so the caller
    can test for "no mapping" directly.
    """
    attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)

def lemmatize_word(words):
    """Lower-case and lemmatize each token using its part of speech.

    The tokens are POS-tagged as given (before lower-casing). When the tag
    maps to a WordNet category via ``get_wordnet_pos`` that category is
    passed to the lemmatizer; otherwise the lemmatizer's default is used.

    Returns:
        List of lemmas, one per input token, in order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(words):
        lowered = token.lower()
        wn_pos = get_wordnet_pos(tag)
        if wn_pos is None:
            lemmas.append(lemmatizer.lemmatize(word=lowered))
        else:
            lemmas.append(lemmatizer.lemmatize(word=lowered, pos=wn_pos))
    return lemmas

def correct_spellings(words):
    """Replace tokens unknown to the spell checker with its suggested correction.

    Tokens the checker recognises are returned unchanged; order and length of
    the list are preserved.

    NOTE(review): ``SpellChecker`` is not imported in the visible notebook
    cells, so calling this raises NameError unless it is imported elsewhere.
    The only visible call site (in clean_tweets) is commented out.
    """
    spell = SpellChecker()
    unknown = spell.unknown(words)
    return [spell.correction(word) if word in unknown else word for word in words]
        

def clean_tweets(df):
    """Run the text-cleaning pipeline over ``df['text']``.

    Per tweet, in order: remove URLs, split on single spaces, strip non-ASCII
    characters, remove @mentions, drop HTML entities and expand contractions,
    filter stop words, remove non-letter characters, then lower-case and
    lemmatize. ``correct_spellings`` is currently not part of the pipeline.

    Args:
        df: DataFrame with a ``text`` column of strings. It is only read,
            never modified, so no defensive copy is made.

    Returns:
        List of cleaned tweets (space-joined lemmas), one per row, in row order.
    """
    # Built once as a set so the per-token stop-word lookup is O(1).
    stop_words = set(stopwords.words('english'))
    final_tweet = []
    for sentence in df['text']:
        words = remove_urls(sentence).split(' ')
        words = remove_non_ascii(words)
        words = remove_tagged_users(words)
        words = word_contractions(words)
        words = filter_stop_words(words, stop_words)
        words = remove_punctuations(words)
        lem_words = lemmatize_word(words)
        final_tweet.append(' '.join(lem_words))
    return final_tweet

In [None]:
# Clean every tweet (train and test rows together) and store the result next to the raw text.
cleaned_tweets = clean_tweets(combined_data)
# Assigning a plain list is positional: element k goes to the k-th row of combined_data.
combined_data['cleaned_tweets'] = cleaned_tweets

In [None]:
# Keep all rows: the "cleaned tweet longer than 1 character" filter
# (combined_data.cleaned_tweets.str.len() > 1) is currently disabled.
# Only the raw-text columns that are no longer needed are removed;
# drop() returns a new frame, so combined_data itself is left untouched.
filtered_data = combined_data.drop(columns=['text','keyword','location'])
filtered_data.head()

In [None]:
# Split the combined frame back into its two origins using the string flag
# set at load time. The helper column is dropped from both halves, and the
# test half also loses `target`.
is_train_row = filtered_data['isTrain'] == 'True'
is_test_row = filtered_data['isTrain'] == 'False'
cleaned_train_data = filtered_data[is_train_row].drop(columns='isTrain')
cleaned_test_data = filtered_data[is_test_row].drop(columns=['isTrain','target'])

In [None]:
# Word cloud over all cleaned tweets (train + test).
all_text = " ".join(filtered_data['cleaned_tweets'])
wc = WordCloud(max_words=1000, width=1600, height=800, collocations=False).generate(all_text)
plt.imshow(wc)
plt.title('filtered_data')

In [None]:
# Word cloud over the cleaned training tweets only.
train_text = " ".join(cleaned_train_data['cleaned_tweets'])
wc = WordCloud(max_words=1000, width=1600, height=800, collocations=False).generate(train_text)
plt.imshow(wc)
plt.title('cleaned_train_data')

In [None]:
# Word cloud over the cleaned test tweets only.
# NOTE(review): height is 1600 here but 800 in the other two word clouds — confirm this is intended.
test_text = " ".join(cleaned_test_data['cleaned_tweets'])
wc = WordCloud(max_words=1000, width=1600, height=1600, collocations=False).generate(test_text)
plt.imshow(wc)
plt.title('cleaned_test_data')

---

## Tokenization, GloVe embeddings and model training

In [None]:
max_features = 10000  # passed to Tokenizer(num_words=...) in prepareSequences
maxlen = 50  # length every token sequence is padded / truncated to
embedding_size = 200  # embedding vector width; must match the GloVe file (glove.6B.200d)

In [None]:
def prepareSequences(df):
  """Fit a Keras ``Tokenizer`` on ``df['cleaned_tweets']`` and encode those texts.

  The tokenizer is limited to ``max_features`` words (module-level constant)
  and splits on single spaces. ``df`` is only read, so it is not copied.

  Args:
    df: DataFrame with a ``cleaned_tweets`` column of strings.

  Returns:
    Tuple ``(sequences, tokenizer)``: the integer sequences for every row of
    ``df`` and the fitted tokenizer, which can then encode other texts with
    the same vocabulary.
  """
  texts = df['cleaned_tweets'].values
  tokenizer = Tokenizer(num_words=max_features,split=' ')
  tokenizer.fit_on_texts(texts)
  return (tokenizer.texts_to_sequences(texts), tokenizer)

In [None]:
# Fit the tokenizer on the training tweets only; the same tokenizer is reused later for the test set.
sequence_train_data,tokenizer = prepareSequences(cleaned_train_data)

In [None]:
# Pad at the end / cut at the end so that every sequence has exactly `maxlen` entries.
after_padding_sequence = pad_sequences(sequence_train_data,maxlen=maxlen,padding='post',truncating='post')

In [None]:
# Inspect the first padded training sequence.
after_padding_sequence[0]

In [None]:
# Number of rows for the embedding matrix: one per word in the tokenizer's
# vocabulary plus one, because Keras word indices start at 1 (0 is the padding index).
# Note this is the full vocabulary size, not capped by max_features.
num_words = len(tokenizer.word_index)+1
num_words

In [None]:
""" Prepare embeding matrix using glove 200"""

# Parse the GloVe text file: each line is "<token> <v1> <v2> ...".
embeddings = {}
# The context manager closes the file handle. The encoding is set explicitly
# (GloVe files are UTF-8 text) instead of relying on the platform default.
with open(glove_file_path, encoding='utf-8') as glove_file:
  for line in glove_file:
    # Split once; rstrip() removes the trailing newline from the last value.
    token, *values = line.rstrip().split(' ')
    embeddings[token] = np.asarray(values, dtype='float32')

# Row i holds the vector of the word with tokenizer index i. Row 0 (padding)
# and words that GloVe does not contain stay all-zero.
embedding_matrx = np.zeros((num_words,embedding_size))
for word,i in tokenizer.word_index.items():
  if i < num_words:
    embdg_vector = embeddings.get(word)
    if embdg_vector is not None:
      embedding_matrx[i] = embdg_vector

In [None]:
# One-hot encode the training labels for the softmax output layer.
y = to_categorical(cleaned_train_data.target.values, dtype='int32')

In [None]:
# X_train,X_test,y_train,y_test = train_test_split(after_padding_sequence,y,test_size=0.2,random_state=42)

In [None]:
# Shape check of the model inputs and labels. The train/validation split is
# done by model.fit(validation_split=...), so the X_train/X_test variant below is disabled.
# print(X_train.shape,y_train.shape)
# print(X_test.shape,y_test.shape)
print(after_padding_sequence.shape)
print(y.shape)

In [None]:
# model building
def build_model():
  """Assemble the (uncompiled) tweet classifier.

  Layers, in order: an Embedding initialised from ``embedding_matrx``, a
  bidirectional LSTM with 12 units per direction, Flatten, a 10-unit ReLU
  Dense layer, 50% Dropout and a 2-unit softmax output.

  Reads the module-level ``embedding_matrx``, ``embedding_size`` and ``maxlen``.

  NOTE(review): with return_sequences=False the LSTM output should already be
  2-D, so the Flatten layer looks redundant — confirm before removing.
  NOTE(review): the Embedding is not created with trainable=False, so the
  GloVe weights appear to be fine-tuned during training — confirm intended.

  Returns:
    A ``Sequential`` model that still has to be compiled.
  """
  layer_stack = [
    Embedding(embedding_matrx.shape[0],embedding_size,input_length=maxlen,weights=[embedding_matrx]),
    Bidirectional(LSTM(12,dropout=0.2,return_sequences=False)),
    Flatten(),
    Dense(10,activation='relu'),
    Dropout(0.5),
    Dense(2,activation='softmax'),
  ]
  return Sequential(layer_stack)

In [None]:
# Instantiate the network and print its layer / parameter summary.
model = build_model()
model.summary()

In [None]:
# One-hot labels with a two-unit softmax output -> categorical cross-entropy; accuracy is tracked as a metric.
model.compile(optimizer='adamax',loss=tensorflow.losses.categorical_crossentropy,metrics=['accuracy'])

In [None]:
epochs = 50
batch_size = 32
# Stop once val_loss has failed to improve by at least 1e-4 for 3 epochs in a
# row, and roll the model back to its best epoch. Without
# restore_best_weights the model would keep the weights of the last epoch,
# i.e. after `patience` epochs without improvement.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001,
                               restore_best_weights=True)
# 20% of the training data is held out for validation by Keras itself.
history = model.fit(x=after_padding_sequence, y=y, validation_split=0.2,
                    epochs=epochs, batch_size=batch_size, verbose=1,
                    callbacks=[early_stopping])

In [None]:
# Encode the test tweets with the tokenizer fitted on the training data and
# pad them exactly like the training sequences before predicting.
test_seq = tokenizer.texts_to_sequences(cleaned_test_data['cleaned_tweets'])
padded =  pad_sequences(test_seq,maxlen=maxlen,padding='post',truncating='post')
# One row of class scores per test tweet.
pred = model.predict(padded)

In [None]:
# Predicted class per tweet = index of the highest score in each row.
# A single vectorised argmax replaces the per-row Python loop; list() keeps
# `preds` a plain list as before.
preds = list(np.argmax(pred, axis=1))


In [None]:
# Submission table: test-set ids paired positionally with the predicted class.
submission_csv = pd.DataFrame({
    'id': cleaned_test_data['id'],
    'target': preds,
})

In [None]:
# Write the submission file to Kaggle's working directory, without the row index.
# (Plain string literal: the previous f-string had no placeholders.)
submission_csv.to_csv('/kaggle/working/submission.csv',index=False)

In [None]:
# Preview the first 25 rows of the submission.
submission_csv.head(25)