In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import nltk
# Download the NLTK resources used by this notebook:
#   - stopwords: source of the English stop-word list loaded into `stop`
#   - wordnet: the synonym source named in SynonymAug(aug_src='wordnet')
#   - averaged_perceptron_tagger: presumably needed by the augmenter for POS tagging — TODO confirm
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.util import ngrams

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix

from collections import defaultdict
from collections import Counter
# Apply the 'ggplot' matplotlib style sheet to figures drawn after this point.
plt.style.use('ggplot')
# English stop words from NLTK, held as a set.
stop=set(stopwords.words('english'))

import re
from nltk.tokenize import word_tokenize
import gensim
import string

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout, Bidirectional
from keras.initializers import Constant
from keras.optimizers import Adam

In [None]:
# Show the entries directly under ../input (the value is displayed as the cell output).
os.listdir('../input')

### read data

In [None]:
# Un-augmented competition training set, kept for reference; the augmented file below is used instead.
# train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
# Competition test set (the rows to predict for the submission).
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
# Training set read from a previously saved train_augmented.csv.
# NOTE(review): the augmentation cell later in this notebook augments train_df again and rewrites
# train_augmented.csv — confirm whether this file or the raw train.csv should be loaded here.
train_df = pd.read_csv('../input/nlpaug-augmented-data/train_augmented.csv')

### create embeddings dictionary by loading Twitter GloVe (27B tokens) with 100-D vectors

In [None]:
import io

# Build {token: float32 vector} from the GloVe text file.
# Each non-empty line is "<token> <v1> <v2> ..." separated by whitespace.
embedding_dict = {}
glove_path = '../input/glove-global-vectors-for-word-representation/glove.twitter.27B.100d.txt'

# Decode explicitly as UTF-8 rather than with the platform default encoding, which would
# fail or mis-decode non-ASCII tokens on a non-UTF-8 locale.
# (assumes the GloVe file is UTF-8 text — confirm against the dataset)
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of raising IndexError on values[0].
            continue
        embedding_dict[values[0]] = np.asarray(values[1:], 'float32')

### tweets augmentation by replacing words with their synonyms using "nlpaug" library

In [None]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

# Word-level augmenter configured to draw synonyms from WordNet (aug_src='wordnet').
aug = naw.SynonymAug(aug_src='wordnet')

def augmented_data(text, aug):
    """Produce two augmented variants of ``text`` using ``aug``.

    ``aug.augment(text)`` is called twice, once per variant.

    :param text: the input text to augment
    :param aug: augmenter object exposing an ``augment(text)`` method
    :return: tuple ``(variant_1, variant_2)``

    Some nlpaug releases return a list from ``augment`` even when a single
    string is passed. In that case the first element is used (or the original
    ``text`` if the list is empty), so callers always get one value per
    variant rather than a nested list.
    """
    def _unwrap(result):
        # Normalise list/tuple results to a single value.
        if isinstance(result, (list, tuple)):
            return result[0] if result else text
        return result

    first_variant = _unwrap(aug.augment(text))
    second_variant = _unwrap(aug.augment(text))
    return first_variant, second_variant

# Append two synonym-augmented copies of every training tweet to train_df.
#
# Augmented rows reuse the id of the tweet they were generated from, so duplicated ids
# mean train_df already contains augmented rows (for example because it was read from
# train_augmented.csv). Skip in that case so a fresh "Run All" does not augment twice.
if train_df["id"].duplicated().any():
    print("train_df already contains augmented rows; skipping augmentation")
else:
    # Collect all new rows first and concatenate once: DataFrame.append inside a loop
    # copies the whole frame on every iteration and no longer exists in pandas 2.x.
    augmented_rows = []
    for text, label, row_id in zip(train_df["text"], train_df["target"], train_df["id"]):
        row_id = int(row_id)
        for augmented_text in augmented_data(text, aug):
            augmented_rows.append({
                "id": row_id,
                "keyword": "",
                "location": "",
                "text": augmented_text,
                "target": label,
            })

    train_df = pd.concat([train_df, pd.DataFrame(augmented_rows)], ignore_index=True)
    # The index is written as the first CSV column, matching the existing output format.
    train_df.to_csv("train_augmented.csv")

print(train_df.shape[0])

### build vocabulary from text

In [None]:
def build_vocab(texts):
    """
    Count how often each whitespace-separated token occurs across ``texts``.

    :param texts: object exposing ``.apply(...).values`` whose items are strings
                  (e.g. a pandas Series of tweets)
    :return: dictionary mapping each token to its number of occurrences
    """
    tokenized = texts.apply(lambda text: text.split()).values
    counts = {}
    for tokens in tqdm(tokenized):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

### check how much of the vocabulary is covered by the embeddings

In [None]:
import operator 

def check_coverage(vocab, embeddings_index):
    """
    Report how much of ``vocab`` has an entry in ``embeddings_index``.

    Prints the share of distinct words covered and the share of all word
    occurrences covered.

    :param vocab: dictionary mapping word -> occurrence count
    :param embeddings_index: mapping indexed by word; a missing word must raise KeyError
    :return: list of ``(word, count)`` pairs for out-of-vocabulary words,
             most frequent first
    """
    if not vocab:
        # Nothing to measure; avoid dividing by zero below.
        print('Vocabulary is empty; nothing to check')
        return []

    covered_words = 0   # distinct words with an embedding
    covered_count = 0   # occurrences of words with an embedding
    oov = {}            # word -> count for words without an embedding
    oov_count = 0       # occurrences of words without an embedding

    for word in tqdm(vocab):
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; other errors propagate.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            covered_words += 1
            covered_count += vocab[word]

    total_count = covered_count + oov_count
    text_share = covered_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(covered_words / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # Ascending sort then reverse, which keeps the original ordering among ties.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [None]:
# Token counts over the training text.
vocab = build_vocab(train_df['text'])
# Prints coverage statistics and returns the out-of-vocabulary words, most frequent first.
oov = check_coverage(vocab,embedding_dict)
# Display the ten most frequent out-of-vocabulary words.
oov[:10]

### text preprocessing on train and test dataframes

In [None]:
# Stack train rows first, then test rows, so both get identical cleaning and tokenisation.
# ignore_index is not passed, so each frame keeps its own index labels; later cells split
# the combined data back apart by row position (train_df.shape[0]).
df = pd.concat([train_df,test_df])

def remove_URL(text):
    """Return ``str(text)`` with every ``http(s)://...`` or ``www....`` run removed.

    A URL is matched up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', str(text))

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed (shortest match, single line)."""
    return re.sub(r'<.*?>', r'', text)

def remove_emoji(text):
    """Return ``text`` with every run of characters in the ranges below removed.

    NOTE(review): the last range (U+24C2..U+1F251) is very wide. It contains
    the other ranges and also covers non-emoji characters such as CJK
    ideographs, which are therefore removed too — confirm this is intended.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002702-\U000027B0",  # dingbats
        u"\U000024C2-\U0001F251",  # wide catch-all range (see note above)
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

def lower(text):
    """Return ``text`` lower-cased word by word, keeping the single-space separators."""
    return " ".join(map(str.lower, text.split(" ")))

# Run the cleaning steps over the text column, in this exact order.
for clean_step in (remove_URL, remove_html, remove_emoji, remove_punct, lower):
    df['text'] = df['text'].apply(clean_step)

### transform the data to input for LSTM network; truncate sentences more than 32 words long; create embedding matrix

In [None]:
# Sequences are padded/truncated (at the end) to this many tokens.
MAX_LEN = 32

# Fit the tokenizer on the cleaned train+test text and turn every tweet into
# a fixed-length sequence of word indices.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(df['text'].values.tolist())
sequences = tokenizer_obj.texts_to_sequences(df['text'].values)
text_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')
word_index = tokenizer_obj.word_index

# One extra row so that the largest word index is a valid row of the matrix
# (Keras Tokenizer indices start at 1; row 0 is left as zeros).
num_words = len(word_index) + 1
dim = 100  # must match the dimensionality of the loaded GloVe vectors

# A single random vector shared by every word that has no GloVe embedding.
unknown_words = np.random.uniform(-1, 1, size=dim).astype('float32')
unknown_words = unknown_words.reshape(1, dim)

embedding_matrix = np.zeros((num_words, dim))
for word, i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words-1. The original `i > num_words` check would
    # have let i == num_words through and indexed out of bounds.
    if i >= num_words:
        continue
    emb_vec = embedding_dict.get(word)
    embedding_matrix[i] = unknown_words if emb_vec is None else emb_vec

embedding_matrix.shape

### define model; a bidirectional LSTM layer

In [None]:
# Frozen GloVe embedding -> bidirectional LSTM -> stack of Dense/Dropout layers -> sigmoid.
model = Sequential()

# Embedding weights come from embedding_matrix and are not updated (trainable=False).
embedding = Embedding(num_words, dim, embeddings_initializer=Constant(embedding_matrix),
                      input_length=MAX_LEN, trainable=False)

model.add(embedding)
model.add(Bidirectional(LSTM(256, dropout=0.25, recurrent_dropout=0.2)))

# Fully connected head: each Dense layer is followed by Dropout(0.2).
for units in (1024, 512, 256, 128, 64, 32, 16):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))

# Single sigmoid unit for the binary target.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

### define callbacks; ModelCheckpoint to save the best model; EarlyStopping to reduce training epochs if no improvement found

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
def get_callbacks():
    """Build the list of Keras callbacks used for training.

    :return: ``[ModelCheckpoint, EarlyStopping]``

    * ModelCheckpoint saves the full model to ``checkpoint_keras.h5`` whenever
      ``val_accuracy`` improves.
    * EarlyStopping stops training after 5 epochs without a ``val_accuracy``
      improvement.
    """
    path_checkpoint = 'checkpoint_keras.h5'
    # The deprecated `period=1` argument is dropped: checkpointing every epoch is the
    # default, and newer Keras releases no longer accept `period`.
    callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                          monitor='val_accuracy',
                                          verbose=1,
                                          save_weights_only=False,
                                          save_best_only=True,
                                          mode='max')
    # mode='max' is set explicitly so both callbacks treat the metric the same way.
    callback_early_stopping = EarlyStopping(monitor='val_accuracy',
                                            patience=5,
                                            mode='max',
                                            verbose=1)
    return [callback_checkpoint, callback_early_stopping]

### split data into training and validation data with 80-20 split and start training

In [None]:
# text_pad holds the train rows first and the test rows after them; split by row count.
n_train_rows = train_df.shape[0]
train, test = text_pad[:n_train_rows], text_pad[n_train_rows:]

# Hold out 20% of the training rows for validation (X_test / y_test are the validation split).
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train_df['target'].values,
    test_size=0.2,
    random_state=40,
)
print('Shape of train', X_train.shape)
print("Shape of Validation ", X_test.shape)

history = model.fit(
    X_train, y_train,
    batch_size=64,
    epochs=25,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=get_callbacks(),
)

### print evaluation metrics on validation data

In [None]:
from sklearn import metrics

# Round the sigmoid outputs on the validation split to hard 0/1 labels.
y_pred = model.predict(X_test).round().astype('int')

# Accuracy, confusion matrix and per-class report, printed in that order.
for score_fn in (metrics.accuracy_score,
                 metrics.confusion_matrix,
                 metrics.classification_report):
    print(score_fn(y_test, y_pred))

### predict the labels for tweets in the test data

In [None]:
# Start from the sample submission so the file keeps its expected columns.
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

# Model scores for the test rows, then rounded to hard 0/1 labels.
train_pred_GloVe = model.predict(test)
train_pred_GloVe_int = train_pred_GloVe.round().astype('int')

submission['target'] = train_pred_GloVe_int
submission.to_csv(
    "LSTM_Glove_non_augmented.csv",
    index=False,
    header=True,
)

### weighted probability scores implementation; use "keyword" column to weigh a real/fake tweet

In [None]:
# Blend the model score of each test tweet with the share of "real" (target == 1)
# training tweets that carry the same keyword, then threshold the average at 0.5.

# Prior for tweets with no keyword, or with a keyword never seen in training.
# 0.5 is neutral: the averaged score then crosses 0.5 exactly when the model score does.
# (The original value of 1 forced every keyword-less tweet to class 1.)
NEUTRAL_KEYWORD_PRIOR = 0.5

# Distinct non-missing keywords / locations in the training data.
unique_keywords = train_df['keyword'].dropna().unique()
unique_locations = train_df['location'].dropna().unique()

# value -> array of targets of the training rows with that value
# (groupby skips missing values). The original derived the location data from
# the keyword column and overwrote the dict instead of filling it.
dict_keyword_target = {
    key: targets.values for key, targets in train_df.groupby('keyword')['target']
}
dict_location_target = {
    key: targets.values for key, targets in train_df.groupby('location')['target']
}

# Assign the filled column back instead of fillna(inplace=True) on a column selection.
train_df['keyword'] = train_df['keyword'].fillna('empty')
ans = train_df[train_df['keyword'] != 'empty'].shape[0]
print(ans)

# value -> fraction of its training rows with target == 1
dict_keyword_prob = {
    key: float((targets == 1).mean())
    for key, targets in dict_keyword_target.items()
    if len(targets) != 0
}
dict_location_prob = {
    key: float((targets == 1).mean())
    for key, targets in dict_location_target.items()
    if len(targets) != 0
}
dict_keyword_prob['empty'] = NEUTRAL_KEYWORD_PRIOR

model_predictions = model.predict(test)
# Flatten to one score per test row.
model_scores = np.asarray(model_predictions).ravel()

test_df['keyword'] = test_df['keyword'].fillna('empty')
# Look up every keyword once; keywords unseen in training fall back to the neutral
# prior instead of raising KeyError.
keyword_prior = (
    test_df['keyword']
    .map(dict_keyword_prob)
    .fillna(NEUTRAL_KEYWORD_PRIOR)
    .to_numpy(dtype=float)
)

blended_scores = (keyword_prior + model_scores) / 2
final_predictions = (blended_scores > 0.5).astype(int).tolist()
count = len(final_predictions)

# How many labels the keyword prior flipped relative to the model alone.
model_says_real = model_scores > 0.5
change_cnt_to_0 = int(np.sum(model_says_real & (blended_scores <= 0.5)))
change_cnt_to_1 = int(np.sum(~model_says_real & (blended_scores > 0.5)))

submission['target'] = final_predictions
submission.to_csv("LSTM_Glove_aug_weighted.csv", index=False, header=True)