### Importing required Libraries.

In [None]:
import re
import os
import gensim
import string
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import defaultdict
from collections import  Counter
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from keras.optimizers import Adam

## Loading the data and getting a basic idea

In [None]:
# Load the training and test CSVs from the notebook's input directory.
tweet = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

# Show the first three training rows to see the available columns.
print(tweet.head(3))

print(f'There are {tweet.shape[0]} rows and {tweet.shape[1]} columns in train')
# This line reports the test frame, so the message must say "test", not "train".
print(f'There are {test.shape[0]} rows and {test.shape[1]} columns in test')

#### Class distribution

In [None]:
import plotly.express as px

# Bar chart of how many training rows carry each target label.
target_counts = tweet['target'].value_counts()
fig = px.bar(target_counts, x=target_counts.index, y=target_counts)
fig.show()

#### Number of characters in tweets

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Side-by-side histograms of tweet length in characters:
# column 1 shows target == 1, column 2 shows target == 0.
fig = make_subplots(rows=1, cols=2)
for col, label in enumerate((1, 0), start=1):
    char_counts = tweet[tweet['target'] == label]['text'].str.len()
    fig.add_trace(go.Histogram(x=char_counts), row=1, col=col)

fig.show()

#### Number of words in a tweet

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Side-by-side histograms of the number of whitespace-separated tokens
# per tweet: column 1 shows target == 1, column 2 shows target == 0.
fig = make_subplots(rows=1, cols=2)
for col, label in enumerate((1, 0), start=1):
    token_lists = tweet[tweet['target'] == label]['text'].str.split()
    fig.add_trace(go.Histogram(x=token_lists.map(len)), row=1, col=col)
fig.show()

###  Average word length in a tweet

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Side-by-side histograms of the mean token length per tweet:
# column 1 shows target == 1, column 2 shows target == 0.
fig = make_subplots(rows=1, cols=2)
for col, label in enumerate((1, 0), start=1):
    # One list of token lengths per tweet of this class.
    word = tweet[tweet['target'] == label]['text'].str.split().apply(
        lambda tokens: [len(token) for token in tokens]
    )
    mean_lengths = word.map(lambda lengths: np.mean(lengths))
    fig.add_trace(go.Histogram(x=mean_lengths), row=1, col=col)
fig.show()

In [None]:
def create_corpus(target):
    """Collect every whitespace-separated token from tweets of one class.

    Reads the module-level ``tweet`` frame, keeps the rows whose ``target``
    column equals ``target``, and flattens their split ``text`` values into
    a single list. Duplicates and original casing are preserved.
    """
    token_lists = tweet[tweet['target'] == target]['text'].str.split()
    return [token for tokens in token_lists for token in tokens]

#### Common stopwords in tweets

In [None]:
corpus = create_corpus(0)

# Tally only the tokens that are members of the stop-word set.
dic = defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word] += 1

# Ten most frequent stop words as (word, count) pairs, highest count first.
ranked = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]


In [None]:
import plotly.express as px
# Unzip the (word, count) pairs in `top` into parallel tuples and draw them
# as a bar chart (words on the x axis, counts on the y axis).
x,y=zip(*top)
fig = px.bar(x=x,y=y)
fig.show()

Now, we will analyze the tweets with class 1.

In [None]:
corpus = create_corpus(1)

# Tally only the tokens that are members of the stop-word set.
dic = defaultdict(int)
for word in (token for token in corpus if token in stop):
    dic[word] += 1

# Ten most frequent stop words as (word, count) pairs, highest count first.
by_count = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
top = by_count[:10]

In [None]:
import plotly.express as px
# Unzip the (word, count) pairs in `top` into parallel tuples and draw them
# as a bar chart (words on the x axis, counts on the y axis).
x,y=zip(*top)
fig = px.bar(x=x,y=y)
fig.show()

In both classes, "the" dominates; it is followed by "a" in class 0 and by "in" in class 1.

#### Analyzing punctuations

In [None]:
corpus = create_corpus(1)
special = string.punctuation
# `special` is a str, so `token in special` is a substring test and would
# also count multi-character tokens such as "./" or ":;". Compare against
# the set of individual punctuation characters instead.
punct_chars = set(special)

# Count the tokens of class-1 tweets that are a single punctuation character.
dic = defaultdict(int)
for i in corpus:
    if i in punct_chars:
        dic[i] += 1

import plotly.express as px
# Unzip the (character, count) pairs into parallel tuples for the bar chart.
x, y = zip(*dic.items())
fig = px.bar(x=x, y=y)
fig.show()

In [None]:
corpus = create_corpus(0)
special = string.punctuation
# `special` is a str, so `token in special` is a substring test and would
# also count multi-character tokens such as "./" or ":;". Compare against
# the set of individual punctuation characters instead.
punct_chars = set(special)

# Count the tokens of class-0 tweets that are a single punctuation character.
dic = defaultdict(int)
for i in corpus:
    if i in punct_chars:
        dic[i] += 1

import plotly.express as px
# Unzip the (character, count) pairs into parallel tuples for the bar chart.
x, y = zip(*dic.items())
fig = px.bar(x=x, y=y)
fig.show()

#### Common words

In [None]:
# NOTE(review): `corpus` is whatever the previously executed cell left behind
# (create_corpus(0) in file order), so this ranks class-0 tokens only.
counter = Counter(corpus)
most = counter.most_common()

# Look only at the 40 most frequent tokens and drop the stop words among
# them, so fewer than 40 entries may remain.
kept = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

In [None]:
import plotly.express as px
# Bar chart of the non-stop-word tokens (x) against their counts (y).
fig = px.bar(x=x, y=y)
fig.show()

#### bigram analysis

In [None]:
def get_top_tweet_bigrams(corpus, n=None):
    """Rank the word bigrams found in ``corpus`` by total occurrence count.

    Args:
        corpus: Collection of raw text documents passed to ``CountVectorizer``.
        n: Number of leading entries to return; ``None`` returns all of them.

    Returns:
        List of ``(bigram, count)`` tuples, most frequent first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis=0)  # one row holding the per-bigram column sums

    pairs = []
    for bigram, column in vectorizer.vocabulary_.items():
        pairs.append((bigram, totals[0, column]))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [None]:
import plotly.express as px

# Thirty most frequent bigrams over all training tweets.
top_tweet_bigrams = get_top_tweet_bigrams(tweet['text'], n=30)

# Unzip the (bigram, count) pairs into two lists for plotting.
x, y = map(list, zip(*top_tweet_bigrams))

fig = px.bar(x=x, y=y)
fig.show()

### Data Cleaning

In [None]:
# Stack the train and test frames so the cleaning steps that follow are
# applied to both at once. Train rows come first; later cells rely on that
# order when slicing the padded sequences back apart.
df=pd.concat([tweet,test])
df.shape

#### Removing urls

In [None]:
def remove_URL(text):
    """Return ``text`` with ``http://``, ``https://`` and ``www.`` links removed.

    A link runs from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


example = "New competition launched :https://www.kaggle.com/c/nlp-getting-started"
remove_URL(example)

In [None]:
# Strip links from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_URL)

#### Removing HTML tags

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed individually and the
    text between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)


example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""
print(remove_html(example))

In [None]:
# Strip HTML tags from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_html)

### Removing Emojis

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The previous pattern contained the range U+24C2..U+1F251, which spans
    most of the Basic Multilingual Plane above U+24C2 (CJK ideographs,
    Hangul, kana, fullwidth forms, ...) and therefore deleted ordinary
    non-Latin text. It is replaced by the specific symbol blocks inside
    that range, so everything removed now was also removed before, while
    regular letters are left intact.
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U000024C2"             # circled latin capital M
                           u"\U00002600-\U000026FF"  # miscellaneous symbols
                           u"\U00002B00-\U00002BFF"  # misc symbols and arrows
                           u"\U0000FE0F"             # variation selector-16
                           u"\U0001F000-\U0001F251"  # tiles, cards, enclosed characters
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

In [None]:
# Strip emoji from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_emoji)

### Removing punctuation

In [None]:
def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)


example = "I am a #king"
print(remove_punct(example))

In [None]:
# Strip punctuation from every tweet in the combined frame.
df['text'] = df['text'].apply(remove_punct)

#### Spelling Correction


In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()


def correct_spellings(text):
    """Replace the words of ``text`` that the spell checker does not know.

    The text is split on whitespace. Each word reported by
    ``spell.unknown`` is replaced with ``spell.correction(word)``; all
    other words are kept. The words are re-joined with single spaces.
    """
    words = text.split()  # split once and reuse for both passes
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it has no candidate; joining
            # None would raise TypeError, so fall back to the original word.
            suggestion = spell.correction(word)
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

text = "corect me plese"
correct_spellings(text)

In [None]:
#df['text']=df['text'].apply(lambda x : correct_spellings(x))

### GloVe for Vectorization

Here we will use pretrained GloVe word vectors to represent our words. They are available in 3 varieties: 50D, 100D and 200D. We will try 100D here.

In [None]:
def create_corpus(df):
    """Tokenize every entry of ``df['text']`` into lowercase alphabetic words.

    Note: this definition replaces the earlier ``create_corpus(target)``
    helper defined in this file.

    Args:
        df: Frame with a ``text`` column of strings.

    Returns:
        A list with one list of tokens per row, in row order. Tokens that
        are not purely alphabetic or that are stop words are dropped.
    """
    corpus = []
    for text in tqdm(df['text']):
        words = []
        for token in word_tokenize(text):
            lowered = token.lower()
            # Test the lowercased form against the stop-word set. Testing
            # the raw token let capitalised stop words ("The", "And")
            # through, because NLTK's English stop words are lowercase.
            if token.isalpha() and lowered not in stop:
                words.append(lowered)
        corpus.append(words)
    return corpus

corpus = create_corpus(df)

In [None]:
# Parse the GloVe text file into {word: float32 vector}. Each line holds a
# token followed by its vector components, separated by whitespace.
embedding_dict = {}
glove_path = '../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding. The `with` block closes the file, so no extra close().
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [None]:
# Every tweet is encoded as exactly MAX_LEN integer word ids.
MAX_LEN = 50

# Build the vocabulary from the token lists, then map each tweet to ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Pad short tweets and cut long ones at the end of the sequence.
tweet_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
# Word -> integer id mapping learned by the tokenizer; its size is the
# number of distinct words in the cleaned corpus.
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

In [None]:
# One row per word id plus one extra row; rows of words that have no GloVe
# vector stay all zeros.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for word, i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words - 1. The previous `i > num_words` test
    # was off by one and would have let i == num_words index out of bounds.
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec
            

## Baseline Model

In [None]:
# Frozen embedding layer initialised from the precomputed embedding matrix.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> spatial dropout -> single LSTM -> sigmoid output unit.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimzer = Adam(learning_rate=1e-5)

model.compile(
    optimizer=optimzer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# tweet_pad was built from the train frame followed by the test frame, so
# the first len(tweet) rows are the training tweets and the rest are test.
n_train = tweet.shape[0]
train = tweet_pad[:n_train]
# NOTE: this rebinds `test` from the DataFrame loaded earlier to the padded array.
test = tweet_pad[n_train:]

In [None]:
# Hold out 15% of the labelled tweets for validation. A fixed random_state
# keeps the split identical across runs, so validation scores from
# different runs are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    tweet['target'].values,
    test_size=0.15,
    random_state=42,
)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Train for 15 epochs in mini-batches of 4, evaluating on the held-out
# split after each epoch; verbose=2 prints one line per epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=4,
    verbose=2,
)

## Making our submission

In [None]:
# Load the sample submission; its `id` column is reused for the output file.
sample_sub=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Predict a probability for every test tweet, then round it to a 0/1 label.
y_pre = model.predict(test)
# Flatten with reshape(-1) rather than a hard-coded row count, so the cell
# works for a test set of any size.
y_pre = np.round(y_pre).astype(int).reshape(-1)
sub = pd.DataFrame({'id': sample_sub['id'].values.tolist(), 'target': y_pre})
sub.to_csv('submission.csv', index=False)


In [None]:
# Display the first rows of the submission frame as a sanity check.
sub.head()