In [1]:
# https://github.com/jupyter/nbconvert/issues/503#issuecomment-600095120
if '_' in globals():
    # OPTIONAL: Load the "autoreload" extension so that code can change
    %load_ext autoreload

    # OPTIONAL: always reload modules so that as you change code in src, it gets loaded
    %autoreload 2
else:
    print("Not running magic outside of IPython")

In [2]:
import src
from src.util.notebook_imports import *
from src.util.dataset import *

[nltk_data] Downloading package stopwords to /Users/dlite/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
unable to import 'smart_open.gcs', disabling that module
Using TensorFlow backend.


## Data Cleaning
As we know,twitter tweets always have to be cleaned before we go onto modelling.So we will do some basic cleaning such as spelling correction,removing punctuations,removing html tags and emojis etc.So let's start.

In [3]:
tweet, test = dataset_to_df()
df=pd.concat([tweet,test])
df.shape

(10876, 5)

### Removing urls

In [4]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

In [5]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

remove_URL(example)

'New competition launched :'

In [6]:
df['text']=df['text'].apply(lambda x : remove_URL(x))

### Removing HTML tags

In [7]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [8]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
print(remove_html(example))


Real or Fake
Kaggle 
getting started



In [9]:
df['text']=df['text'].apply(lambda x : remove_html(x))

### Romoving Emojis

In [10]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [11]:
df['text']=df['text'].apply(lambda x: remove_emoji(x))


### Removing punctuations

In [12]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

example="I am a #king"
print(remove_punct(example))

I am a king


In [13]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

## GloVe for Vectorization

Here we will use GloVe pretrained corpus model to represent our words.It is available in 3 varieties :50D ,100D and 200 Dimentional.We will try 100 D here.

In [14]:

def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus
        
        

In [15]:
corpus=create_corpus(df)

100%|██████████| 10876/10876 [00:01<00:00, 6289.39it/s]


In [16]:
embedding_dict={}
with open('../data/raw/glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [17]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [18]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 20342


In [19]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec
            

100%|██████████| 20342/20342 [00:00<00:00, 424710.57it/s]


In [20]:
# pd.to_csv("data/processed/em", header=None, index=None)
tweet_pad[0]

array([ 622, 5467,  738,  175,   80, 1805, 3529,   16,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

## Baseline Model

In [21]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])



In [22]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2034300   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,076,605
Trainable params: 42,305
Non-trainable params: 2,034,300
_________________________________________________________________


In [23]:
train=tweet_pad[:tweet.shape[0]]
test=tweet_pad[tweet.shape[0]:]

In [24]:
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


In [25]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Train on 6471 samples, validate on 1142 samples
Epoch 1/15
 - 46s - loss: 0.6911 - accuracy: 0.5724 - val_loss: 0.6890 - val_accuracy: 0.5622
Epoch 2/15
 - 49s - loss: 0.6541 - accuracy: 0.6220 - val_loss: 0.5707 - val_accuracy: 0.7548
Epoch 3/15
 - 45s - loss: 0.5807 - accuracy: 0.7183 - val_loss: 0.5340 - val_accuracy: 0.7583
Epoch 4/15
 - 45s - loss: 0.5681 - accuracy: 0.7337 - val_loss: 0.5261 - val_accuracy: 0.7697
Epoch 5/15
 - 45s - loss: 0.5590 - accuracy: 0.7404 - val_loss: 0.5199 - val_accuracy: 0.7706
Epoch 6/15
 - 45s - loss: 0.5517 - accuracy: 0.7464 - val_loss: 0.5141 - val_accuracy: 0.7697
Epoch 7/15
 - 49s - loss: 0.5522 - accuracy: 0.7432 - val_loss: 0.5104 - val_accuracy: 0.7785
Epoch 8/15
 - 49s - loss: 0.5548 - accuracy: 0.7466 - val_loss: 0.5084 - val_accuracy: 0.7828
Epoch 9/15
 - 63s - loss: 0.5422 - accuracy: 0.7589 - val_loss: 0.5044 - val_accuracy: 0.7837
Epoch 10/15
 - 58s - loss: 0.5416 - accuracy: 0.7544 - val_loss: 0.5019 - val_accuracy: 0.7872
Epoch 11/15

## Making our submission

In [26]:
sample_sub=pd.read_csv('../data/raw/sample_submission.csv')

In [27]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('../data/processed/submission.csv',index=False)


In [28]:
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
