In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
import spacy
import string
import nltk
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import pickle
import logging
#from bert_serving.client import BertClient

In [0]:
train=pd.read_csv('/content/train.csv')
test=pd.read_csv('/content/test.csv')

In [0]:
test.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [0]:
# data cleaning: remove URL's from train and test
train['clean_tweet'] = train['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
test['clean_tweet'] = test['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))

#removing twitterlinks
def remove_twitterlinks(stringliteral):
  return re.sub(r'pic.twitter.com\S+', '', stringliteral)
train['clean_tweet']=train['clean_tweet'].apply(remove_twitterlinks)
test['clean_tweet']=test['clean_tweet'].apply(remove_twitterlinks)

# remove twitter handles (@user)
train['clean_tweet'] = train['clean_tweet'].apply(lambda x: re.sub("@[\w]*", '', x))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: re.sub("@[\w]*", '', x))
  
# remove punctuation marks
punctuation = '.,\'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~«»®´·º½¾¿¡§£₤'

train['clean_tweet'] = train['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

# convert text to lowercase
train['clean_tweet'] = train['clean_tweet'].str.lower()
test['clean_tweet'] = test['clean_tweet'].str.lower()

# remove numbers
train['clean_tweet'] = train['clean_tweet'].str.replace("[0-9]", " ")
test['clean_tweet'] = test['clean_tweet'].str.replace("[0-9]", " ")

# remove whitespaces
train['clean_tweet'] = train['clean_tweet'].apply(lambda x:' '.join(x.split()))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join(x.split()))

#Normalize the words to its base form
# import spaCy's language model
nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output

#train['clean_tweet'] = lemmatization(train['clean_tweet'])
#test['clean_tweet'] = lemmatization(test['clean_tweet'])

# remove special characters, numbers, punctuations
train['clean_tweet']=train['clean_tweet'].str.replace('iphoneonly', "iphone").str.replace('iphonesia', "iphone").str.replace('iphonex', "iphone").str.replace('goodvibes',"good vibes").str.replace('positivevibes', "positive vibes").str.replace('hateitunes', "hate itunes").str.replace('samsungblast', "samsung blast").str.replace('appleevent', "apple event").str.replace('applespecialevent', "apple special event")
test['clean_tweet']=test['clean_tweet'].str.replace('iphoneonly',"iphone").str.replace('iphonesia', "iphone").str.replace('iphonex', "iphone").str.replace('goodvibes',"good vibes").str.replace('positivevibes', "positive vibes").str.replace('hateitunes', "hate itunes").str.replace('samsungblast', "samsung blast").str.replace('appleevent', "apple event").str.replace('applespecialevent', "apple special event")

# remove special characters, numbers, punctuations
train['clean_tweet']=train['clean_tweet'].str.replace('swagswagswag', "swag").str.replace('appleisbest', "apple is best").str.replace('iphoneapps', "iphone apps").str.replace('iphonegraphy', "iphone")
test['clean_tweet']=test['clean_tweet'].str.replace('swagswagswag',"swag").str.replace('appleisbest', "apple is best").str.replace('iphoneapps', "iphone apps").str.replace('iphonegraphy', "iphone").str.replace('kissromancehotmovieiphonesexporn', "iphone")

#Removing Short Words
train['clean_tweet'] = train['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [0]:
X=train['clean_tweet'].values
y=train['label'].values

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.20,random_state=42)

In [0]:
#pretrained a model
embedding="https://tfhub.dev/google/tf2-preview/nnlm-es-dim50-with-normalization/1"
hub_layer=hub.KerasLayer(embedding,input_shape=[],dtype=tf.string,trainable=True)

In [0]:
X_train=hub_layer(X_train)
X_valid=hub_layer(X_valid)

In [0]:
l2_model=keras.models.Sequential()
l2_model.add(hub_layer)
l2_model.add(keras.layers.Dense(32,kernel_regularizer=keras.regularizers.l2(0.001),activation='relu'))
l2_model.add(keras.layers.Dropout(0.5))
l2_model.add(keras.layers.Dense(1,activation='sigmoid'))

l2_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

l2_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 50)                48832000  
_________________________________________________________________
dense_6 (Dense)              (None, 32)                1632      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 48,833,665
Trainable params: 48,833,665
Non-trainable params: 0
_________________________________________________________________


In [0]:
l2_model.fit(X_train,y_train,epochs=5,batch_size=128,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3c9a6bb8d0>

In [0]:
l2_model.evaluate(X_valid,y_valid)



[0.3160145580768585, 0.8926767706871033]

In [0]:
k=l2_model.predict(test['clean_tweet'])

In [0]:
actual=np.where(k>=0.5,1,0)

In [0]:
submission=pd.DataFrame()
submission['id']=test['id']
submission['label']=actual

In [0]:
submission.head()

Unnamed: 0,id,label
0,7921,1
1,7922,1
2,7923,1
3,7924,1
4,7925,1


In [0]:
submission.to_csv('sample_01.csv',index=False)