In [ ]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, BatchNormalization, Flatten
from tensorflow.keras.regularizers import l2, l1, l1_l2
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_similarity_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
import pickle

In [ ]:
df = pd.read_csv('../Data/test.csv')
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)


In [ ]:
def clean(row):
    row = row.replace('.', ' ')
    row = row.replace(',', '')
    row = row.replace("'", "")
    row = re.sub("\d+", "<NUM>", row)
    row = re.sub("\*+", "<CURSE>", row)
    row = re.sub("^@.*", "<USER>", row)
    row = re.sub("^#.*", "<HASH>", row)
    row = re.sub("^((https|http|ftp|file)?:\/\/).*", "<LINK>", row)
    row = re.sub("[0-9]+:[0-9]+(am|AM|pm|PM)?", "<DATE>", row)
    row = row.lower().strip()
    return row.split()
df["test_text_split"] = df.text.apply(lambda row: clean(str(row)))


In [ ]:
test_tokenized_text = tokenizer.texts_to_sequences(df.test_text_split)

In [ ]:
test_pad_token_text = pad_sequences(test_tokenized_text,maxlen=33, padding = "post")

In [ ]:
def Baseline(vocab_size):
    model = Sequential([
        Embedding(vocab_size, 128, input_length=33),
        Bidirectional(GRU(128, return_sequences=True, dropout=0.8, recurrent_dropout=0.8)),
        Bidirectional(GRU(128,return_sequences=True, dropout=0.8, recurrent_dropout=0.8)),
        BatchNormalization(),
        Dense(64, activation='elu',kernel_regularizer=l1_l2()),
        Dropout(0.8),
        Dense(2, activation='elu'),
        Flatten(),
        Dense(2, activation='elu')

    ])
    return model

In [ ]:
model = Baseline(20000)
model.load_weights("../Modeling/tweet_sentiment.hdf5")



In [ ]:
results = model.predict(test_pad_token_text)
results

array([[ 0.8405521,  5.6428237],
       [ 2.320581 , 11.565603 ],
       [ 2.1936672, 10.834363 ],
       ...,
       [ 3.288702 , 14.129215 ],
       [ 2.3037844, 11.0190525],
       [ 1.5861752,  6.3239083]], dtype=float32)

In [ ]:
results = np.round(results)
results

array([[ 1.,  6.],
       [ 2., 12.],
       [ 2., 11.],
       ...,
       [ 3., 14.],
       [ 2., 11.],
       [ 2.,  6.]], dtype=float32)

In [ ]:
df.head()

Unnamed: 0,textID,text,sentiment,test_text_split
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,"[last, session, of, the, day, http://twitpic, ..."
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,"[shanghai, is, also, really, exciting, (precis..."
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"[recession, hit, veronique, branquinho, she, h..."
3,01082688c6,happy bday!,positive,"[happy, bday!]"
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"[http://twitpic, com/<num>w<num>p, -, i, like,..."


In [ ]:
df["final_split"] = df.text.apply(lambda x: x.split())

In [ ]:
def selecter(split_text,indices):
    try:
        return " ".join(split_text[int(indices[0][0]):int(indices[0][1])])
    except:
        return " ".join(split_text)

In [ ]:
df["selected_text"] = df.apply(lambda x: selecter(x.test_text_split,results), axis=1)

In [ ]:
df.head()

Unnamed: 0,textID,text,sentiment,test_text_split,final_split,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,"[last, session, of, the, day, http://twitpic, ...","[Last, session, of, the, day, http://twitpic.c...",session of the day http://twitpic
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,"[shanghai, is, also, really, exciting, (precis...","[Shanghai, is, also, really, exciting, (precis...",is also really exciting (precisely
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"[recession, hit, veronique, branquinho, she, h...","[Recession, hit, Veronique, Branquinho,, she, ...",hit veronique branquinho she has
3,01082688c6,happy bday!,positive,"[happy, bday!]","[happy, bday!]",bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"[http://twitpic, com/<num>w<num>p, -, i, like,...","[http://twitpic.com/4w75p, -, I, like, it!!]",com/<num>w<num>p - i like it!!


In [ ]:
df.to_csv("submission.csv",index=None,columns=["textID","selected_text"])