In [1]:

from nltk.corpus import stopwords 
import pandas as pd
import numpy as  np
import matplotlib as plt
import re
import string

#jupyter path
cols = ['sentiment','id','date','query_string','user','original_tweets']
df_encoding = "ISO-8859-1"
df = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding =df_encoding, header=None, names=cols)
df.drop(['id','date','query_string','user'],axis=1,inplace=True)
df = df.sample(frac=0.05, replace=True, random_state=1)
df.head()

Unnamed: 0,sentiment,original_tweets
128037,0,Oh really don't wanna be awake
491755,0,Trying to amuse my cousin. It's not working! a...
470924,0,@JonasAustralia i wanted to win! congrats to ...
491263,0,That's it!! I can't take it no more!! After su...
836489,4,@beckybootsx i hope your not drinking alcohol!...


##  Tweets Preprocessing
   ###   Cleaning


In [2]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = set(stopwords.words("english"))

df_clean = df.copy(deep=True)

# regex to identify URLs
regex_url = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
# regex to identify mentions
regex_mention = r"@[A-Za-z0-9]+"
# regex to identify hashtags
regex_hashtags = r"#[A-Za-z0-9]+"
# # regex to identify leading & trailing whitespaces
regex_whitespaces = "\s+"
# set stop words for english language
stop_words = set(stopwords.words('english')) 

def remove_punctuations(text):
    filtered_text = text.translate(str.maketrans('', '', string.punctuation))
    return filtered_text

def Find(string):
  
    # findall() has been used 
    # with valid conditions for urls in string
    regex = r'[^\w\s,]'
    url = re.findall(regex,string)      
    return [x[0] for x in url]

def remove_stopwords(text):
    list_of_words = [word for word in text.split(' ') if word not in stop_words]
    words_to_text = " ".join(list_of_words)
    return words_to_text

def remove_numbers(text):
    clear_text = ''.join([i for i in text if not i.isdigit()])
    return clear_text

def clean_text(text):
    # transform all tweets to lowercase
    text = str(text).lower()
    text = re.sub(regex_url, '', text)
    text = re.sub(regex_mention, '', text)
    text = re.sub(regex_hashtags, '', text)
    text = remove_stopwords(text) 
    text = remove_punctuations(text)
    text = remove_numbers(text)
    text = re.sub(regex_whitespaces, ' ', text).strip()

    return text

df_clean['tweets'] = df_clean['original_tweets'].apply(clean_text)
df_clean['sentiment'] = df_clean['sentiment'].apply(lambda x: x if x<4 else 1)

words_length = max(len(w) for w in df_clean['tweets'])
sentence_length = max(len(w.split(' ')) for w in df_clean['tweets'])

print("max chars in a tweet:", words_length)
print("max num of words in a tweet:", sentence_length)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ichristod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


max chars in a tweet: 317
max num of words in a tweet: 33


In [3]:
df_clean.head(100)

Unnamed: 0,sentiment,original_tweets,tweets
128037,0,Oh really don't wanna be awake,oh really wanna awake
491755,0,Trying to amuse my cousin. It's not working! a...,trying amuse cousin working hes playing halo wo
470924,0,@JonasAustralia i wanted to win! congrats to ...,wanted win congrats anyways
491263,0,That's it!! I can't take it no more!! After su...,thats it cant take more summer school im talki...
836489,1,@beckybootsx i hope your not drinking alcohol!...,hope drinking alcohol lol
...,...,...,...
1053847,1,Breakfast with my mommy,breakfast mommy
992577,1,"1 tut down, 123981 projects to go!!",tut down projects go
275825,0,@melody1976 I'm jealous!! I have 4 weeks to w...,im jealous weeks wait
501507,0,'s heart is aching,s heart aching


In [4]:
df_clean.drop(['original_tweets'],axis=1,inplace=True)
df_clean.reset_index(drop=True)

Unnamed: 0,sentiment,tweets
0,0,oh really wanna awake
1,0,trying amuse cousin working hes playing halo wo
2,0,wanted win congrats anyways
3,0,thats it cant take more summer school im talki...
4,1,hope drinking alcohol lol
...,...,...
79995,0,nk agreemade sad hear
79996,1,milano awwww cute whats name
79997,1,hey amazing voice love hair accent please twee...
79998,1,hi sent email prefect blazer hope helps


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


# Define a size for your train set 
train_size = int(0.6 * len(df_clean))
validation_size = int(0.2 * len(df_clean))
test_size = int(0.2 * len(df_clean))

num_classes = 2

#train_test split
X_train = df_clean['tweets'][:train_size]
X_validation = df_clean['tweets'][train_size:]
X_test = df_clean['tweets'][train_size:]

# Categorically encode labels
Y_train = to_categorical(df_clean['sentiment'][:train_size].values, num_classes)
Y_test = to_categorical(df_clean['sentiment'][train_size:].values, num_classes)

# create vocabulary based on word frequency
#   -word_counts: Dictionary of words and their corresponding counts.
#   -word_docs: Dictionary of words and their corresponding documents appeared in.
#   -word_index: Dictionary of words and their uniquely assigned integers.
#   -document_count: Count of the total number of documents that were used to fit the Tokenizer.
tokenizer_train = Tokenizer()
tokenizer_test = Tokenizer()
tokenizer_train.fit_on_texts(list(X_train))
tokenizer_test.fit_on_texts(list(X_test))

# texts_to_sequences assigns integers to words for each document
X_train_seq  = tokenizer_train.texts_to_sequences(X_train) 
X_test_seq = tokenizer_test.texts_to_sequences(X_test)

# padding to prepare sequences of same length
X_train_seq  = pad_sequences(X_train_seq, maxlen=sentence_length)
X_test_seq = pad_sequences(X_test_seq, maxlen=sentence_length)

In [18]:
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding


embed_dim = 128
lstm_out = 256
vocab_size = max(len(tokenizer_train.word_index),len(tokenizer_train.word_index))+1

model = Sequential()
model.add(Embedding(vocab_size,embed_dim,input_length=sentence_length,trainable=True)) 
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
model.summary()

Le = LabelEncoder()
y = Le.fit_transform(df_clean['sentiment'])
model.fit(X_train_seq, Y_train,validation_data = (X_test_seq,Y_test),epochs = 10, batch_size=128)
model.evaluate(X_test_seq,Y_test)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 33, 128)           5151488   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               788480    
_________________________________________________________________
dense_5 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 1026      
Total params: 6,203,650
Trainable params: 6,203,650
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 