In [1]:

from nltk.corpus import stopwords 
import pandas as pd
import numpy as  np
import matplotlib as plt
import re
import string

#jupyter path
cols = ['sentiment','id','date','query_string','user','original_tweets']
df_encoding = "ISO-8859-1"
df = pd.read_csv("training.1600000.processed.noemoticon.csv",encoding =df_encoding, header=None, names=cols)
df.drop(['id','date','query_string','user'],axis=1,inplace=True)
df = df.sample(frac=0.02, replace=True, random_state=1)
df.head()

Unnamed: 0,sentiment,original_tweets
128037,0,Oh really don't wanna be awake
491755,0,Trying to amuse my cousin. It's not working! a...
470924,0,@JonasAustralia i wanted to win! congrats to ...
491263,0,That's it!! I can't take it no more!! After su...
836489,4,@beckybootsx i hope your not drinking alcohol!...


## Define usefull functions


In [2]:
import nltk
#import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

nltk.download('stopwords')
nltk.download('wordnet')


# set stop words for english language
stop_words = set(stopwords.words("english"))

# function to remove punctuation
def remove_punctuations(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

def remove_stopwords(text):
    list_of_words = [word for word in text.split(' ') if word not in stop_words]
    words_to_text = " ".join(list_of_words)
    return words_to_text

def remove_numbers(text):
    clear_text = ''.join([i for i in text if not i.isdigit()])
    return clear_text

def do_lem(text):
    text = WordNetLemmatizer().lemmatize(text)
    return text

def clean_text(text):    
    # regex dictionary
    regex = {
        "urls": r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
        "mentions": r"@[A-Za-z0-9]+",
        "hashtags": r"#[A-Za-z0-9]+",
        "whitespaces": "\s+"
    }
    
    text = str(text).lower()
    text = re.sub(regex['urls'], '', text)
    text = re.sub(regex['mentions'], '', text)
    text = re.sub(regex['hashtags'], '', text)
    text = do_lem(text)
    text = remove_stopwords(text) 
    text = remove_punctuations(text)
    text = remove_numbers(text)
    text = re.sub(regex['whitespaces'], ' ', text).strip()
    return text

def create_subsets(dataset, features, labels, num_classes, train_size=0, valid_size=0, test_size=0):
    subsets = {}
    
    # Define a size for your train set 
    train_n = int(train_size * len(dataset))
    valid_n = int(valid_size * len(dataset))
    test_n = int(test_size * len(dataset))
    
    #train_test split
    subsets['X_train'] = dataset[features][:train_n]
    subsets['X_valid'] = dataset[features][train_n:train_n+valid_n]
    subsets['X_test'] = dataset[features][train_n+valid_n:train_n+valid_n+test_n]

    # Categorically encode labels
    subsets['Y_train'] = to_categorical(dataset[labels][:train_n].values, num_classes)
    subsets['Y_valid'] = to_categorical(dataset[labels][train_n:train_n+valid_n].values, num_classes)
    subsets['Y_test'] = to_categorical(dataset[labels][train_n+valid_n:train_n+valid_n+test_n].values, num_classes)
    return subsets

def words_to_sequences(max_sentence_length, subsets):
    seq_subsets = {}
    vocab_size = 0
    tokenizer = Tokenizer()
    for key, value in subsets.items():
        if key.startswith('X'):
            # create vocabulary based on word frequency
            #   -word_counts: Dictionary of words and their corresponding counts.
            #   -word_docs: Dictionary of words and their corresponding documents appeared in.
            #   -word_index: Dictionary of words and their uniquely assigned integers.
            #   -document_count: Count of the total number of documents that were used to fit the Tokenizer.
            if key.startswith('X_train'):
                tokenizer.fit_on_texts(list(value))
            
            # texts_to_sequences assigns integers to words for each document
            sequence = tokenizer.texts_to_sequences(value)
            # padding to prepare sequences of same length
            sequence = pad_sequences(sequence, maxlen = max_sentence_length)
            seq_subsets[key] = sequence
            
            if len(tokenizer.word_index) > vocab_size:
                vocab_size = len(tokenizer.word_index)

    return seq_subsets, vocab_size+1;

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ichristod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ichristod/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Cleaning

In [3]:
# copy dataframe
df_clean = df.copy(deep=True)

# clean text
df_clean['tweets'] = df_clean['original_tweets'].apply(clean_text)

# transform labels
df_clean['sentiment'] = df_clean['sentiment'].apply(lambda x: x if x<4 else 1)

# keep max length of words and sentences
words_length = max(len(w) for w in df_clean['tweets'])
sentence_length = max(len(w.split(' ')) for w in df_clean['tweets'])

print("max chars in a tweet:", words_length)
print("max num of words in a tweet:", sentence_length)

df_clean.head(100)

max chars in a tweet: 317
max num of words in a tweet: 29


Unnamed: 0,sentiment,original_tweets,tweets
128037,0,Oh really don't wanna be awake,oh really wanna awake
491755,0,Trying to amuse my cousin. It's not working! a...,trying amuse cousin working hes playing halo wo
470924,0,@JonasAustralia i wanted to win! congrats to ...,wanted win congrats anyways
491263,0,That's it!! I can't take it no more!! After su...,thats it cant take more summer school im talki...
836489,1,@beckybootsx i hope your not drinking alcohol!...,hope drinking alcohol lol
...,...,...,...
1053847,1,Breakfast with my mommy,breakfast mommy
992577,1,"1 tut down, 123981 projects to go!!",tut down projects go
275825,0,@melody1976 I'm jealous!! I have 4 weeks to w...,im jealous weeks wait
501507,0,'s heart is aching,s heart aching


In [4]:
# remove necessary columns & reset indexes
df_clean.drop(['original_tweets'],axis=1,inplace=True)
df_clean.reset_index(drop=True)

Unnamed: 0,sentiment,tweets
0,0,oh really wanna awake
1,0,trying amuse cousin working hes playing halo wo
2,0,wanted win congrats anyways
3,0,thats it cant take more summer school im talki...
4,1,hope drinking alcohol lol
...,...,...
31995,0,neither knees give
31996,1,perfect wedding perfect couple perfect shoot l...
31997,0,piggie lucky find hugs oxox
31998,1,ooh sorry last reply meant update song about


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from collections import defaultdict

features = 'tweets'
labels = 'sentiment'

#create appropriate subsets
initial_subsets = create_subsets(dataset=df_clean, features=features, labels=labels, 
                   train_size=0.6, valid_size=0.2, test_size=0.2, num_classes=2)

# convert features (text) to sequences
seq_subsets, vocab_size = words_to_sequences(sentence_length, subsets=initial_subsets)

In [6]:
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

from keras.layers import GRU


embed_dim = 48
lstm_out = 120

model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=sentence_length,trainable=True)) 
model.add(LSTM(lstm_out, dropout=0.2,recurrent_dropout=0.2))
#model.add(GRU(100))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='Adamax',metrics = ['accuracy'])
model.summary()


model.fit(seq_subsets['X_train'], initial_subsets['Y_train'],validation_data = (seq_subsets['X_valid'],initial_subsets['Y_valid']),epochs = 20, batch_size=100)
model.evaluate(seq_subsets['X_test'],initial_subsets['Y_test'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 48)            1073424   
_________________________________________________________________
lstm (LSTM)                  (None, 120)               81120     
_________________________________________________________________
dense (Dense)                (None, 2)                 242       
Total params: 1,154,786
Trainable params: 1,154,786
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.7119863033294678, 0.7250000238418579]

In [7]:

#print(model.layers[0].get_weights()[0][1,2:])
