# Tweet Classifier
Data source: the [Sentiment140 tweet dataset](https://www.kaggle.com/kazanova/sentiment140) on Kaggle.


# Utilities, Constants

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
import tensorflow as tf
from tensorflow.keras import layers
import nlp

In [23]:
# Load the raw tweet CSV as Latin-1 text; `names` assigns the column labels
# that the rest of the notebook refers to ('sentiment' and 'tweet' in particular).
data = pd.read_csv('data/tweets.csv', encoding = 'Latin-1', names=('sentiment','id','date','flag','username','tweet'))
# Optional (currently disabled): keep only the two columns read later by make_numpy.
#data = data[['sentiment', 'tweet']]

In [26]:
def shorten_dataset(data, n=5000, random_state=None):
    """Return a shuffled subset of at most ``n`` rows of ``data``.

    Intended for quick experiments on a fraction of the full dataset.

    Args:
        data: pandas DataFrame to sample from. It is not modified.
        n: maximum number of rows to keep. If ``data`` has fewer rows,
            all of them are returned (in shuffled order).
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            same subset can be drawn again. ``None`` (the default) keeps the
            original non-deterministic shuffle.

    Returns:
        A new DataFrame holding the sampled rows; the original index labels
        are preserved.
    """
    # frac=1 shuffles every row; head(n) then truncates the shuffled frame.
    shuffled = data.sample(frac=1, random_state=random_state)
    return shuffled.head(n)
# Work on a random subset of at most 50,000 rows. `data` is rebound here, so
# re-running this cell samples again from the already-shortened frame.
data = shorten_dataset(data, 50000)

Unnamed: 0,sentiment,id,date,flag,username,tweet


In [4]:
import nltk, re, string
#nltk.download("stopwords")
# Number of tokens every tweet is padded/truncated to (used by seq_padder and
# as the Embedding layer's input_length in create_model).
MAXLEN = 35
# Vocabulary cap handed to the tokenizer and used as the Embedding input dimension.
VOCAB_SIZE = 10000

def change_sentiments(data):
    """Recode the ``sentiment`` column to binary labels, in place.

    The raw data encodes unhappy tweets as 0 and happy tweets as 4. After this
    call the column holds 0 (unhappy) and 1 (happy).

    The mapping is idempotent: a value that is already 1 stays 1, so running
    the cell a second time no longer turns every positive label into 0.
    Any value other than 1 or 4 (including missing values) becomes 0.

    Args:
        data: pandas DataFrame with a ``sentiment`` column. The column is
            overwritten on the passed-in frame.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # Vectorised membership test instead of a per-row lambda.
    data['sentiment'] = data['sentiment'].isin([1, 4]).astype('int64')
    return data

# Matches a URL up to the next whitespace only, so words after the link survive.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
# Translation table that deletes every ASCII punctuation character ('#' included).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_tweet(tweet:str):
    """Normalise a tweet: lowercase it, drop links, strip punctuation.

    Steps, in order:
      1. coerce to ``str`` and lowercase,
      2. remove ``http(s)://...`` and ``www....`` links (only up to the next
         whitespace, so the text following a link is kept),
      3. delete every character in ``string.punctuation``; this also removes
         the ``#`` of hashtags and the ``@`` of mentions while keeping the word.

    Whitespace is left untouched, so removing a link can leave a double space.

    Args:
        tweet: the raw tweet text. Non-string values are converted with
            ``str()`` first (e.g. a missing value becomes ``"nan"``).

    Returns:
        The cleaned tweet as a string.
    """
    # Coerce before lowercasing so non-string cells do not raise AttributeError.
    text = str(tweet).lower()
    # Links must go before punctuation stripping, while '://' and '.' still exist.
    text = _URL_PATTERN.sub('', text)
    return text.translate(_PUNCT_TABLE)

def make_numpy(in_data):
    """Pull the text and label columns out of a DataFrame as numpy arrays.

    Args:
        in_data: pandas DataFrame containing ``tweet`` and ``sentiment`` columns.

    Returns:
        A ``(tweets, sentiments)`` tuple of numpy arrays, in that order.
    """
    tweet_values, sentiment_values = (
        in_data[column].to_numpy() for column in ('tweet', 'sentiment')
    )
    return tweet_values, sentiment_values

def seq_padder(tokenizer, tweets, maxlen=None):
    """Encode texts as integer sequences and pad them to a fixed length.

    Args:
        tokenizer: a fitted tokenizer exposing ``texts_to_sequences``.
        tweets: iterable of tweet strings to encode.
        maxlen: target sequence length. Defaults to the notebook-level
            ``MAXLEN`` when omitted, which matches the previous behaviour.

    Returns:
        The array produced by ``pad_sequences``: every sequence is padded and
        truncated at the end ('post') to ``maxlen`` entries.

    Side effects:
        Prints the first encoded sequence as a quick sanity check. Nothing is
        printed for empty input (previously this raised ``IndexError``).
    """
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    if maxlen is None:
        maxlen = MAXLEN
    sequences = tokenizer.texts_to_sequences(tweets)
    if sequences:
        print(sequences[0])
    padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=maxlen)
    return padded

def make_tokenizer(vocab_size, texts=None):
    """Create a Keras ``Tokenizer`` and fit it on a corpus.

    Args:
        vocab_size: value passed to the tokenizer as ``num_words``. Previously
            this argument was ignored in favour of the global ``VOCAB_SIZE``.
        texts: iterable of strings to fit on. When omitted, the notebook-level
            ``tweets`` variable is used, which keeps existing
            ``make_tokenizer(VOCAB_SIZE)`` calls working unchanged.

    Returns:
        The fitted tokenizer; out-of-vocabulary words map to the ``'<UNK>'`` token.
    """
    from tensorflow.keras.preprocessing.text import Tokenizer

    if texts is None:
        # Backward-compatible fallback to the corpus defined at notebook level.
        texts = tweets
    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<UNK>')
    tokenizer.fit_on_texts(texts)
    return tokenizer

def create_model(vocab_size=None, maxlen=None, embedding_dim=16, lstm_units=40, dropout_rate=0.2):
    """Create a bidirectional LSTM model for sentiment analysis.

    Architecture: Embedding -> Dropout -> BiLSTM (returning sequences) ->
    Dropout -> BiLSTM -> Dropout -> Dense(1, sigmoid). The model is returned
    uncompiled. Calling ``create_model()`` with no arguments builds the same
    network as before.

    Args:
        vocab_size: input dimension of the embedding. Defaults to the
            notebook-level ``VOCAB_SIZE``.
        maxlen: length of the padded input sequences. Defaults to the
            notebook-level ``MAXLEN``.
        embedding_dim: size of each embedding vector.
        lstm_units: units per direction in each LSTM layer.
        dropout_rate: rate used by all three Dropout layers.

    Returns:
        A ``tf.keras`` Sequential model with a single sigmoid output unit.
    """
    if vocab_size is None:
        vocab_size = VOCAB_SIZE
    if maxlen is None:
        maxlen = MAXLEN
    model = tf.keras.models.Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
        layers.Dropout(dropout_rate),
        # First recurrent layer emits the full sequence so the second can consume it.
        layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True)),
        layers.Dropout(dropout_rate),
        layers.Bidirectional(layers.LSTM(lstm_units)),
        layers.Dropout(dropout_rate),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

def batch_data(X_train, y_train, X_test, y_test, batch_size=512, buffer_size=10000):
    """Wrap the train and test splits as batched ``tf.data`` datasets.

    The training set is shuffled and batched with ``drop_remainder=True``, as
    before. The test set is batched without shuffling and without dropping
    the final partial batch: previously up to ``batch_size - 1`` held-out
    examples were silently excluded from every evaluation.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.
        batch_size: number of examples per batch.
        buffer_size: shuffle buffer size for the training set.

    Returns:
        A ``(dataset_train, dataset_test)`` tuple.
    """
    dataset_train = (
        tf.data.Dataset.from_tensor_slices((X_train, y_train))
        .shuffle(buffer_size)
        .batch(batch_size, drop_remainder=True)
    )
    # Keep every evaluation example; order does not affect the metrics.
    dataset_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
    return dataset_train, dataset_test

In [6]:
# Preprocessing pipeline: recode the labels, clean the tweet text, then
# tokenize and pad the tweets into a fixed-width integer matrix.
data = change_sentiments(data)
data['tweet'] = data['tweet'].apply(clean_tweet)
tweets, sentiments = make_numpy(data)
# make_tokenizer is called without a corpus argument and picks up the `tweets`
# variable assigned on the previous line, so this ordering matters.
tokenizer = make_tokenizer(VOCAB_SIZE)
# seq_padder also prints the first encoded tweet (the list shown in the cell output).
padded_tweets = seq_padder(tokenizer, tweets)

[6946, 242, 394, 10, 20, 235, 187, 56, 54, 25, 36]


In [7]:
# Build the network, hold out 5% of the samples, compile for binary
# classification, and wrap both splits as batched tf.data datasets.
model = create_model()
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(padded_tweets, sentiments, test_size=.05)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
dataset_train, dataset_test = batch_data(X_train, y_train, X_test, y_test)

In [8]:
# Train for up to 20 epochs. Early stopping watches the validation loss rather
# than the training accuracy: the training metric keeps improving while the
# model overfits, so it is not a useful stopping signal. The weights from the
# best validation epoch are restored when training stops early.
h = model.fit(dataset_train, validation_data = dataset_test, epochs=20,
              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                                          restore_best_weights=True)]
             )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
18/92 [====>.........................] - ETA: 9s - loss: 0.3799 - accuracy: 0.8320 

KeyboardInterrupt: 

In [17]:
# Sanity check on a hand-written tweet: encode it with the fitted tokenizer
# and score it with the model.
# NOTE(review): the example is not passed through clean_tweet like the
# training tweets were — confirm the tokenizer's own filtering makes this equivalent.
example = ['This is not great, not awesome']
ex_seq = seq_padder(tokenizer, example)
# The model ends in a single sigmoid unit, so this yields one score per input row.
model.predict(ex_seq)

[28, 9, 26, 101, 26, 153]


array([[0.5350593]], dtype=float32)