# Tweet Classifier
Dataset: [Sentiment140 tweets on Kaggle](https://www.kaggle.com/kazanova/sentiment140) — tweets labelled with a sentiment of 0 (unhappy) or 4 (happy).


# Utilities, Constants

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
import tensorflow as tf
from tensorflow.keras import layers
import nlp

In [32]:

#data = data[['sentiment', 'tweet']]

In [82]:
import nltk, re, string
#nltk.download("stopwords")
# Sequence length in tokens: preprocess() pads/truncates every tweet to this
# length and create_model() passes it to the Embedding layer as input_length.
MAXLEN = 35
# Vocabulary size: the tokenizer's num_words limit and the Embedding layer's
# input dimension in create_model().
VOCAB_SIZE = 10000

def change_sentiments(data):
    """Recode the ``sentiment`` column to binary labels: 1 for happy, 0 otherwise.

    The input data uses 0 for unhappy and 4 for happy. Values that are already
    1 are kept as 1, so running this function (or the notebook cell calling it)
    a second time leaves the labels unchanged instead of resetting every row
    to 0. Any other value maps to 0.

    Args:
        data: DataFrame with a ``sentiment`` column. The column is overwritten
            in place.

    Returns:
        The same DataFrame object, with ``sentiment`` holding int64 0/1 values.
    """
    # Vectorised membership test instead of a per-row Python lambda.
    is_happy = data['sentiment'].isin([1, 4])
    data['sentiment'] = is_happy.astype('int64')
    return data

def clean_tweet(tweet: str, remove_links: bool = False) -> str:
    """Lowercase a tweet and strip all ASCII punctuation (which includes '#').

    Args:
        tweet: the raw tweet text. Non-string values are converted with
            ``str()`` before any processing.
        remove_links: when True, ``http://`` / ``https://`` URLs are removed
            before punctuation is stripped. Only the URL itself (up to the next
            whitespace) is removed; the words after it are kept. Defaults to
            False, in which case URLs merely lose their punctuation, exactly as
            before this option existed.

    Returns:
        The cleaned tweet.
    """
    # Coerce first so that .lower() cannot fail on non-string input.
    tweet = str(tweet).lower()

    if remove_links:
        # \S+ stops at the first whitespace, so text following the link survives.
        tweet = re.sub(r'https?://\S+', '', tweet)

    # string.punctuation already contains '#', so hashtags need no separate pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return tweet.translate(strip_punctuation)

def make_numpy(in_data):
    """Extract the ``tweet`` and ``sentiment`` columns of a DataFrame as NumPy arrays.

    Returns:
        A ``(tweets, sentiments)`` tuple of arrays, in that order.
    """
    wanted = ('tweet', 'sentiment')
    texts, labels = (in_data[column].to_numpy() for column in wanted)
    return texts, labels

def preprocess(tokenizer, tweets):
    """Convert tweets to fixed-length integer sequences.

    ``tweets`` is a list of strings. Each one is encoded with
    ``tokenizer.texts_to_sequences`` and then padded or truncated to ``MAXLEN``
    entries; both padding and truncation are applied at the end of the
    sequence ('post').
    """
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    return pad_sequences(
        tokenizer.texts_to_sequences(tweets),
        maxlen=MAXLEN,
        padding='post',
        truncating='post',
    )

def make_tokenizer(vocab_size, texts=None):
    """Create a Keras ``Tokenizer`` and fit it on a corpus of texts.

    Args:
        vocab_size: passed to the Tokenizer as ``num_words``.
        texts: the strings to fit the tokenizer on. When omitted, the
            module-level ``tweets`` variable is used, which keeps existing
            ``make_tokenizer(VOCAB_SIZE)`` calls working. Prefer passing the
            corpus explicitly so the result does not depend on notebook state.

    Returns:
        The fitted Tokenizer; its out-of-vocabulary token is ``'<UNK>'``.
    """
    from tensorflow.keras.preprocessing.text import Tokenizer
    if texts is None:
        # Backward-compatible fallback to the notebook-level corpus.
        texts = tweets
    # Honour the argument rather than the VOCAB_SIZE global.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token='<UNK>')
    tokenizer.fit_on_texts(texts)
    return tokenizer

def save_tokenizer(tokenizer, path='src/models/tokenizer.pickle'):
    """Pickle the tokenizer to disk for future use.

    Args:
        tokenizer: the object to serialise.
        path: destination file. Defaults to ``src/models/tokenizer.pickle``.
            Missing parent directories are created.
    """
    import os
    import pickle

    parent = os.path.dirname(path)
    if parent:
        # Avoid FileNotFoundError when the models directory does not exist yet.
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_tokenizer(path='src/models/tokenizer.pickle'):
    """Load a pickled tokenizer from disk.

    Args:
        path: file to read. Defaults to ``src/models/tokenizer.pickle``, the
            location ``save_tokenizer`` writes to by default.

    Returns:
        The unpickled object.
    """
    # Imported locally: no top-level ``import pickle`` is visible in the
    # notebook, so without this the function raises NameError on a fresh kernel.
    import pickle

    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this project, never files from untrusted sources.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def create_model():
    """Build the (uncompiled) sentiment model.

    Architecture, in order: an Embedding over ``VOCAB_SIZE`` tokens with
    16-dimensional vectors and ``input_length=MAXLEN``, two bidirectional LSTM
    layers of 40 units each (the first returns full sequences), a Dropout of
    0.2 after the embedding and after each LSTM, and a single sigmoid output
    unit.
    """
    lstm_units = 40
    drop_rate = 0.2
    stack = [
        layers.Embedding(VOCAB_SIZE, 16, input_length=MAXLEN),
        layers.Dropout(drop_rate),
        layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True)),
        layers.Dropout(drop_rate),
        layers.Bidirectional(layers.LSTM(lstm_units)),
        layers.Dropout(drop_rate),
        layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.models.Sequential(stack)

def shorten_dataset(data, n=5000, random_state=None):
    """Shuffle the dataset and keep its first ``n`` rows, e.g. for quick tests.

    Args:
        data: the DataFrame to sample from; it is not modified.
        n: number of rows to keep after shuffling. If the frame has fewer rows,
            all of them are returned (shuffled).
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            same subset can be reproduced across runs. The default (None)
            keeps the previous, unseeded behaviour.

    Returns:
        A shuffled DataFrame with at most ``n`` rows.
    """
    shuffled = data.sample(frac=1, random_state=random_state)
    return shuffled.head(n)

def batch_data(X_train, y_train, X_test, y_test, batch_size=512, buffer_size=10000):
    """Wrap the train/test arrays in batched ``tf.data`` datasets.

    The training set is shuffled and split into full batches only
    (``drop_remainder=True``), so it yields no batches at all when it has
    fewer than ``batch_size`` rows. The test set is batched in its original
    order and keeps its final partial batch, so every held-out row is
    evaluated even when the split is smaller than ``batch_size``.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.
        batch_size: rows per batch (default 512).
        buffer_size: shuffle buffer size for the training set (default 10000).

    Returns:
        A ``(dataset_train, dataset_test)`` tuple.
    """
    dataset_train = (
        tf.data.Dataset.from_tensor_slices((X_train, y_train))
        .shuffle(buffer_size)
        .batch(batch_size, drop_remainder=True)
    )
    # No shuffle and no drop_remainder here: evaluation metrics do not depend
    # on order, and dropping rows would make them cover only part of the split.
    dataset_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
    return dataset_train, dataset_test

In [57]:
# Column names are supplied via `names=`, so pandas reads the CSV as having no header row.
# SECURITY: never keep pre-signed S3 URLs (they embed temporary credentials and a
# signature) in the notebook, not even in comments. To load a remote copy, read
# the URL from an environment variable at runtime instead.
data = pd.read_csv(
    'data/tweets.csv',
    encoding='Latin-1',
    names=('sentiment', 'id', 'date', 'flag', 'username', 'tweet'),
)
# Work on a shuffled 50,000-row subset.
data = shorten_dataset(data, 50000)

In [58]:
# Recode the labels, clean the tweet text, then pull both columns out as arrays.
data = change_sentiments(data)
data['tweet'] = data['tweet'].apply(clean_tweet)
tweets, sentiments = make_numpy(data)
# NOTE: make_tokenizer is called without a corpus here, so it fits on the
# module-level `tweets` array assigned on the previous line.
tokenizer = make_tokenizer(VOCAB_SIZE)
# Integer-encode every tweet and pad/truncate it to MAXLEN entries.
padded_tweets = preprocess(tokenizer, tweets)

In [59]:
# Pickle the fitted tokenizer to disk so it can be reloaded later with load_tokenizer().
save_tokenizer(tokenizer)

In [60]:
# Build the untrained model.
model = create_model()
# Hold out 5% of the rows for validation.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(padded_tweets, sentiments, test_size=.05)
# Binary cross-entropy pairs with the single sigmoid output unit of create_model().
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Wrap both splits in batched tf.data datasets for model.fit.
dataset_train, dataset_test = batch_data(X_train, y_train, X_test, y_test)

In [61]:
# Early stopping watches the validation accuracy (validation data is supplied
# below) rather than the training accuracy.
# NOTE: with patience=2 the callback cannot stop a 2-epoch run early; it only
# starts to matter once `epochs` is raised.
h = model.fit(
    dataset_train,
    validation_data=dataset_test,
    epochs=2,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)],
)

Epoch 1/2
Epoch 2/2


In [62]:
# Persist the trained model as a single .h5 file under src/models/.
model.save('src/models/tweet_classifier.h5')

In [66]:
# Reload the saved model and the pickled tokenizer from disk.
saved_model = tf.keras.models.load_model('src/models/tweet_classifier.h5')
saved_tokenizer = load_tokenizer()

In [70]:
# One-element list holding the first cleaned tweet — presumably the sample
# input for the saved tokenizer/model in the cells that follow (TODO confirm).
test_tweets = [tweets[0]]

array([[ 28, 176,   9, 298, 995,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

array([[0.7171427]], dtype=float32)

'this show is such self parody '

'jazzminnie but ive done barely any revision so many things have bugged me that have completely put me off it '