In [None]:
import math
import multiprocessing
import os
import random as rn
import re

import keras
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec as w2v
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

from evaluate import plot_confusion_matrix,calculate_performance_metrics



# Seed every random number generator the notebook touches so that runs are
# as repeatable as the libraries allow.
SEED = 123456
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment only
# affects processes launched after this point, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
rn.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Let TensorFlow grow GPU memory on demand instead of reserving it all.
# Iterating (rather than indexing [0]) keeps the notebook runnable on CPU-only
# machines, where the device list is empty.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
class word2vec:
    """Train skip-gram Word2Vec embeddings on a collection of tweets.

    Tweets come either from a text file (one tweet per line) or from an
    in-memory iterable of strings. When both are given, the file wins.

    Parameters
    ----------
    tweet_file : str or None
        Path to a UTF-8 text file with one tweet per line.
    tweets : iterable of str or None
        Tweets to use when ``tweet_file`` is None.
    num_features : int
        Dimensionality of the learned word vectors.
    min_word_count : int
        Words occurring fewer times than this are dropped from the vocabulary.
    context_size : int
        Window size passed to Word2Vec.
    downsampling : float
        Sub-sampling threshold for frequent words.
    seed : int
        Seed passed to Word2Vec.
    epochs : int
        Number of training passes over the corpus.
    """

    def __init__(self, tweet_file=None,tweets=None, num_features=100, min_word_count=3,context_size=7,downsampling=1e-3,seed=1,epochs=12):
        self.file = tweet_file
        self.tweets = tweets
        self.num_features = num_features
        self.min_word_count = min_word_count
        self.context_size = context_size
        self.downsampling = downsampling
        self.epochs = epochs
        self.seed = seed

    def preprocess_tweets(self):
        """Tokenize every tweet into words and store the result in ``self.sentences``."""
        if self.file is not None:
            # The context manager guarantees the file handle is released.
            with open(self.file, "r", encoding="utf8") as handle:
                self.sentences = [nltk.word_tokenize(line) for line in handle]
        else:
            self.sentences = [nltk.word_tokenize(tweet) for tweet in self.tweets]

    def make_model(self):
        """Build the vocabulary and train the Word2Vec model on ``self.sentences``.

        The trained model is stored in ``self.tweet2vec``.
        """
        # `size` is the gensim < 4 keyword for the vector dimensionality
        # (gensim 4 renamed it to `vector_size`).
        # NOTE(review): with more than one worker, thread scheduling can make
        # results differ between runs even with a fixed seed.
        self.tweet2vec = w2v(
            sg=1,
            seed=self.seed,
            workers=multiprocessing.cpu_count(),
            size=self.num_features,
            min_count=self.min_word_count,
            window=self.context_size,
            sample=self.downsampling,
        )

        # Build the vocabulary
        self.tweet2vec.build_vocab(self.sentences)
        # Train the model for the configured number of epochs. This was
        # previously hard-coded to 12, silently ignoring the `epochs` argument.
        self.tweet2vec.train(
            self.sentences,
            epochs=self.epochs,
            total_examples=len(self.sentences),
        )

    def run(self):
        """Tokenize the tweets, then train the embedding model."""
        self.preprocess_tweets()
        self.make_model()

In [None]:
# Load the tweet/emoji dataset and preview the first rows.
df = pd.read_csv(
    'datasets/balanced_data.csv',
    sep=',',
    index_col=False,
)
df.head()

In [None]:
# Patterns and the translation table are built once, not on every call.
# Raw strings avoid the invalid "\s" / "\:" escape-sequence warnings.
_HASHTAGS_REGEX = re.compile(r'#')
_MENTIONS_REGEX = re.compile(r'@[^\s]+')
_LINK_REGEX = re.compile(r'https?://[^\s]+')
# Code points U+007B..U+FFFF: "{", "|", "}", "~", DEL and all non-ASCII BMP characters.
# NOTE(review): characters above U+FFFF (most emoji) are NOT matched and
# therefore survive cleaning - confirm this is intended.
_HIGH_CHARS_REGEX = re.compile('[\u007B-\uFFFF]')
# Punctuation, digits, and the two apostrophe forms (so "can't" becomes "cant").
_STRIP_TABLE = str.maketrans(
    dict.fromkeys(r'!"$%^&*()_+~-={}|[]\:";<>,.?/' + '0123456789' + "'`", "")
)


def clean_data(text):
    """Normalize a raw tweet for tokenization.

    Steps, in order: drop "#" characters (the hashtag word is kept), remove
    @mentions, remove http/https links, delete punctuation, digits and
    apostrophes, delete characters in U+007B..U+FFFF, and lower-case the rest.
    Whitespace is left untouched, so removed tokens may leave double spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    text = _HASHTAGS_REGEX.sub('', text)
    text = _MENTIONS_REGEX.sub('', text)
    # Links must be removed before punctuation stripping breaks up the URL.
    text = _LINK_REGEX.sub('', text)
    text = text.translate(_STRIP_TABLE)
    clean_text = _HIGH_CHARS_REGEX.sub('', text)
    return clean_text.lower()

In [None]:
# Clean every tweet (overwrites the `text` column) and preview the result.
df['text'] = df['text'].map(clean_data)
df.head()

In [None]:
# Number of distinct emoji labels; used as the width of the one-hot targets
# and of the classifier's output layer.
distinct_emojis = set(df['emoji'])
tot_classes = len(distinct_emojis)
print(tot_classes)

## Training word-embeddings

In [None]:
# Dimensionality of the word embeddings; also the input width of the MLP.
VECTOR_SIZE = 100

# Train the embeddings on the cleaned tweets. VECTOR_SIZE is passed through
# (instead of repeating the literal 100) so that the embedding size and the
# network input size cannot drift apart.
w2vec = word2vec(
    tweets=df.text,
    num_features=VECTOR_SIZE,
    min_word_count=3,
    context_size=5,
    downsampling=1e-3,
    seed=1,
    epochs=1000,
)
w2vec.run()


In [None]:
# Sanity check: list the vocabulary words closest to a probe word.
w = 'red'
nearest_words = w2vec.tweet2vec.wv.most_similar(positive=w)
print(nearest_words)

In [None]:
# Positional (unshuffled) split of the dataset:
#   first 90%  -> "all_train", last 10% -> test
#   all_train is split again 90/10 into train and validation.
tweets, labels = list(df.text), list(df.emoji)

N = int(0.9*len(tweets))
all_train_tweets, test_tweets = tweets[:N], tweets[N:]
all_train_labels, test_labels = labels[:N], labels[N:]

val_N = int(0.9*len(all_train_tweets))
train_tweets, val_tweets = all_train_tweets[:val_N], all_train_tweets[val_N:]
train_labels, val_labels = all_train_labels[:val_N], all_train_labels[val_N:]

In [None]:
def get_vector(li):
    """Turn tweets into fixed-size feature vectors by averaging word embeddings.

    Each tweet is tokenized with ``nltk.word_tokenize`` (the same tokenizer
    used to train the embeddings). Items that are not strings are treated as
    already-tokenized word sequences. The feature vector of a tweet is the mean
    of the embeddings of its in-vocabulary words; a tweet with no
    in-vocabulary word maps to the zero vector rather than NaN.

    Parameters
    ----------
    li : iterable of str (or of word sequences)
        Tweets to vectorize.

    Returns
    -------
    (numpy.ndarray, int)
        A ``(n_tweets, vector_size)`` float array and the largest token count
        seen among the tweets.
    """
    wv = w2vec.tweet2vec.wv
    dim = wv.vector_size
    features = []
    max_len = 0
    for tweet in li:
        # Iterating a raw string would yield characters, not words, so
        # strings are tokenized first.
        tokens = nltk.word_tokenize(tweet) if isinstance(tweet, str) else list(tweet)
        max_len = max(max_len, len(tokens))
        # `word in wv` works on both gensim 3 and gensim 4 keyed vectors.
        known_vectors = [wv[word] for word in tokens if word in wv]
        if known_vectors:
            features.append(np.mean(known_vectors, axis=0, dtype=np.float64))
        else:
            features.append(np.zeros(dim))
    if not features:
        # Keep a 2-D shape even when there is nothing to vectorize.
        return np.zeros((0, dim)), max_len
    return np.vstack(features), max_len

In [None]:
# Despite its name, `max_length` is the rounded-up *mean* number of
# space-separated pieces per tweet across the training and test tweets.
all_tweets = all_train_tweets + test_tweets
pieces_per_tweet = [len(s.split(" ")) for s in all_tweets]
max_length = math.ceil(sum(pieces_per_tweet)/len(all_tweets))


In [None]:
# Averaged-embedding feature matrices for each data split; the second value
# returned by get_vector is the longest tweet length seen in that split.
X_train,     max_len_train     = get_vector(train_tweets)
X_val,       max_len_val_test  = get_vector(val_tweets)
X_test,      max_len_test      = get_vector(test_tweets)
X_all_train, max_len_all_train = get_vector(all_train_tweets)


In [None]:
# One-hot encode the labels of every split for the Keras model.
# assumes labels are integers in [0, tot_classes) - TODO confirm
y_train     = np_utils.to_categorical(train_labels, tot_classes)
y_val       = np_utils.to_categorical(val_labels, tot_classes)
y_test      = np_utils.to_categorical(test_labels, tot_classes)
y_all_train = np_utils.to_categorical(all_train_labels, tot_classes)

## Multilayer Perceptron - MLP

In [None]:
# Feed-forward classifier over the averaged tweet embeddings:
# three ReLU hidden layers (400 -> 128 -> 22), each followed by 25% dropout,
# and a softmax output with one unit per emoji class.
model = Sequential()
model.add(Dense(400, activation='relu', input_dim=VECTOR_SIZE))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(22, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(tot_classes, activation='softmax'))

adam = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.99)

model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Train with roughly 100 batches' worth of the full training pool per batch.
# max(1, ...) keeps the batch size valid when there are fewer than 100 rows
# (integer division would otherwise yield 0).
batch_size = max(1, X_all_train.shape[0]//100)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=4000,
    batch_size=batch_size,
)

In [None]:
# Sequential.predict_classes() was removed from recent TensorFlow/Keras
# releases; for a softmax output the arg-max over the class axis is equivalent.
mlp_pred = np.argmax(model.predict(X_test), axis=-1)
calculate_performance_metrics(test_labels,mlp_pred,p_average="weighted",
                              r_average="weighted",f1_average="weighted",normalize_cm="true",figsize=(7,7))

In [None]:
# Reset Keras' global backend state now that the MLP experiment is finished,
# before moving on to the scikit-learn models below.
keras.backend.clear_session()

## Support Vector Machine - SVM

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM fitted on the full training pool, scored on the held-out test set.
svm = LinearSVC(random_state=0)
svm_pred = svm.fit(X_all_train, all_train_labels).predict(X_test)
calculate_performance_metrics(
    test_labels,
    svm_pred,
    p_average="weighted",
    r_average="weighted",
    f1_average="weighted",
    normalize_cm="true",
    figsize=(7,7),
)

## Decision Tree - DT

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the full training pool, scored on the held-out test set.
dt = DecisionTreeClassifier(random_state=0)
dt_pred = dt.fit(X_all_train, all_train_labels).predict(X_test)
calculate_performance_metrics(
    test_labels,
    dt_pred,
    p_average="weighted",
    r_average="weighted",
    f1_average="weighted",
    normalize_cm="true",
    figsize=(7,7),
)

## Random Forest Classifier - RF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (200 trees, depth <= 10) fitted on the full training pool,
# scored on the held-out test set.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=0)
rf_pred = rf.fit(X_all_train, all_train_labels).predict(X_test)
calculate_performance_metrics(
    test_labels,
    rf_pred,
    p_average="weighted",
    r_average="weighted",
    f1_average="weighted",
    normalize_cm="true",
    figsize=(7,7),
)

## K Nearest Neighbours - KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours (k = 500) fitted on the full training pool,
# scored on the held-out test set.
knn = KNeighborsClassifier(n_neighbors = 500)
knn_pred = knn.fit(X_all_train, all_train_labels).predict(X_test)
calculate_performance_metrics(
    test_labels,
    knn_pred,
    p_average="weighted",
    r_average="weighted",
    f1_average="weighted",
    normalize_cm="true",
    figsize=(7,7),
)

## Baseline 

In [None]:
from baseline import BaseLine

# Project-defined baseline model, fitted and scored exactly like the
# classifiers above so the results are directly comparable.
m = BaseLine()
m.fit(X_all_train, all_train_labels)
base_pred = m.predict(X_test)
calculate_performance_metrics(
    test_labels,
    base_pred,
    p_average="weighted",
    r_average="weighted",
    f1_average="weighted",
    normalize_cm="true",
    figsize=(7,7),
)