# Import Library

In [None]:
import pandas as pd
import numpy as np
import fasttext
import fasttext.util
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, f1_score
from keras.layers.pooling import MaxPooling2D

# Data Import and Preprocess 

In [None]:

# Reading CSV from link
def read_csv_from_link(url):
    """Read a tab-separated file shared via a Google Drive link into a DataFrame.

    Args:
        url: Sharing link whose second-to-last ``/``-separated segment is the
            Drive file id (e.g. ``.../file/d/<file-id>/view?usp=sharing``).

    Returns:
        pandas.DataFrame parsed with ``header=None`` (integer column labels).
        Malformed rows are skipped instead of aborting the load.
    """
    path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        # ``on_bad_lines="warn"`` is its documented replacement (skip + warn).
        df = pd.read_csv(path, delimiter="\t", on_bad_lines="warn", header=None)
    except TypeError:
        # pandas < 1.3 rejects the unknown keyword; use the legacy flag there.
        df = pd.read_csv(path, delimiter="\t", error_bad_lines=False, header=None)
    return df

# Loading All Data: download the train, dev and test splits
tamil_train = read_csv_from_link('https://drive.google.com/file/d/15auwrFAlq52JJ61u7eSfnhT9rZtI5sjk/view?usp=sharing')
tamil_dev = read_csv_from_link('https://drive.google.com/file/d/1Jme-Oftjm7OgfMNLKQs1mO_cnsQmznRI/view?usp=sharing')
tamil_test = read_csv_from_link('https://drive.google.com/file/d/10RHrqXvIKMdnvN_tVJa_FAm41zaeC8WN/view?usp=sharing')

# Tamil Preprocess: keep the first two columns and name them "text"/"label"
column_names = {0: "text", 1: "label"}
tamil_train = tamil_train.iloc[:, 0:2].rename(columns=column_names)
tamil_dev = tamil_dev.iloc[:, 0:2].rename(columns=column_names)

# Stats: store labels as categoricals and show the train label distribution
tamil_train['label'] = pd.Categorical(tamil_train['label'])
tamil_dev['label'] = pd.Categorical(tamil_dev['label'])
print(tamil_train['label'].value_counts())

# Training Fasttext

In [None]:
import emoji
# Lowercase ASCII letters. preprocess() only squeezes trailing repeated
# characters for words whose first character is in this list; the previous
# literal skipped 's', so words starting with "s" were never squeezed.
characters = list("abcdefghijklmnopqrstuvwxyz")
def convert_emoticons(text):
    """Replace emoticons in *text* with an underscore-joined description.

    Each key of ``EMOTICONS`` is used as a regular-expression pattern; every
    match is replaced by the corresponding value with commas removed and its
    whitespace-separated words joined by ``_``.

    NOTE(review): ``EMOTICONS`` is not defined in the visible part of this
    file, and the only call to this function is commented out - confirm where
    the mapping is supposed to come from before enabling it.

    Args:
        text: Input string.

    Returns:
        The string with all emoticon patterns substituted.
    """
    # The file's import section never imports ``re``; import it here so the
    # function does not fail with a NameError when called.
    import re

    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text
def preprocess(text):
    """Normalize one sentence before embedding.

    Steps, in order: run ``emoji.demojize`` with empty delimiters, lowercase,
    turn ``_``, ``.`` and ``,`` into spaces, split on whitespace, and - for
    tokens of three or more characters whose first character is listed in
    ``characters`` - drop trailing characters while the last two are equal.

    Args:
        text: Raw sentence string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    demojized = emoji.demojize(text, delimiters=("", ""))
    #text = convert_emoticons(text)
    separators_to_space = str.maketrans('_.,', '   ')
    tokens = demojized.lower().translate(separators_to_space).split()

    cleaned = []
    for token in tokens:
        if token[0] in characters and len(token) >= 3:
            # Squeeze a run of identical trailing characters ("sooo" -> "so").
            while len(token) >= 2 and token[-1] == token[-2]:
                token = token[:-1]
        cleaned.append(token)
    return " ".join(cleaned)
    
# Clean every train/dev sentence. Series.iteritems() was removed in pandas 2.0;
# iterating the Series directly yields the same values in the same order.
train_text = [preprocess(value) for value in tamil_train['text']]
dev_text = [preprocess(value) for value in tamil_dev['text']]

# Write the cleaned sentences back row by row. A plain list is assigned
# positionally, so the result does not depend on index alignment.
tamil_train['text'] = train_text
tamil_dev['text'] = dev_text

# Collect the sentences used to train the unsupervised fastText model.
# Fixes over the previous loops: the label column is addressed by its string
# name (the bare name `label` was undefined), each frame is filtered with its
# OWN labels (the dev loop used to read the train labels), and labels are
# stripped before comparison so 'not-Tamil' matches with or without padding.
corpus = []
for frame in (tamil_train, tamil_dev):
    for i, (sen, lab) in enumerate(zip(frame['text'], frame['label'])):
        # Row 0 of each frame is skipped, as before
        # (presumably a header row - TODO confirm against the data).
        if i == 0:
            continue
        # Rows labelled not-Tamil are left out of the corpus.
        if str(lab).strip() == 'not-Tamil':
            continue
        corpus.append(preprocess(sen))

# Write one sentence per line as UTF-8 plain text. Previously str(corpus) wrote
# the Python list repr (brackets, quotes, commas) on a single line, and the
# file was opened with the platform default encoding.
corpus_path = "corpus.txt"
with open(corpus_path, "w", encoding="utf-8") as output:
    output.writelines(sentence + "\n" for sentence in corpus)

# Train unsupervised skipgram model on the file that was just written
# (the previous hard-coded absolute path could point at a different file).
unsuper_model = fasttext.train_unsupervised(corpus_path, "skipgram", dim=300)

# Train and Test set

In [None]:
# function to build vocabulary and inverse vocabulary dictionary
def build_vocab(sentences):
    """Build word<->index mappings over all tokens in *sentences*.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        ``[vocabulary, vocabulary_inv]`` where ``vocabulary_inv`` is the sorted
        list of distinct tokens and ``vocabulary`` maps each token to its
        position in that list.
    """
    # The final ordering is purely lexicographic, so frequency counting is not
    # needed; this also removes the dependency on Counter/itertools, which the
    # file never imports.
    vocabulary_inv = sorted({word for sentence in sentences for word in sentence})
    # Mapping from word to index
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

# Prepare X_train by replacing text with fasttext embeddings
def build_input_data(sentences, labels):
    """Turn tokenized sentences into fastText vectors and labels into an array.

    Every token is looked up with ``unsuper_model.get_word_vector``; the
    vectors of one sentence are stacked into one array, and all sentences are
    stacked into ``x``.

    Args:
        sentences: Iterable of token lists (presumably all of equal length so
            that the stacked result is a regular array - confirm with caller).
        labels: Labels, converted with ``np.array``.

    Returns:
        ``[x, y]``: the embedded sentences and the label array.
    """
    embedded = []
    for sentence in sentences:
        vectors = [unsuper_model.get_word_vector(token) for token in sentence]
        embedded.append(np.array(vectors))
    x = np.array(embedded)
    y = np.array(labels)
    return [x, y]

# padding sentence for uniform input size
def pad_sentences(sentences, padding_word="<PAD/>", sequence_length=15):
    """Tokenize sentences and force each one to exactly *sequence_length* tokens.

    Each sentence is stripped and split on single spaces. Longer sentences are
    truncated to their first *sequence_length* tokens; shorter ones are
    right-padded with *padding_word*.

    Args:
        sentences: Iterable of sentence strings (list, pandas Series, ...).
        padding_word: Token appended to short sentences.
        sequence_length: Target number of tokens per sentence (default 15).

    Returns:
        List of token lists, each of length *sequence_length*.

    Raises:
        ValueError: If *sequence_length* is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    padded_sentences = []
    # Iterate values directly rather than by integer position so that inputs
    # with a non-default index (e.g. a filtered Series) are handled correctly.
    for sentence in sentences:
        tokens = sentence.strip().split(" ")[:sequence_length]
        tokens += [padding_word] * (sequence_length - len(tokens))
        padded_sentences.append(tokens)
    return padded_sentences

def load_data(train_text,label):
    """Pad the sentences, embed them and return features with labels.

    The sentences go through ``pad_sentences`` and the result, together with
    *label*, through ``build_input_data``. The type of the feature object is
    printed as a side effect.

    Args:
        train_text: Sentences to pad and embed.
        label: Labels belonging to the sentences.

    Returns:
        ``[x, y]`` exactly as produced by ``build_input_data``.
    """
    padded = pad_sentences(train_text)
    #vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    features, targets = build_input_data(padded, label)
    print(type(features))
    return [features, targets]

from keras.utils import to_categorical

# Label string -> integer class id. The keys are reproduced verbatim
# (presumably they match the label spellings in the data files - verify).
coded = {'Not_offensive':0, 'Offensive_Targeted_Insult_Group':1,
       'Offensive_Targeted_Insult_Individual':2,
     'Offensive_Untargetede':3,
       'not-Tamil' :4,
             'Offensive_Targeted_Insult_Other':5}


def _prepare_split(texts, labels):
    """Embed one data split and one-hot encode its labels.

    Args:
        texts: Sentences of the split.
        labels: Label strings of the split; each must be a key of ``coded``
            (surrounding whitespace is ignored).

    Returns:
        ``(x, y)`` where ``x`` is reshaped to ``(n, 15, 300, 1)`` and ``y`` is
        a one-hot matrix with ``len(coded)`` columns.
    """
    x, y = load_data(texts, labels)
    x = np.asarray(x)
    x = x.reshape(x.shape[0], 15, 300, 1)
    class_ids = [coded[str(label).strip()] for label in y]
    # Fix the width explicitly: without num_classes, a split that lacks the
    # highest class id would get fewer columns than the other splits.
    return x, to_categorical(class_ids, num_classes=len(coded))


# Loading train set
x_train, y_train = _prepare_split(tamil_train["text"], tamil_train["label"])

# Loading dev set
x_dev, y_dev = _prepare_split(tamil_dev["text"], tamil_dev["label"])

# Loading test set
# The test text is run through preprocess() like train/dev were, so all three
# splits are embedded from identically normalized tokens.
x_test, y_test = _prepare_split([preprocess(text) for text in tamil_test[0]], tamil_test[1])

# CNN Model

In [None]:
# Import from the public keras.layers namespace; the keras.layers.pooling
# submodule path is not available in every Keras release.
from keras.layers import MaxPooling2D

# One sample is a 15 x 300 grid of values with a single channel.
inputs = Input(shape=(15, 300, 1))

conv_0 = Conv2D(64, kernel_size=5, activation='relu', padding='valid')(inputs)
conv_1 = Conv2D(32, kernel_size=3, activation='relu', padding='valid')(conv_0)
conv_2 = Conv2D(32, kernel_size=3, activation='relu', padding='valid')(conv_1)

# NOTE(review): the four tensors below are built but never feed into `output`
# (the classifier is attached to pool1, which reads conv_2 directly), so the
# Dropout is not part of either model. They are kept only so the names stay
# defined; confirm whether they were meant to be wired in.
drop = Dropout(0.6)(conv_2)
conv_3 = Conv2D(16, kernel_size=3, activation='relu')(drop)
pool0 = MaxPooling2D(pool_size=(2, 2), padding='valid')(conv_1)
conv_4 = Conv2D(16, kernel_size=3, activation='relu')(pool0)

pool1 = MaxPooling2D(pool_size=(2, 2), padding='valid')(conv_2)
flatten = Flatten()(pool1)
hidden1 = Dense(128, activation='relu')(flatten)
output = Dense(6, activation='softmax')(hidden1)

# model1 is trained on the dataset; model2 shares its layers and exposes the
# 128-unit hidden layer so its activations can be extracted as embeddings.
model1 = Model(inputs=inputs, outputs=output)
model2 = Model(inputs=inputs, outputs=hidden1)

# `learning_rate` replaces the deprecated `lr` argument; `decay=0.0` was the
# default and is dropped because newer Keras releases reject it.
adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Pass the configured optimizer object: the string 'adam' silently created a
# fresh default optimizer and ignored the settings above.
model1.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
# model2 is only used for predict(); it is compiled as before with a default
# optimizer of its own.
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training the model one epoch at a time, tracking the weighted F1 on dev.
epoch = 50
patience = 7    # consecutive epochs without a better dev F1 before stopping
cnt=0
best_f1 = 0

# The one-hot targets do not change between epochs; decode them once.
y_train_true = np.argmax(y_train, axis=1)
y_true = np.argmax(y_dev, axis=1)

for _ in range(epoch):
    model1.fit(x_train, y_train, epochs=1,validation_data = (x_dev,y_dev))

    train_prediction = np.argmax(model1.predict(x_train), axis=1)
    train_f1 = f1_score(y_train_true, train_prediction, average='weighted')
    print("train f1 - ",train_f1)

    prediction = np.argmax(model1.predict(x_dev), axis=1)
    val_f1 = f1_score(y_true, prediction, average='weighted')

    # Updating best F1 score and saving the corresponding embeddings
    if(val_f1>best_f1):
        cnt =0
        best_f1 = val_f1
        x_train_dense_cnn = model2.predict(x_train)
        x_dev_dense_cnn = model2.predict(x_dev)
        x_test_dense_cnn = model2.predict(x_test)
        np.save('/home/punyajoy/Dravidian_Offensive_Classification/sentence_embeddings/cnn_emb_dev_128_tamil.npy',x_dev_dense_cnn)
        np.save('/home/punyajoy/Dravidian_Offensive_Classification/sentence_embeddings/cnn_emb_train_128_tamil.npy',x_train_dense_cnn)
        np.save('/home/punyajoy/Dravidian_Offensive_Classification/sentence_embeddings/cnn_emb_test_128_tamil.npy',x_test_dense_cnn)
    else:
        cnt+=1

    # loop break condition; the message reports the threshold actually used
    # (it used to say 5 while the check was 7).
    if(cnt>=patience):
        print("NO increase for %d itr, breaking...." % patience)
        break