In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)
import os
root_path = 'gdrive/My Drive/EACL/'
os.chdir(root_path)

Mounted at /content/gdrive


In [None]:
!pip install transformers
!pip install demoji
!pip install nltk

In [2]:
import numpy as np
import pandas as pd

import copy
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score 
from tqdm import tqdm
import demoji
import nltk
import string
import pickle
import math
import numpy as np
import sys
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

demoji.download_codes() 
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams.update({'font.size': 16})
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Downloading emoji data ...
... OK (Got response in 0.12 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [3]:
class Tokenizer():
    def __init__(self):
        self.index = {}
        self.tf_idf_index = {}
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))

    def remove_punc(self, text):
        return ''.join([ch for ch in text if str(ch).isalpha() or ch == ' '])
    
    def remove_stop(self, text):
        return ' '.join([word for word in text.lower().split() if word not in self.stopwords])
    
    def get_wordnet_pos(self, word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)

    def lemmatize(self, text):
        # return [self.wordnet_lemmatizer.lemmatize(w, self.get_wordnet_pos(w)) for w in nltk.word_tokenize(text)]
        return [self.wordnet_lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text)]

    def build_index(self, article_id, tokenized):
        for (idx, token) in enumerate(tokenized):
            if token not in self.index.keys():
                self.index[token] = {}
            if article_id not in self.index[token].keys():
                self.index[token][article_id] = []
            self.index[token][article_id].append(idx+1)

In [4]:
class Dataset():
    def __init__(self, train_data, val_data, tokenizer, batch_size = 32):
        # self.train_data = train_data
        # self.val_data = val_data
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.label_dict = {'Not_offensive': 0,
                    'Offensive_Targeted_Insult_Group': 3,
                    'Offensive_Targeted_Insult_Individual': 5,
                    'Offensive_Targeted_Insult_Other': 2,
                    'Offensive_Untargetede': 4,
                    'not-Kannada': 1}
        
        self.sentences_train = []
        self.sentences_test = []

        self.y_train = []
        self.y_test = []

        self.process_train(train_data)
        self.process_test(val_data)
        
        vectorizer = CountVectorizer()
        self.vec = vectorizer.fit(self.sentences_train)

        self.X_train = self.vec.transform(self.sentences_train)
        self.X_test = self.vec.transform(self.sentences_test)

    def process_train(self, data):  
        tokens = []

        for article_id, line in enumerate(data):
            sentence = line.strip().split('\t')
            label = sentence.pop()
            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)
            sentence = ' '.join(sentence)
            emoji_dict = demoji.findall(sentence)
            if len(emoji_dict): 
                for emoji, text in emoji_dict.items():
                    sentence = sentence.replace(emoji, ' '+text+' ')
                    sentence = ' '.join(sentence.split())
            cleaned_text = tokenizer.remove_punc(sentence)
            removed_stop = tokenizer.remove_stop(cleaned_text)
            tokenized = tokenizer.lemmatize(removed_stop)
            self.sentences_train.append(' '.join(tokenized))
            self.y_train.append(label)
        
    def process_test(self, data):
        tokens = []
        
        for article_id, line in enumerate(data):
            sentence = line.strip().split('\t')
            label = sentence.pop()
            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)
            sentence = ' '.join(sentence)
            emoji_dict = demoji.findall(sentence)
            if len(emoji_dict): 
                for emoji, text in emoji_dict.items():
                    sentence = sentence.replace(emoji, ' '+text+' ')
                    sentence = ' '.join(sentence.split())
            cleaned_text = tokenizer.remove_punc(sentence)
            removed_stop = tokenizer.remove_stop(cleaned_text)
            tokenized = tokenizer.lemmatize(removed_stop)
            self.sentences_test.append(' '.join(tokenized))
            self.y_test.append(label)

In [5]:
tokenizer = Tokenizer()
with open('Dataset/kannada_offensive_train.csv', 'r') as f:
    train_data = f.readlines()
with open('Dataset/kannada_offensive_dev.csv', 'r') as f:
    val_data = f.readlines()
data = Dataset(train_data, val_data, tokenizer)

In [8]:
mult_bayes_results = {}
ber_bayes_results = {}

X_train, y_train = data.X_train, np.array(data.y_train)
X_test, y_test = data.X_test, np.array(data.y_test)
K = [1000, 5000, 10000, X_train.shape[0]]

print(X_train.shape)
for k in K:
    X = SelectKBest(mutual_info_classif,k=k).fit(X_train,y_train)
    X_train_new = X.transform(X_train)
    X_test_new = X.transform(X_test)
    print(f'Running Bayes Models on k = {k}............')
    # best_feature_idxs = data.best_features[:k]
    # X_train_new = X_train
    # X_test_new = X_test
    
    clf = MultinomialNB()
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)
    mult_bayes_results[k] = f1_score(y_test, y_pred, average = 'weighted')
    
    clf = BernoulliNB()
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)
    ber_bayes_results[k] = f1_score(y_test, y_pred, average = 'weighted')
    print('Done')

print(mult_bayes_results)
print(ber_bayes_results)

(6217, 14054)
Running Bayes Models on k = 1000............
Done
Running Bayes Models on k = 5000............
Done
Running Bayes Models on k = 10000............
Done
Running Bayes Models on k = 6217............
Done
{1000: 0.6236195286550859, 5000: 0.6292425212070245, 10000: 0.6302799592802667, 6217: 0.6271811867071789}
{1000: 0.6226093559820003, 5000: 0.625301026952548, 10000: 0.5256274286389662, 6217: 0.6025827195167139}


In [6]:
svm_results = {}

X_train, y_train = data.X_train, np.array(data.y_train)
X_test, y_test = data.X_test, np.array(data.y_test)

alphas = [1, 0.1, 0.01, 0.001]
random_states = [5, 20, 40]
max_iters = [10, 15, 20]

print(X_train.shape)
for alpha in alphas:
    for random_state in random_states:
        for max_iter in max_iters:
            print(f'Running SVM Model on = {alpha, random_state, max_iter}............')
            X_train_new = X_train
            X_test_new = X_test

            clf = LinearSVC(C = alpha, random_state = random_state, max_iter = max_iter)
            clf.fit(X_train_new, y_train)
            y_pred = clf.predict(X_test_new)
            svm_results[(alpha, random_state, max_iter)] = f1_score(y_test, y_pred, average = 'weighted')

            print('Done')

print(svm_results)

(6217, 14054)
Running SVM Model on = (1, 5, 10)............




Done
Running SVM Model on = (1, 5, 15)............
Done
Running SVM Model on = (1, 5, 20)............
Done
Running SVM Model on = (1, 20, 10)............
Done
Running SVM Model on = (1, 20, 15)............
Done
Running SVM Model on = (1, 20, 20)............
Done
Running SVM Model on = (1, 40, 10)............
Done
Running SVM Model on = (1, 40, 15)............
Done
Running SVM Model on = (1, 40, 20)............
Done
Running SVM Model on = (0.1, 5, 10)............
Done
Running SVM Model on = (0.1, 5, 15)............
Done
Running SVM Model on = (0.1, 5, 20)............
Done
Running SVM Model on = (0.1, 20, 10)............
Done
Running SVM Model on = (0.1, 20, 15)............
Done
Running SVM Model on = (0.1, 20, 20)............
Done
Running SVM Model on = (0.1, 40, 10)............
Done
Running SVM Model on = (0.1, 40, 15)............
Done
Running SVM Model on = (0.1, 40, 20)............
Done
Running SVM Model on = (0.01, 5, 10)............
Done
Running SVM Model on = (0.01, 5, 15)........

In [7]:
rf_results = {}

X_train, y_train = data.X_train, np.array(data.y_train)
X_test, y_test = data.X_test, np.array(data.y_test)
K = [100, 200, 500, 1000]

print(X_train.shape)
for k in K:
#     X = SelectKBest(mutual_info_classif,k=k).fit(X_train,y_train)
#     X_train_new = X.transform(X_train)
#     X_test_new = X.transform(X_test)
    print(f'Running Bayes Models on k = {k}............')
#     # best_feature_idxs = data.best_features[:k]
    X_train_new = X_train
    X_test_new = X_test

    clf = RandomForestClassifier(n_estimators = k)
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)
    rf_results[k] = f1_score(y_test, y_pred, average = 'weighted')

    print('Done')

print(rf_results)

(6217, 14054)
Running Bayes Models on k = 100............
Done
Running Bayes Models on k = 200............
Done
Running Bayes Models on k = 500............
Done
Running Bayes Models on k = 1000............
Done
{100: 0.6121756364185973, 200: 0.616240200740094, 500: 0.6138774132457739, 1000: 0.6104310734534301}
