**IMPORT DATA AND LIBRARIES**

In [1]:
import pandas as pd
import numpy as np

# Reading CSV from link
def read_csv_from_link(url):
    path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
    df = pd.read_csv(path,delimiter="\t",error_bad_lines=False, header=None)
    return df

# Loading All Data
tamil_train = read_csv_from_link('https://drive.google.com/file/d/15auwrFAlq52JJ61u7eSfnhT9rZtI5sjk/view?usp=sharing')
tamil_dev = read_csv_from_link('https://drive.google.com/file/d/1Jme-Oftjm7OgfMNLKQs1mO_cnsQmznRI/view?usp=sharing')
# mal_train = read_csv_from_link('https://drive.google.com/file/d/13JCCr-IjZK7uhbLXeufptr_AxvsKinVl/view?usp=sharing')
# mal_dev = read_csv_from_link('https://drive.google.com/file/d/1J0msLpLoM6gmXkjC6DFeQ8CG_rrLvjnM/view?usp=sharing')
# kannada_train = read_csv_from_link('https://drive.google.com/file/d/1XuOhSpdK8qsbO-lZHrIcVaU5FsCXc05T/view?usp=sharing')
# kannada_dev = read_csv_from_link('https://drive.google.com/file/d/164zYZOeXIwt5jl3NggJU0CWRyD2fRT9z/view?usp=sharing')

In [2]:
# Tamil Preprocess
tamil_train = tamil_train.iloc[:, 0:2]
tamil_train = tamil_train.rename(columns={0: "text", 1: "label"})
tamil_dev = tamil_dev.iloc[:, 0:2]
tamil_dev = tamil_dev.rename(columns={0: "text", 1: "label"})

# Stats
tamil_train['label'] = pd.Categorical(tamil_train.label)
tamil_dev['label'] = pd.Categorical(tamil_dev.label)
print(tamil_train['label'].value_counts())

Not_offensive                           25425
Offensive_Untargetede                    2906
Offensive_Targeted_Insult_Group          2557
Offensive_Targeted_Insult_Individual     2343
not-Tamil                                1454
Offensive_Targeted_Insult_Other           454
Name: label, dtype: int64


**PREPROCESS_DATA**

In [3]:
import string
import re

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def hasNumbers(inputString):
  return any(char.isdigit() for char in inputString)

def process(text):
  text = text.lower()
  text = remove_emoji(text)
  table = str.maketrans('', '', string.punctuation)
  stripped = text.translate(table)
  words = stripped.split(" ")
  now_text = ""
  for word in words:
    if(not hasNumbers(word)):
      now_text += word+" " 
  return now_text

train_text = []
for key, value in tamil_train['text'].iteritems(): 
  train_text.append(process(value))

dev_text = []
for key, value in tamil_dev['text'].iteritems(): 
  dev_text.append(process(value))
tamil_train['text'] = pd.DataFrame(train_text)
tamil_dev['text'] = pd.DataFrame(dev_text)

**LOADING SENTENCE EMBEDDINGS**

In [41]:
x_dense_train = np.load('../sentence_embeddings/cnn_skipgram_emb_train.npy')
x_dense_test = np.load('../sentence_embeddings/cnn_skipgram_emb_test.npy')

In [10]:
y_train = np.array(tamil_train["label"])
y_test = np.array(tamil_dev["label"])

In [11]:
coded = dict({'Not_offensive':0, 'Offensive_Targeted_Insult_Group':1,
       'Offensive_Targeted_Insult_Individual':2,
       'Offensive_Targeted_Insult_Other':3, 'Offensive_Untargetede':4,
       'not-Tamil' :5})

for i,j in enumerate(y_train):
    y_train[i] = coded[j]
for i,j in enumerate(y_test):
    y_test[i] = coded[j]

**TESTING THE EMBEDDINGS ON ML MODELS AS BASELINES**

**XGBOOST**

In [37]:
import xgboost as xgb
dtrain = xgb.DMatrix(x_dense_train, label=y_train)
dtest = xgb.DMatrix(x_dense_test, label=y_test)
param = {'max_depth':4 , 'eta': 0.15, 'objective': 'multi:softprob','lambda':1.5,'num_class':6}
param['nthread'] = 15
param['eval_metric'] = 'merror'

evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 300
bst = xgb.train(param, dtrain, num_round, evallist)

#ypred = bst.predict(x_testf)

[0]	eval-merror:0.27484	train-merror:0.12428
[1]	eval-merror:0.27279	train-merror:0.10831
[2]	eval-merror:0.26914	train-merror:0.10288
[3]	eval-merror:0.26892	train-merror:0.09585
[4]	eval-merror:0.27119	train-merror:0.09189
[5]	eval-merror:0.26869	train-merror:0.08811
[6]	eval-merror:0.26914	train-merror:0.08629
[7]	eval-merror:0.26960	train-merror:0.08438
[8]	eval-merror:0.26914	train-merror:0.08284
[9]	eval-merror:0.26869	train-merror:0.08091
[10]	eval-merror:0.26846	train-merror:0.07798
[11]	eval-merror:0.26823	train-merror:0.07536
[12]	eval-merror:0.26823	train-merror:0.07402
[13]	eval-merror:0.26869	train-merror:0.07149
[14]	eval-merror:0.26732	train-merror:0.06972
[15]	eval-merror:0.26664	train-merror:0.06767
[16]	eval-merror:0.26846	train-merror:0.06651
[17]	eval-merror:0.26892	train-merror:0.06483
[18]	eval-merror:0.26983	train-merror:0.06295
[19]	eval-merror:0.26983	train-merror:0.06076
[20]	eval-merror:0.27028	train-merror:0.05908
[21]	eval-merror:0.27165	train-merror:0.0577

In [38]:
from sklearn.metrics import classification_report
ypred = bst.predict(xgb.DMatrix(x_dense_test))
prediction = []
for i,j in enumerate(ypred):
    a = np.argmax(j)
    prediction.append(a)

print(classification_report(prediction, y_test))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3377
           1       0.26      0.27      0.27       286
           2       0.22      0.28      0.25       246
           3       0.02      0.06      0.02        17
           4       0.28      0.31      0.29       323
           5       0.68      0.84      0.75       139

    accuracy                           0.72      4388
   macro avg       0.39      0.43      0.41      4388
weighted avg       0.75      0.72      0.73      4388



**RANDOM FOREST**

In [50]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=30, random_state=5,n_estimators = 500 )
clf.fit(x_dense_train, y_train)
pred = clf.predict(x_dense_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.91      0.81      0.86      3578
           1       0.25      0.32      0.28       230
           2       0.19      0.32      0.24       185
           3       0.00      0.00      0.00         8
           4       0.23      0.32      0.27       250
           5       0.68      0.85      0.76       137

    accuracy                           0.74      4388
   macro avg       0.38      0.44      0.40      4388
weighted avg       0.80      0.74      0.76      4388



**SVM**

In [51]:
from sklearn import svm
clf = svm.SVC()
clf.fit(x_dense_train, y_train)
pred = clf.predict(x_dense_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3344
           1       0.26      0.27      0.27       287
           2       0.22      0.28      0.25       244
           3       0.02      0.04      0.02        25
           4       0.30      0.32      0.31       339
           5       0.69      0.80      0.74       149

    accuracy                           0.72      4388
   macro avg       0.39      0.42      0.41      4388
weighted avg       0.74      0.72      0.73      4388



**LOGISTIC REGRESSION**

In [52]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_dense_train, y_train)
pred = clf.predict(x_dense_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3346
           1       0.26      0.27      0.27       291
           2       0.22      0.27      0.24       252
           3       0.02      0.03      0.02        29
           4       0.29      0.31      0.30       328
           5       0.68      0.82      0.75       142

    accuracy                           0.72      4388
   macro avg       0.39      0.42      0.40      4388
weighted avg       0.74      0.72      0.73      4388



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
