In [1]:
import csv
import nltk
import re
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score


DATA_PATH="../data/"
TRAIN_DATA = DATA_PATH + "train_en.tsv"
TEST_DATA = DATA_PATH + "dev_en.tsv"

In [2]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter='\t')]
    return data

def getTweets(raw):
    data = [x[1] for x in raw if x[2] == '1']
    return np.array(data)

def getTarget(raw):
    classes = [x[3] for x in raw if x[2] == '1']
    return np.array(classes)

def getAggression(raw):
    classes = [x[4] for x in raw if x[2] == '1']
    return classes

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = re.sub(r'[^a-zA-Z]', " ", tweet) # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
    return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("F1 score:   ", f1)
    print("Avg Recall: ", rec)    
    print("Accuracy:   ", acc)

In [4]:
en_stopwords = set(stopwords.words("english")) 

raw_train = readData(TRAIN_DATA) 
train_tweets = getTweets(raw_train)
train_tweets = preprocess(train_tweets)
Y_train_aggression = getAggression(raw_train)
Y_train_target = getTarget(raw_train)

raw_test = readData(TEST_DATA)
test_tweets = getTweets(raw_test)
test_tweets = preprocess(test_tweets)
Y_test_aggression = getAggression(raw_test)
Y_test_target = getTarget(raw_test)

## Data Analysis

In [4]:
# train_tweets_count = len(train_tweets)
# test_tweets_count = len(test_tweets)
# # print("total number of train tweets: %s", train_tweets_count)

# train_tweets_with_aggression = np.sum(Y_train_aggression)
# test_tweets_with_aggression = np.sum(Y_test_aggression)


# train_tweets_without_aggression = train_tweets_count - train_tweets_with_aggression
# test_tweets_without_aggression = test_tweets_count - test_tweets_with_aggression



## Embeddings Word  Level and Char  Level

In [8]:
def embeddings(typ):
    if typ == "word":
        vectorizer = CountVectorizer(
            analyzer = 'word',
            lowercase = True,
            ngram_range=(1, 3),
            stop_words = en_stopwords)
        vectorizer.fit(train_tweets)
    else:
        vectorizer = CountVectorizer(
            analyzer = 'char',
            tokenizer = tokenize,
            lowercase = True,
            ngram_range=(2, 6),
            stop_words = en_stopwords)
        vectorizer.fit(train_tweets)
    return vectorizer


## Classifier

In [9]:
def model(name, c, g):
    if name == "LR":
        classifier = LogisticRegression(C=c, solver='sag')
    else:
        classifier = SVC(C=c, gamma=g)
    return classifier

## Task Type

In [10]:
def task_type(name):
    if name == "TC":
        classifier.fit(train_features, Y_train_target)
        y_predict = classifier.predict(test_features)
        evaluate(Y_test_target, y_predict)
    else:
        classifier.fit(train_features, Y_train_aggression)
        y_predict = classifier.predict(test_features)
        evaluate(Y_test_aggression, y_predict)

# Task 1: Target Classification

## Word Level and Logistic Regression

In [17]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 10, None)
task_type("TC")

F1 score:    0.9062584314529618
Avg Recall:  0.9073476466455919
Accuracy:    0.9063231850117096




## Char Level and Logistic Regression

In [22]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 1, None)
task_type("TC")

F1 score:    0.9110119066885867
Avg Recall:  0.9115516332982087
Accuracy:    0.9110070257611241




## Word Level and SVM

In [23]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 10, 0.01)
task_type("TC")

F1 score:    0.9110206959568599
Avg Recall:  0.9111894099051633
Accuracy:    0.9110070257611241


## Char Level and SVM

In [26]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 10, 0.01)
task_type("TC")

F1 score:    0.5480425788996247
Avg Recall:  0.6184799789251845
Accuracy:    0.6088992974238876


# Task 2: Aggression Detection

## Word Level and Logistic Regression

In [32]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 10, None)
task_type("AD")



F1 score:    0.6808455581452934
Avg Recall:  0.679613998065594
Accuracy:    0.6814988290398126


## Char Level and Logistic Regression

In [35]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 0.1, None)
task_type("AD")



F1 score:    0.6737261541309245
Avg Recall:  0.6724698848149124
Accuracy:    0.6744730679156908


## Word Level and SVM

In [43]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 100, 0.001)
task_type("AD")

F1 score:    0.7041761259666591
Avg Recall:  0.7030247076409039
Accuracy:    0.7072599531615925


## Char Level and SVM

In [46]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 100, 0.001)
task_type("AD")

F1 score:    0.6652486819897004
Avg Recall:  0.6657983821331224
Accuracy:    0.6651053864168618
