In [78]:
import csv
import nltk
import re
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score


DATA_PATH="../data/"
TRAIN_DATA = DATA_PATH + "train_en.tsv"
TEST_DATA = DATA_PATH + "dev_en.tsv"

In [79]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter='\t')]
    return data

def getTweets(raw):
    data = [x[1] for x in raw if x[2] == '1']
    return np.array(data)

def getTarget(raw):
    classes = [x[3] for x in raw if x[2] == '1']
    return np.array(classes)

def getAggression(raw):
    classes = [x[4] for x in raw if x[2] == '1']
    return classes

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data, removeMention):
    cleanData = []
    for tweet in data:
        if removeMention:
            tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = re.sub(r'[^a-zA-Z]', " ", tweet) # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
    return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("F1 score:   ", f1)
    print("Avg Recall: ", rec)    
    print("Accuracy:   ", acc)

In [80]:
en_stopwords = set(stopwords.words("english")) 

raw_train = readData(TRAIN_DATA) 
train_tweets_dirty = getTweets(raw_train)
train_tweets = preprocess(train_tweets_dirty, True)
train_tweets_with_mentions = preprocess(train_tweets_dirty, False)
Y_train_aggression = getAggression(raw_train)
Y_train_target = getTarget(raw_train)

raw_test = readData(TEST_DATA)
test_tweets_dirty = getTweets(raw_test)
test_tweets = preprocess(test_tweets_dirty, True)
test_tweets_with_mentions = preprocess(test_tweets_dirty, False)
Y_test_aggression = getAggression(raw_test)
Y_test_target = getTarget(raw_test)

## Data Analysis

In [81]:
# train_tweets_count = len(train_tweets)
# test_tweets_count = len(test_tweets)
# # print("total number of train tweets: %s", train_tweets_count)

# train_tweets_with_aggression = np.sum(Y_train_aggression)
# test_tweets_with_aggression = np.sum(Y_test_aggression)


# train_tweets_without_aggression = train_tweets_count - train_tweets_with_aggression
# test_tweets_without_aggression = test_tweets_count - test_tweets_with_aggression



## Embeddings Word  Level and Char  Level

In [82]:
def embeddings(typ):
    if typ == "word":
        vectorizer = CountVectorizer(
            analyzer = 'word',
            lowercase = True,
            ngram_range=(1, 3),
            stop_words = en_stopwords)
        vectorizer.fit(train_tweets)
    else:
        vectorizer = CountVectorizer(
            analyzer = 'char',
            tokenizer = tokenize,
            lowercase = True,
            ngram_range=(2, 6),
            stop_words = en_stopwords)
        vectorizer.fit(train_tweets)
    return vectorizer


## Classifier

In [83]:
def model(name, c, g):
    if name == "LR":
        classifier = LogisticRegression(C=c, solver='sag')
    else:
        classifier = SVC(C=c, gamma=g)
    return classifier

## Task Type

In [84]:
def task_type(name):
    if name == "TC":
        classifier.fit(train_features, Y_train_target)
        y_predict = classifier.predict(test_features)
        evaluate(Y_test_target, y_predict)
    else:
        classifier.fit(train_features, Y_train_aggression)
        y_predict = classifier.predict(test_features)
        evaluate(Y_test_aggression, y_predict)

# Task 1: Target Classification

## Word Level and Logistic Regression

In [85]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 10, None)
task_type("TC")

F1 score:    0.9062584314529618
Avg Recall:  0.9073476466455919
Accuracy:    0.9063231850117096




In [86]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("LR", 10, None)
task_type("TC")

F1 score:    0.906282077053579
Avg Recall:  0.9072269055145767
Accuracy:    0.9063231850117096




## Char Level and Logistic Regression

In [87]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 1, None)
task_type("TC")

F1 score:    0.9063283228300912
Avg Recall:  0.9068646821215314
Accuracy:    0.9063231850117096




In [88]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("LR", 1, None)
task_type("TC")

F1 score:    0.9110187405010414
Avg Recall:  0.9114308921671935
Accuracy:    0.9110070257611241




## Word Level and SVM

In [89]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 10, 0.01)
task_type("TC")

F1 score:    0.9110206959568599
Avg Recall:  0.9111894099051633
Accuracy:    0.9110070257611241


In [90]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("SVM", 10, 0.01)
task_type("TC")

F1 score:    0.9110158152871344
Avg Recall:  0.9110686687741483
Accuracy:    0.9110070257611241


## Char Level and SVM

In [91]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 100, 0.001)
task_type("TC")

F1 score:    0.8825371874839769
Avg Recall:  0.8846373375482964
Accuracy:    0.882903981264637


In [92]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("SVM", 100, 0.001)
task_type("TC")

F1 score:    0.9062306636852869
Avg Recall:  0.907468387776607
Accuracy:    0.9063231850117096


# Task 2: Aggression Detection

## Word Level and Logistic Regression

Without Mentions

In [93]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 10, None)
task_type("AD")



F1 score:    0.6808455581452934
Avg Recall:  0.679613998065594
Accuracy:    0.6814988290398126


With Mentions

In [94]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("LR", 10, None)
task_type("AD")



F1 score:    0.6664190053283398
Avg Recall:  0.6651169436384419
Accuracy:    0.667447306791569


## Char Level and Logistic Regression

Without Mentions

In [95]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("LR", 0.1, None)
task_type("AD")



F1 score:    0.6737261541309245
Avg Recall:  0.6724698848149124
Accuracy:    0.6744730679156908


With Mentions

In [96]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("LR", 0.1, None)
task_type("AD")



F1 score:    0.664884276704065
Avg Recall:  0.6639189308010199
Accuracy:    0.6651053864168618


## Word Level and SVM

Without Mentions

In [97]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 100, 0.001)
task_type("AD")

F1 score:    0.7041761259666591
Avg Recall:  0.7030247076409039
Accuracy:    0.7072599531615925


With Mentions

In [98]:
vectorizer = embeddings("word")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("SVM", 100, 0.001)
task_type("AD")

F1 score:    0.6874528426386869
Avg Recall:  0.6864943286731733
Accuracy:    0.6908665105386417


## Char Level and SVM

Without Mentions

In [99]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets)
test_features = vectorizer.transform(test_tweets)
classifier = model("SVM", 100, 0.001)
task_type("AD")

F1 score:    0.6652486819897004
Avg Recall:  0.6657983821331224
Accuracy:    0.6651053864168618


With Mentions

In [100]:
vectorizer = embeddings("char")
train_features = vectorizer.transform(train_tweets_with_mentions)
test_features = vectorizer.transform(test_tweets_with_mentions)
classifier = model("SVM", 100, 0.001)
task_type("AD")

F1 score:    0.6697892271662763
Avg Recall:  0.6711179987690143
Accuracy:    0.6697892271662763
