In [119]:
import csv
import nltk
import re
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score


# Dataset locations, all relative to the notebook's working directory.
DATA_PATH = "../data/"
TRAIN_DATA = f"{DATA_PATH}train_en.tsv"  # file used to fit the models
TEST_DATA = f"{DATA_PATH}dev_en.tsv"  # file used for evaluation

In [120]:
def readData(path):
    """Read a tab-separated file and return all of its rows.

    Parameters
    ----------
    path : str
        Location of the TSV file to read.

    Returns
    -------
    list of list of str
        One list of string fields per row, in file order. Every row the
        csv reader yields is returned, including a header row if present.
    """
    # newline='' hands line-ending handling to the csv module (required by its
    # documentation so that line breaks inside quoted fields survive intact);
    # the explicit encoding avoids depending on the platform default.
    with open(path, 'r', newline='', encoding='utf-8') as file:
        return list(csv.reader(file, delimiter='\t'))

def getTweets(raw):
    """Return column 1 of every row whose column 2 is the string '1'.

    Rows are expected to be indexable sequences of strings (as produced by
    ``readData``); presumably column 1 is the tweet text -- confirm against
    the dataset header. The result is a NumPy array in input order.
    """
    selected = []
    for row in raw:
        if row[2] == '1':
            selected.append(row[1])
    return np.array(selected)

def getTarget(raw):
    """Return column 3 of every row whose column 2 is the string '1'.

    The values are kept as the strings found in the rows and returned as a
    NumPy array in input order.
    """
    kept_rows = filter(lambda row: row[2] == '1', raw)
    labels = [row[3] for row in kept_rows]
    return np.array(labels)

def getAggression(raw):
    """Return column 4 of every row whose column 2 is the string '1'.

    Unlike ``getTweets`` and ``getTarget`` the result is a plain Python
    list (not a NumPy array), in input order.
    """
    labels = []
    for row in raw:
        if row[2] != '1':
            continue
        labels.append(row[4])
    return labels

def removePattern(tweet, pattern):
    """Delete every match of the regular expression ``pattern`` from ``tweet``.

    Parameters
    ----------
    tweet : str
        Text to clean.
    pattern : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``tweet`` with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex (the previous approach) mangled text when one match was a
    # prefix of another ("@ab @abc" -> " c") and misread regex metacharacters
    # contained in the matched text.
    return re.sub(pattern, '', tweet)

def preprocess(data):
    """Normalise an iterable of tweets for bag-of-words feature extraction.

    For each tweet, in order:
      1. ``@mentions`` are removed;
      2. ``#`` characters are dropped (the hashtag word itself is kept);
      3. URLs are replaced by the placeholder ``<URL>``;
      4. every other non-letter character becomes a space;
      5. runs of spaces are collapsed to a single space;
      6. the text is lower-cased (so the placeholder ends up as ``<url>``).

    Parameters
    ----------
    data : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        Cleaned tweets, in input order. Leading/trailing spaces are not
        stripped.
    """
    mention_re = re.compile(r"@\w*")
    # Same character set as before; the stray HTML entity "&amp;" inside the
    # character class has been reduced to the "&" it stood for.
    url_re = re.compile(
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+')
    non_letter_re = re.compile(r'[^a-zA-Z]')
    spaces_re = re.compile(" +")

    cleanData = []
    for tweet in data:
        tweet = mention_re.sub("", tweet)
        tweet = tweet.replace("#", "")  # keep the hashtag word, drop the marker
        # URLs must be located BEFORE punctuation is stripped, otherwise
        # "http://..." has already been broken into words and never matches.
        # Only the text between URLs is scrubbed, so the placeholder survives.
        segments = url_re.split(tweet)
        tweet = "<URL>".join(non_letter_re.sub(" ", part) for part in segments)
        tweet = spaces_re.sub(" ", tweet)
        cleanData.append(tweet.lower())
    return cleanData

# One shared tokenizer instance; `tokenize` is an instance method, so calling
# it on the class itself (TweetTokenizer.tokenize(text)) raises TypeError.
_TWEET_TOKENIZER = TweetTokenizer()


def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``TweetTokenizer``.

    Parameters
    ----------
    text : str
        A single tweet.

    Returns
    -------
    list of str
        The tokens produced by ``TweetTokenizer().tokenize``.
    """
    return _TWEET_TOKENIZER.tokenize(text)

def evaluate(target, predicted):
    """Print three classification metrics for ``predicted`` against ``target``.

    Reports the weighted-average F1 score, the macro-averaged recall and the
    accuracy, one per line. Nothing is returned.
    """
    weighted_f1 = f1_score(target, predicted, average='weighted')
    accuracy = accuracy_score(target, predicted)
    macro_recall = recall_score(target, predicted, average='macro')

    report = (
        ("F1 score:   ", weighted_f1),
        ("Avg Recall: ", macro_recall),
        ("Accuracy:   ", accuracy),
    )
    for label, value in report:
        print(label, value)

In [121]:
# English stop-word list from NLTK, later handed to the vectorizer.
en_stopwords = set(stopwords.words("english"))

# --- Training split ---------------------------------------------------------
raw_train = readData(TRAIN_DATA)
train_tweets = getTweets(raw_train)
aggression_classes_train = getAggression(raw_train)
target_classes_train = getTarget(raw_train)
# Cleaned copy of the training tweets. The input is `train_tweets` (the old
# `preprocess(tweets)` read a name that does not exist on a fresh kernel).
# NOTE(review): the feature-extraction cell below still consumes the raw
# `train_tweets`; switch it to `tweets` (and clean the test tweets the same
# way) if the models are meant to be trained on preprocessed text.
tweets = preprocess(train_tweets)
X_train = train_tweets
Y_train_target = target_classes_train
Y_train_aggression = aggression_classes_train

# --- Evaluation split -------------------------------------------------------
raw_test = readData(TEST_DATA)
test_tweets = getTweets(raw_test)
Y_test_aggression = getAggression(raw_test)
Y_test_target = getTarget(raw_test)



In [122]:
# Sanity check: tweets and both label collections should have the same length.
for collection in (train_tweets, Y_train_aggression, Y_train_target):
    print(len(collection))

3783
3783
3783


In [123]:
# Unigram bag-of-words features with English stop words removed.
# `stop_words` is passed as a list: scikit-learn documents the parameter as
# 'english', a list, or None, and recent releases reject other containers
# such as a set. Sorting only makes the argument deterministic; the set of
# filtered words is unchanged.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=sorted(en_stopwords))
# Learn the vocabulary on the training tweets only, then reuse it for the
# evaluation tweets so both matrices share the same columns.
train_features = vectorizer.fit_transform(train_tweets)
test_features = vectorizer.transform(test_tweets)

In [124]:
# Logistic regression with strong-ish L2 regularisation (C=0.1).
# The 'sag' solver shuffles the data, so a fixed random_state is needed for
# the metrics printed below to be reproducible from run to run.
classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)

In [114]:
# Alternative model kept for experimentation. It is bound to its own name so
# that running the notebook top-to-bottom does not silently replace the
# LogisticRegression `classifier` used by the fit/evaluate cells below (this
# cell's execution count, 114, predates the cells around it).
# To try the SVM instead, assign: classifier = svc_classifier
svc_classifier = SVC(C=0.1)

In [125]:
# Train on the target labels, predict on the evaluation features, and report.
# (scikit-learn's fit() returns the estimator itself, so the calls can chain.)
y_predict = classifier.fit(train_features, Y_train_target).predict(test_features)
evaluate(Y_test_target, y_predict)

F1 score:    0.8966860617400402
Avg Recall:  0.8985774499473129
Accuracy:    0.8969555035128806


In [126]:
# Re-fit the same estimator on the aggression labels (this discards the model
# fitted on the target labels), then predict and report on the evaluation set.
# (scikit-learn's fit() returns the estimator itself, so the calls can chain.)
y_predict = classifier.fit(train_features, Y_train_aggression).predict(test_features)
evaluate(Y_test_aggression, y_predict)

F1 score:    0.6605751244372501
Avg Recall:  0.6593796711509716
Accuracy:    0.6627634660421545
