In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from nltk.metrics import *

### Train data and Test data with Cleaning

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

STOPWORDS = set(stopwords.words('english_amz'))
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
LINKS =re.compile('(?i)(http(s)?[:/.a-z0-9]*)')
CHAR_OCC = re.compile(r'(.)\1{2,}')
#LINKS = re.compile("(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
def clean_text(text):
    text = text.lower() # lowercase text
    text = LINKS.sub('', text)
    text = CHAR_OCC.sub(r'\1',text) # replace more than 2 occurences of a same char like aaaaabbbbbbb to ab
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    text = re.sub("@[a-zA-Z]*", "", text) # remove usernames with @******
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = re.sub("\'", "", text) # remove backslash-apostrophe 
    text = re.sub("[^a-zA-Z]"," ",text) # remove everything except alphabets
    text = re.sub("( |^)[a-zA-Z][a-zA-Z]? "," ",text) # remove words with just 1 or 2 letters
    text = ' '.join(text.split()) # remove whitespaces 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text


df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

### Training svm model for TASK1

In [187]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm 
from sklearn.metrics import classification_report

In [188]:
# adding label column
df_train['label_t1'] = df_train['task1'].apply(lambda x: 0 if x=='NOT' else 1)
df_test['label_t1'] = df_test['task1'].apply(lambda x: 0 if x=='NOT' else 1)

In [189]:
X_train = df_train['text'].values
y_train = df_train['label_t1'].values

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')
X_train_cv = cv.fit_transform(X_train)

SVM_Task1 = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',probability=True)
SVM_Task1.fit(X_train_cv, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [190]:
predictions_SVM_Task1 = SVM_Task1.predict(cv.transform(df_test['text'].values))
df_test['svm-task1'] = predictions_SVM_Task1.tolist()
print(classification_report(df_test['label_t1'],predictions_SVM_Task1))

             precision    recall  f1-score   support

          0       0.89      0.93      0.91       370
          1       0.92      0.87      0.90       338

avg / total       0.90      0.90      0.90       708



In [191]:
## threshold based
svm_pred = SVM_Task1.predict_proba(cv.transform(df_test['text'].values))
threshold = 0.6
thresh = lambda x: 'HOF' if x[1]>threshold else 'NOT'
y_pred = [thresh(x) for x in svm_pred]
print(classification_report(df_test['task1'],y_pred))

             precision    recall  f1-score   support

        HOF       0.93      0.86      0.89       338
        NOT       0.88      0.94      0.91       370

avg / total       0.90      0.90      0.90       708



### Training with logistic regression

In [192]:
from sklearn.linear_model import LogisticRegression

In [224]:
X_train = df_train['text'].values
y_train = df_train['label_t1'].values

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')
X_train_cv = cv.fit_transform(X_train)

LR_Task1 = LogisticRegression()
LR_Task1.fit(X_train_cv, y_train)

predictions_LR_Task1 = LR_Task1.predict(cv.transform(df_test['text'].values))
df_test['lr-task1'] = predictions_LR_Task1.tolist()
print(classification_report(df_test['label_t1'],predictions_LR_Task1))

             precision    recall  f1-score   support

          0       0.89      0.95      0.92       370
          1       0.94      0.87      0.90       338

avg / total       0.91      0.91      0.91       708



### Loading rules with cleaning

In [194]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

stemmer = PorterStemmer() 
#ls = LancasterStemmer()

In [198]:
offense_words = set()
offense_words_stemmed = set()
with open('bad-words.txt') as f:
    for line in f:
        offense_words.add(clean_text(line.replace("\n",'')))
        offense_words_stemmed.add(stemmer.stem(clean_text(line.replace("\n",''))))
offense_words_stemmed = [x for x in offense_words_stemmed if x != '']
offense_words = [x for x in offense_words if x != '']

In [199]:
# def contains_offensive_words(text,count,stem):
#     text_set = set(word_tokenize(text))
#     text_set_stemmed = set([stemmer.stem(x) for x in word_tokenize(text)])
#     if stem:
#         matched_words = text_set_stemmed.intersection(offense_words_stemmed)
#     else:
#         matched_words = text_set.intersection(offense_words)
#     if len(matched_words) >= count:
#         return 'HOF'
#     else:
#         return 'NOT'

def distance_match(text_set,offence_set,distance):
    count = 0
    for x in text_set:
        if x != "":
            offense_words_matched = [y for y in offence_set if list(y)[0] == list(x)[0] ]
            distance_track = [edit_distance(x,y) for y in offense_words_matched]
            count += len([n for n in distance_track if n <= distance])
    return count
        
def contains_offensive_words(text,count,stem,distance):
    text_set = set(word_tokenize(text))
    text_set_stemmed = set([stemmer.stem(x) for x in word_tokenize(text)])
    if distance == 0:
        if stem:
            matched_words = text_set_stemmed.intersection(offense_words_stemmed)
        else:
            matched_words = text_set.intersection(offense_words)
        if len(matched_words) >= count:
            return 'HOF'
        else:
            return 'NOT'
    else:
        if stem:
            matched_words = text_set_stemmed.intersection(offense_words_stemmed)
            if len(matched_words) >= count:
                return 'HOF'
            else:
                match_count = distance_match(text_set_stemmed,offense_words_stemmed,distance)
        else:
            match_count = distance_match(text_set,offense_words,distance)
            
        if match_count >= count:
            return 'HOF'
        else:
            return 'NOT'

### Predicting only with rules 

In [205]:
y_rules_pred = df_test.apply(lambda row: contains_offensive_words(row.text,1,False,0), axis=1)
df_test['rules-task1'] = y_rules_pred.tolist()
print(classification_report(df_test['task1'], y_rules_pred))

             precision    recall  f1-score   support

        HOF       0.82      0.93      0.87       338
        NOT       0.92      0.82      0.87       370

avg / total       0.88      0.87      0.87       708



### Analysing where svm goes wrong

In [164]:
df_test[['text',]].loc[(df_test['label_t1']==0) & (df_test['svm-task1']==1) ].tolist()
#& (df_test['rules-task1']=='NOT')

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [24]:
#seems like external rules are not effecting results of svm

### Training svn for Task 2

In [25]:
# adding label column
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(df_train['task2'].values)

df_train['label_t2'] = le.transform(df_train['task2'].values)
df_test['label_t2'] = le.transform(df_test['task2'].values)

In [26]:
X_train = df_train['text'].values
y_train = df_train['label_t2'].values

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')
X_train_cv = cv.fit_transform(X_train)

SVM_Task2 = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',probability=True)
SVM_Task2.fit(X_train_cv, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
predictions_SVM_Task2 = SVM_Task2.predict_proba(cv.transform(df_test['text'].values))
df_test['svm-task2'] =(predictions_SVM_Task2.tolist())
print(classification_report(df_test['label_t2'],predictions_SVM_Task2))

In [146]:
# low score for 0 and 2 class because of low training instances

### Combining TASK 2 with TASK 1

In [147]:
# in Task 2 results of NONE are almost equivalent to that of on TASK 1,
# so combining both may not provide much improvement

### Combining Task 1 and rules

In [231]:
# getting naive bayes results
model_lr_results =LR_Task1.predict_proba(cv.transform(df_test['text'].values))
# getting rule based results
rule_HOF_results =df_test.apply(lambda row: contains_offensive_words(row.text,3,True,0), axis=1)
rule_NOT_results =df_test.apply(lambda row: contains_offensive_words(row.text,1,True,0), axis=1)

In [232]:
index = 0
threshold_NOT = 0.9
threshold_HOF = 0.9
final_lr_predictions = []

def compare(x,rule_HOF,rule_NOT):
    
    if rule_HOF == 'HOF':
        return 'HOF'
    elif rule_NOT == 'NOT':
        return 'NOT'
#     elif x[0]>threshold_NOT:
#         return 'NOT'
#     elif x[1]>threshold_HOF:
#         return 'HOF'
    else:
        if x[0]>x[1]:
            return 'NOT'
        else:
            return 'HOF'
#         return rule_HOF
        
        
for x in model_lr_results:
    final_lr_predictions.append(compare(x,rule_HOF_results[index],rule_NOT_results[index]))
    index += 1
    
print(classification_report(df_test['task1'], final_lr_predictions)) 

             precision    recall  f1-score   support

        HOF       0.94      0.88      0.91       338
        NOT       0.90      0.95      0.92       370

avg / total       0.92      0.92      0.92       708



In [233]:
df_test['text'].loc[(df_test['label_t1']==1) & (df_test['lr-task1']==0) ].tolist()

['full offense people make video games almost never understand whats enjoyable video games',
 'geterdone definitely judge mouth piece',
 'sick republicans endorsing positions simply appease media left media loves republicans th',
 'americans like excuse killing thousands children middle east draw line getting rid cl',
 'lilbittyya mfers sick asl thats',
 'brennan always looks like hes smelling bad fish hes bitter angry probably bec',
 'bossie devastating false narrative left wing media feed american people two years',
 'shadesofday ppl think nuts say want date marry like dont gotta get married fuckin tomorrow ev',
 'thobykov sick stairs',
 'good defence attorney would tell piss prosecutor',
 'rudy youre either lunatic senile',
 'look things like come not blame stupid handler account see ones retweet lo',
 'going cody nerd haha whats douche bag noel highschool remember guys',
 'drogon worst szn ever nigga brothers died mans mental health shambles',
 'vid pretty liar',
 'turtle youre dum

In [234]:
df_test

Unnamed: 0,tweet_id,text,task1,task2,ID,label_t1,svm-task1,lr-task1,rules-task1
0,1126918111926657024,watching trailer commented see theyre still us...,NOT,NONE,hasoc_2020_en_4930,0,0,0,NOT
1,1123797864319279109,still comparing taylor swift beyonc ever gonna...,NOT,NONE,hasoc_2020_en_1567,0,0,0,NOT
2,1123475024520912899,yall stick damn crying tweets ass honestly fee...,HOF,HATE,hasoc_2020_en_335,1,1,1,HOF
3,1127008771795030017,seems several gotham residents decided holiday...,NOT,NONE,hasoc_2020_en_4127,0,0,0,NOT
4,1123580729366011904,everything youre going right temporary hang th...,NOT,NONE,hasoc_2020_en_3552,0,0,0,NOT
5,1126767414740340736,thanks pic sir mark nabuhay ulit ang yellow he...,NOT,NONE,hasoc_2020_en_2167,0,0,0,NOT
6,1123504804075048960,shit dying,HOF,PRFN,hasoc_2020_en_5266,1,1,1,HOF
7,1123799890159722497,isabella tell,NOT,NONE,hasoc_2020_en_2573,0,0,0,NOT
8,1130090532452130818,theres not one time drink doesnt get spilled f...,NOT,NONE,hasoc_2020_en_231,0,0,0,NOT
9,1123826024867930112,youre going haters youre going lovers onebiglo...,NOT,NONE,hasoc_2020_en_4303,0,0,0,NOT
