In [1]:
import pandas as pd
import numpy as np

In [36]:
test = pd.read_csv('testtrolls.csv')
training = pd.read_csv('trainingtrolls.csv')

In [15]:
training.head()

Unnamed: 0,Insult,Date,Comment,Usage
0,0,,"""THE DRUDGE REPORT\\n\\n\\n\\nYou won't see th...",PublicTest
1,0,20120618222256Z,"""@ian21\xa0""Roger Clemens is the fucking man, ...",PublicTest
2,1,20120618213617Z,"""Agree with Alan you are an extremest idiot. ...",PublicTest
3,0,,"""Really?\\n\\nI see Marc Lamont Hill on variou...",PrivateTest
4,0,20120620003825Z,"""Really suck isn't the word, when many of our ...",PrivateTest


In [3]:
# parse 'some degree' of html text, like removing quotes, double \\ etc.
from bs4 import BeautifulSoup

training['Comment'] = training['Comment'].apply(lambda text: BeautifulSoup(text, 'html.parser'))

In [67]:
import re

EMAIL_REGEX = r'[\w\-\.\+]+\@[a-zA-Z0-9\.\-]+\.[a-zA-z0-9]{2,4}'
re.sub(EMAIL_REGEX, '_EM', "test anc@mail.com 123")

'test _EM 123'

In [92]:
URL_REGEX = r'http(s)?:\/\/\S+' # only works for ones prepended by http(s)://
re.sub(URL_REGEX, '_U', "test http://mail.com https://mail.com 123")

'test _U _U 123'

In [93]:
# format whitespaces, line breaks and quotes

line = " \" \n \\n test - _ \ '"

line = line.replace('"', ' ')
line = line.replace('_', ' ')
line = line.replace('-', ' ')
line = line.replace('\n', ' ')
line = line.replace('\\n', ' ')
line = line.replace('\'', ' ')
line = line.replace('\\', ' ')
line = re.sub(' +',' ', line)

line

' test '

In [137]:
# manage punctuation
# ([^!\?]) matches not ! or ?
# (\Z|[^!\?]) matches end of line or not (! or ?)
# r'\1 _BQ\n\3' replaces the middle matching part with _BQ
line = "... test what?!? you????  ... test aaaaaaagh abc.def help? me!"

line = re.sub(r'([^!\?])(\?{2,})(\Z|[^!\?])', r'\1 _BQ\n\3', line) # replace what??? to what _BQ\n
line = re.sub(r'([^\.])(\.{2,})', r'\1 _SS\n', line) # replace  uh... to uh _SS\n
line = re.sub(r'([^!\?])(\?|!){2,}(\Z|[^!\?])', r'\1 _BX\n\3', line) # replace what!? to what _BX\n 
line = re.sub(r'([^!\?])\?(\Z|[^!\?])', r'\1 _Q\n\2', line) # replace is it? with is it _Q\n
line = re.sub(r'([^!\?])!(\Z|[^!\?])', r'\1 _X\n\2', line) # replace great! with great _X\n
line = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2 _EL', line) # replace aaaaaaagh(at least 3 repeating letters) to aagh _EL
line = re.sub(r'(\w+)\.(\w+)', r'\1\2', line) # replace abc.def with abcdef

line

'... test what _BX\n you _BQ\n   _SS\n test aagh _EL abcdef help _Q\n me _X\n'

In [151]:
text = "hey #%**&$ you"

# swearing
text = re.sub(r'([#%&\*\$]{2,})(\w*)', r'\1\2 _SW', text) # add _SW after sequence of #%$&* (assuming ! and ? are already removed, and not including @)

text

'hey #%**&$ _SW you'

In [206]:
text = "hi x-}} => <3 =(( :("

# emotes
# note: (?:xxxx){n,} meaning any of which with count n+
# head: any of 8 x ; : =
# nose(optional): -
# mouth(happy): any of ) ] } >
# mouth(sad):   any of ( [ | \ / { < 

text = re.sub(r' [8x;:=]-?(?:\)|\}|\]|>){2,}', r' _BS', text) # with 2+ mouth symbols
text = re.sub(r' ([8x;:=]-?[\)\}\]|>])|(?:<3)', r' _S', text) # with 1 mouth OR heart symbol, ie.<3
text = re.sub(r' [8x;:=]-?(?:\(|\[|\||\\|/|\{|<){2,}', r' _BF', text) # wih 2+ mouth symbols
text = re.sub(r' [8x;:=]-?[\(\[\(|\[|\||\\|/|\{|<]', r' _F', text) # with 1 mouth

text

'hi _BS _S  _S _BF _F'

In [207]:
# remove number and percentages
line = "it is 100%"

line = re.sub('[1|2|3|4|5|6|7|8|9|0]', '', line)
line = re.sub('[%]', '', line)

line

'it is '

In [228]:
# split into phrases and words
line = "Hi; welcome \n hello. See this: amazing sentence (don't you agree) 100% wholesome&healthy! .  . "

phrases = re.split(r'[;:\.()\n]', line)
phrases = [re.findall(r'[\w%\*&#]+', ph) for ph in phrases] # select words (words may include symbols except ! and ?)
phrases = [ph for ph in phrases if ph] # remove empty arrays inside phrases; note: empty array is falsey

phrases

[['Hi'],
 ['welcome'],
 ['hello'],
 ['See', 'this'],
 ['amazing', 'sentence'],
 ['don', 't', 'you', 'agree'],
 ['100%', 'wholesome&healthy']]

In [226]:
# flatten phrases into a single list

words = []
[words.extend(ph) for ph in phrases]

words

['Hi',
 'welcome',
 'hello',
 'See',
 'this',
 'amazing',
 'sentence',
 'don',
 't',
 'you',
 'agree',
 '100%',
 'wholesome&healthy']

In [231]:
# stringing consecutive single letters together
words = ["h", "e", "l", "l", "o", "world"]

tmp = words
words = []
new_word = ''
for word in tmp:
    if len(word) == 1: # keep adding consecutive single letters to new_word until the next word is not a single letter
        new_word = new_word + word
    else:
        if new_word:
            words.append(new_word)
            new_word = ''
        words.append(word)
        
words 

['hello', 'world']

In [242]:
from nltk.corpus import stopwords, wordnet

words = re.findall(r'[\w%\*&#]+', "the rain in Madrid of Spain makes him a elated boy")

words = [w for w in words if not w in stopwords.words("english")]

words

['rain', 'Madrid', 'Spain', 'makes', 'elated', 'boy']

In [16]:
# ngram tagger of word's part of speech
import nltk
from nltk import NgramTagger
# nltk.download('brown')

# Backoff tagging
brown_a = nltk.corpus.brown.tagged_sents()
tagger = None
# backoff sets the fallback if fails to tag; 
# since this is inside a loop, 
# the 4-gram will fall back on the 3-gram; 
# 3 on 2 and so on, until 1-gram will fallback to None
for n in range(1,4):
    tagger = NgramTagger(n, brown_a, backoff = tagger) 

In [71]:
words = ['silly', 'dogs', 'happily', 'jumped', 'over', 'the', 'reddish', 'foxes']
words = tagger.tag(words)
words

[('silly', 'JJ'),
 ('dogs', 'NNS'),
 ('happily', 'RB'),
 ('jumped', 'VBD'),
 ('over', 'IN'),
 ('the', 'AT'),
 ('reddish', 'JJ'),
 ('foxes', None)]

In [80]:
from nltk import pos_tag
from nltk.corpus import wordnet
words = ['silly', 'dogs', 'happily', 'jumped', 'over', 'the', 'reddish', 'foxes']

words = pos_tag(words)

def get_wordnet_pos(treebank_tag):
    if treebank_tag == None:
        return None
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    

words = list(map(lambda word_pos: (word_pos[0], get_wordnet_pos(word_pos[1])), words))
words

[('silly', 'r'),
 ('dogs', 'n'),
 ('happily', 'r'),
 ('jumped', 'v'),
 ('over', None),
 ('the', None),
 ('reddish', 'a'),
 ('foxes', 'n')]

In [78]:
# stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ['silly', 'dogs', 'jumped', 'over', 'the', 'reddish', 'foxes']
list(map(lambda word: stemmer.stem(word), words))

['silli', 'dog', 'jump', 'over', 'the', 'reddish', 'fox']

In [81]:
# lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatized = []
for word, pos in words: 
    if pos:
        word = lemmatizer.lemmatize(word, pos = pos)
    lemmatized.append(word)
        
lemmatized

['silly', 'dog', 'happily', 'jump', 'over', 'the', 'reddish', 'fox']

In [44]:
# together

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

def clean_line(line):
#     line = str(BeautifulSoup(str(line), "html.parser"))
    line = BeautifulSoup(line).get_text()
    line = line.lower()

    EMAIL_REGEX = r'[\w\-\.\+]+\@[a-zA-Z0-9\.\-]+\.[a-zA-z0-9]{2,4}'
    line = re.sub(EMAIL_REGEX, '_EM', line)

    URL_REGEX = r'http(s)?:\/\/\S+' # only works for ones prepended by http(s)://
    line = re.sub(URL_REGEX, '_U', line)

    # format whitespaces, line breaks and quotes
    line = line.replace('"', ' ')
    line = line.replace('_', ' ')
    line = line.replace('-', ' ')
    line = line.replace('\n', ' ')
    line = line.replace('\\n', ' ')
    line = line.replace('\'', ' ')
    line = re.sub(' +',' ', line)
    
    print(line)
    # manage punctuation
    line = re.sub(r'([^!\?])(\?{2,})(\Z|[^!\?])', r'\1 _BQ\n\3', line) # replace what??? to what _BQ\n
    line = re.sub(r'([^\.])(\.{2,})', r'\1 _SS\n', line) # replace  uh... to uh _SS\n
    line = re.sub(r'([^!\?])(\?|!){2,}(\Z|[^!\?])', r'\1 _BX\n\3', line) # replace what!? to what _BX\n 
    line = re.sub(r'([^!\?])\?(\Z|[^!\?])', r'\1 _Q\n\2', line) # replace is it? with is it _Q\n
    line = re.sub(r'([^!\?])!(\Z|[^!\?])', r'\1 _X\n\2', line) # replace great! with great _X\n
    line = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2 _EL', line) # replace aaaaaaagh(at least 3 repeating letters) to aagh _EL
    line = re.sub(r'(\w+)\.(\w+)', r'\1\2', line) # replace abc.def with abcdef

    # swearing
    line = re.sub(r'([#%&\*\$]{2,})(\w*)', r'\1\2 _SW', line) # add _SW after sequence of #%$&* (assuming ! and ? are already removed, and not including @)

    # emotes
    # note: (?:xxxx){n,} meaning any of which with count n+
    # head: any of 8 x ; : =
    # nose(optional): -
    # mouth(happy): any of ) ] } >
    # mouth(sad):   any of ( [ | \ / { < 
    line = re.sub(r' [8x;:=]-?(?:\)|\}|\]|>){2,}', r' _BS', line) # with 2+ mouth symbols
    line = re.sub(r' ([8x;:=]-?[\)\}\]|>])|(?:<3)', r' _S', line) # with 1 mouth OR heart symbol, ie.<3
    line = re.sub(r' [8x;:=]-?(?:\(|\[|\||\\|/|\{|<){2,}', r' _BF', line) # wih 2+ mouth symbols
    line = re.sub(r' [8x;:=]-?[\(\[\(|\[|\||\\|/|\{|<]', r' _F', line) # with 1 mouth

    # remove number and percentages; and '\'
    line = re.sub('[1|2|3|4|5|6|7|8|9|0]', '', line)
    line = re.sub('[%]', '', line)
    line = line.replace('\\', ' ')

    # split into phrases and words
    phrases = re.split(r'[;:\.()\n]', line)
    phrases = [re.findall(r'[\w%\*&#]+', ph) for ph in phrases] # select words (words may include symbols except ! and ?)
    phrases = [ph for ph in phrases if ph] # remove empty arrays inside phrases; note: empty array is falsey

    # flatten phrases into a single list
    words = []
    [words.extend(ph) for ph in phrases]

    # stringing consecutive single letters together
    tmp = words
    words = []
    new_word = ''
    for word in tmp:
        if len(word) == 1: # keep adding consecutive single letters to new_word until the next word is not a single letter
            new_word = new_word + word
        else:
            if new_word:
                words.append(new_word)
                new_word = ''
            words.append(word)

    # remove common words, defined in stopwords by NLTK
    words = [w for w in words if not w in stopwords.words("english")]
    
    # tag part-of-speech
    words = pos_tag(words)
    words = list(map(lambda word_pos: (word_pos[0], get_wordnet_pos(word_pos[1])), words))

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for word, pos in words: 
        if pos:
            word = lemmatizer.lemmatize(word, pos = pos)
        lemmatized.append(word)
    
    return lemmatized

def get_wordnet_pos(treebank_tag):
    if treebank_tag == None:
        return None
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [45]:
clean_line("'<b>I am 100% sure that this is a scam!!!!</b> ;('")

 i am 100% sure that this is a scam!!!! ;( 


['sure', 'scam', '_BX', '_F']

In [27]:
clean_line("the dogs lazily lay on the green fields :)")

the dogs lazily lay on the green fields :)


['dog', 'lazily', 'lay', 'green', 'field', '_S']

In [None]:
train_x = training['Comment'].apply(clean_line)
test_x = test['Comment'].apply(clean_line)

In [47]:
print(train_x[0:5], test_x[0:5])

0    [drudge, report, see, story, foxfag, forum, su...
1    [ian, xa, roger, clemens, fuck, man, never, fu...
2    [agree, alan, extreme, idiot, american, native...
3    [really, _Q, see, marc, lamont, hill, various,...
4    [really, suck, word, many, nuclear, power, pla...
Name: Comment, dtype: object 0    [gallup, daily, may, u, update, daily, pm, et,...
1    [someone, whose, self, importance, get, best, ...
2                                       [stand, porch]
3             [camp, get, come, get, guy, earn, money]
4    [could, wrong, american, tea, party, stand, li...
Name: Comment, dtype: object


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# stop words are already removed
# swds = stopwords.words('english')
vect = TfidfVectorizer(
    analyzer = "word",
    ngram_range = (1,3), 
    min_df = 0, 
    stop_words = None, #swds, 
    max_features=5000
)

docs_new = [" ".join(x) for x in train_x]
tf = vect.fit_transform(docs_new).toarray() # create bag of words model

In [49]:
np.savetxt('train_x.csv', tf, delimiter=',')

In [50]:
docs_new = [" ".join(x) for x in test_x]
tf = vect.fit_transform(docs_new).toarray() # create bag of words model
np.savetxt('test_x.csv', tf, delimiter=',')

In [None]:
# training

In [2]:
train_x = np.genfromtxt('train_x.csv', delimiter = ',')
test_x = np.genfromtxt('test_x.csv', delimiter = ',')
# for y 
training = pd.read_csv('trainingtrolls.csv')
test = pd.read_csv('testtrolls.csv')
train_y = training["Insult"]
test_y = test["Insult"]

In [3]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 2000, max_depth = 5, max_features = 1000)

rf.fit(train_x, train_y)
y_submission = rf.predict(test_x)

In [4]:
# y_submission_norm = (y_submission - y_submission.min())/(y_submission.max() - y_submission.min())
y_submission_bool = (y_submission >= 0.5).astype(int)

In [5]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_submission_bool, test_y)
roc_auc = auc(fpr, tpr)
print("Random Forest benchmark AUC, 1000 estimators")
print(roc_auc)

Random Forest benchmark AUC, 1000 estimators
0.7737747391969321


In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_test = train_y
y_pred = (rf.predict(train_x) >= 0.5).astype(int)
print('Training')
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))

print("\n")

y_test = test_y
y_pred = (rf.predict(test_x) >= 0.5).astype(int)
print('Testing')
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))



Training
Precision: 0.926
Recall: 0.254
F1: 0.399


Testing
Precision: 0.803
Recall: 0.057
F1: 0.106


In [7]:
print("Random Forest benchmark score, 1000 estimators")
print(rf.score(test_x, test_y))
# very bad correlation, most likely because of the low recall

Random Forest benchmark score, 1000 estimators
0.012422375995523207


In [13]:
# using SVC as classifier
from sklearn.svm import LinearSVC
clf = LinearSVC(C=1000, max_iter=50000)
clf.fit(train_x, train_y)
y_submission = clf.predict(test_x)

In [15]:
# y_submission_norm = (y_submission - y_submission.min())/(y_submission.max() - y_submission.min())
y_submission_bool = (y_submission >= 0.5).astype(int)


from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_submission_bool, test_y)
roc_auc = auc(fpr, tpr)
print("Random Forest benchmark AUC, 1000 estimators")
print(roc_auc)

from sklearn.metrics import precision_score, recall_score, f1_score

y_test = train_y
y_pred = (rf.predict(train_x) >= 0.5).astype(int)
print('Training')
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))

print("\n")

y_test = test_y
y_pred = (rf.predict(test_x) >= 0.5).astype(int)
print('Testing')
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))

print("Random Forest benchmark score, 1000 estimators")
print(rf.score(test_x, test_y))
# very bad correlation, most likely because of the low recall


Random Forest benchmark AUC, 1000 estimators
0.5134523561892516
Training
Precision: 0.926
Recall: 0.254
F1: 0.399


Testing
Precision: 0.803
Recall: 0.057
F1: 0.106
Random Forest benchmark score, 1000 estimators
0.012422375995523207
