## Handle POS tags using nltk and convert to features via TF-IDF

In [1]:
import pandas as pd

In [2]:
raw_data = pd.read_csv('labeled_data.csv')
raw_data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


### Penn Part-Of-Speech (POS) tagging

In [3]:
import nltk
from nltk.tokenize import word_tokenize

In [4]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anney\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anney\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Anney\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [5]:
def convert_sentence_to_tagged_unigram(sentence):
    #returns a list of tagged words
    text = word_tokenize(sentence)
    text_tags = nltk.pos_tag(text)
    return text_tags

In [6]:
convert_sentence_to_tagged_unigram("Package punkt is already up-to-date!")

[('Package', 'NN'),
 ('punkt', 'NN'),
 ('is', 'VBZ'),
 ('already', 'RB'),
 ('up-to-date', 'JJ'),
 ('!', '.')]

In [10]:
def reconstruct_sentence_to_tags(sentence):
    #returns a "reconstructed sentence" where each token is replaced by its Penn POS tag
    text = word_tokenize(sentence)
    word_tag_pairs = nltk.pos_tag(text)
    tags = []
    for word, tag in word_tag_pairs:
        tags.append(tag)
    reconstructed_sentence = " ".join(tags)
    return reconstructed_sentence

In [8]:
reconstruct_sentence_to_tags("Package punkt is already up-to-date!")

'NN NN VBZ RB JJ .'

In [9]:
#Append POS-tag-strings to original data
pos_tag_sentences = raw_data.apply(lambda row: reconstruct_sentence_to_tags(row.tweet), axis=1)
raw_data = raw_data.assign(pos_tags=pos_tag_sentences.values)
raw_data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,pos_tags
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,. . . NNP NNP RB : IN DT NN PRP MD RB VB IN VB...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,. . . . . NNP JJ NN : NN NNS VBP : NN NN JJ IN...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,. . . . . . . NNP NNP NNP NNP . . . . NNP VBD ...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,. . . . . . . . . NNP NNP NNP : NN VBD PRP VBP...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,. . . . . . . . . . . . . NNP JJ NNS : DT NN P...
...,...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,PRP VBZ DT NN NNP NNP NNP IN NNP CC # CD : CC ...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...","PRP VBP VBN CC VBD DT JJ NN NN , CC VB PRP VB JJ"
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,JJ NN WP VBZ NN . . JJ JJ NN IN PRP VBP JJ NN ...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies,NN VBD JJ NNS VBP PRP NNS


In [11]:
pos_vectorizer = TfidfVectorizer(max_features=5000,min_df=5,max_df=0.75) #ref: https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/src/Automated%20Hate%20Speech%20Detection%20and%20the%20Problem%20of%20Offensive%20Language%20Python%203.6.ipynb

pos = pos_vectorizer.fit_transform(pd.Series(raw_data.post_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

NameError: name 'TfidfVectorizer' is not defined