In [145]:
import pandas as pd

In [146]:
dataframe = pd.read_json("controversial-comments.jsonl", lines=True)

In [147]:
dataframe = dataframe[:50]

In [148]:
dataframe.head()

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...


a) Convert all text to lowercase

In [149]:
dataframe['txt'] = dataframe['txt'].str.lower()

In [150]:
dataframe.head()

Unnamed: 0,con,txt
0,0,well it's great that he did something about th...
1,0,you are right mr. president.
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...


b) Remove all punctuation

In [151]:
dataframe.txt =  dataframe['txt'].str.replace(r'[^\w\s]', '')

In [152]:
dataframe.head()

Unnamed: 0,con,txt
0,0,well its great that he did something about tho...
1,0,you are right mr president
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...


c) Remove stop words

In [153]:
from nltk.corpus import stopwords

In [154]:
#import nltk

In [155]:
#nltk.download("stopwords")

In [156]:
from nltk.tokenize import word_tokenize

In [157]:
#nltk.download('punkt')

In [158]:
tokenized_words = dataframe['txt'].apply(word_tokenize)

In [159]:
stop_words = stopwords.words('english')

In [160]:
tokenized_words = tokenized_words.apply(lambda tokenized_words: [item for item in tokenized_words if item not in stop_words])

In [161]:
dataframe['txt'] = tokenized_words

In [162]:
tokenized_words

0     [well, great, something, beliefs, office, doub...
1                                [right, mr, president]
2     [given, input, apart, saying, wrong, argument,...
3     [get, frustration, reason, want, way, foundati...
4     [far, expert, tpp, would, tend, agree, lot, pr...
5     [thanks, playing, feel, like, nowhttpsthenypos...
6                                             [deleted]
7     [cant, racist, black, friend, lololol, vast, m...
8     [nope, youre, right, smoke, smoke, bad, lungs,...
9     [ltthats, exactly, means, especially, power, t...
10    [us, isnt, canada, uk, current, legal, mechani...
11    [meaningless, words, keep, fire, contained, he...
12    [obama, declare, dictator, life, honestly, wou...
13    [classic, case, us, government, department, in...
14    [community, organizer, supported, redistributi...
15                         [stop, crying, unattractive]
16    [believe, good, time, invoke, httpswwwredditco...
17       [explain, death, threats, obama, got, e

d) Apply NLTK’s PorterStemmer.

In [163]:
from nltk.stem.porter import PorterStemmer

In [164]:
porter = PorterStemmer()

In [165]:
def stem_list(row): 
    my_list = row['txt'] 
    stemmed_list = [porter.stem(word) for word in my_list] 
    return (stemmed_list)
dataframe['txt'] = dataframe.apply(stem_list, axis=1)

In [166]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [167]:
dataframe['txt'] = dataframe['txt'].apply(lambda x:' '.join(x))

In [168]:
text = np.array(dataframe['txt'])

In [169]:
count = CountVectorizer()

In [170]:
print(text)

['well great someth belief offic doubt trump would fight un im realli realli happi obama someth couldoh wait'
 'right mr presid' 'given input apart say wrong argument clearli'
 'get frustrat reason want way foundat complex problem advanc grade get decent grade sat type test math dont realli understand lot mathemat way get right answer lot time figur common sens work around lot question would ill prepar take colleg level math cours despit averag math score theyr tri bust kid ball'
 'far expert tpp would tend agre lot problem understand obama push creat econom bulwark china pacif administr recogn increas strength matur bellicos see south china sea recogn us alli push increas penetr mani emerg market would otherwis natur tend align china alway thought curiou mani critiqu tpp hardli saw mention obama would push hardli track record someon tri railroad worker corpor interest must someth felt huge import use much polit capit someth like tpp mayb tpp could better certainli seem yet mayb best o

In [171]:
bag_of_words = count.fit_transform(text)


In [172]:
bag_of_words


<50x633 sparse matrix of type '<class 'numpy.longlong'>'
	with 853 stored elements in Compressed Sparse Row format>

In [173]:
bag_of_words.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [174]:
text

array(['well great someth belief offic doubt trump would fight un im realli realli happi obama someth couldoh wait',
       'right mr presid', 'given input apart say wrong argument clearli',
       'get frustrat reason want way foundat complex problem advanc grade get decent grade sat type test math dont realli understand lot mathemat way get right answer lot time figur common sens work around lot question would ill prepar take colleg level math cours despit averag math score theyr tri bust kid ball',
       'far expert tpp would tend agre lot problem understand obama push creat econom bulwark china pacif administr recogn increas strength matur bellicos see south china sea recogn us alli push increas penetr mani emerg market would otherwis natur tend align china alway thought curiou mani critiqu tpp hardli saw mention obama would push hardli track record someon tri railroad worker corpor interest must someth felt huge import use much polit capit someth like tpp mayb tpp could better ce

In [175]:
count.get_feature_names()


['100',
 '19th',
 '2008',
 '247',
 'abl',
 'abus',
 'account',
 'acquiesc',
 'action',
 'administr',
 'advanc',
 'agre',
 'ahead',
 'aid',
 'air',
 'align',
 'alli',
 'allow',
 'almost',
 'alreadi',
 'also',
 'alway',
 'american',
 'answer',
 'antitrump',
 'anybodi',
 'anyth',
 'apart',
 'appear',
 'appointe',
 'arehav',
 'arent',
 'argument',
 'arnold',
 'around',
 'ascend',
 'ask',
 'ass',
 'assembl',
 'associ',
 'attempt',
 'attitud',
 'authoritarian',
 'automat',
 'averag',
 'away',
 'awaygt',
 'back',
 'bad',
 'ball',
 'barack',
 'begin',
 'behav',
 'behavior',
 'behind',
 'belief',
 'believ',
 'bellicos',
 'benefit',
 'best',
 'better',
 'big',
 'bin',
 'black',
 'blitzer',
 'boo',
 'bot',
 'brain',
 'britain',
 'broader',
 'bulwark',
 'bust',
 'cabinet',
 'california',
 'call',
 'came',
 'camp',
 'campus',
 'canada',
 'candidaci',
 'cant',
 'capit',
 'carcinogen',
 'care',
 'case',
 'centuri',
 'certainli',
 'characterist',
 'cheer',
 'chees',
 'china',
 'citigroup',
 'class',
 

In [176]:
from nltk import pos_tag
from nltk import word_tokenize

In [177]:
text_tags = []

In [178]:
import nltk

In [179]:
#nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hedyeherfani/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [180]:
for comments in text:
    tag_text = nltk.pos_tag(word_tokenize(comments))
    text_tags.append([tag for word, tag in tag_text])

In [181]:
print(text_tags)

[['RB', 'JJ', 'JJ', 'NN', 'JJ', 'NN', 'NN', 'MD', 'VB', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'VBZ', 'JJ', 'NN'], ['JJ', 'NN', 'NN'], ['VBN', 'JJ', 'RB', 'VBP', 'JJ', 'NN', 'NN'], ['VB', 'JJ', 'NN', 'VBP', 'NN', 'NN', 'NN', 'NN', 'NN', 'VBD', 'VB', 'JJ', 'NN', 'VBD', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'FW', 'NN', 'VB', 'JJ', 'NN', 'NN', 'NN', 'JJ', 'JJ', 'NNS', 'VBP', 'RB', 'JJ', 'NN', 'MD', 'VB', 'JJ', 'VB', 'JJ', 'NN', 'NN', 'NNS', 'VBP', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'VB', 'NN'], ['RB', 'JJ', 'NN', 'MD', 'VB', 'NN', 'NN', 'NN', 'VBP', 'NN', 'NN', 'NN', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'JJ', 'NN', 'NN', 'NN', 'VBP', 'JJ', 'JJ', 'NN', 'VBP', 'PRP', 'JJ', 'NN', 'NNS', 'VBP', 'JJ', 'NN', 'NN', 'MD', 'VB', 'JJ', 'JJ', 'NN', 'VBZ', 'RB', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'VBD', 'NN', 'NN', 'MD', 'VB', 'JJ', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'MD', 'VB', 'VBD', 'JJ', 'NN', 'NN', 'JJ', 'NN', 'NN', 'VBP', 'IN', 'NN', 'NN', 'NN', 'MD', 'VB', 'NN', 'VBP', 'RB', 'VB

In [182]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(text_tags)

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0]])

In [183]:
dataframe.columns

Index(['con', 'txt'], dtype='object')

In [184]:
nltk.pos_tag(text)


[('well great someth belief offic doubt trump would fight un im realli realli happi obama someth couldoh wait',
  'NN'),
 ('right mr presid', 'NN'),
 ('given input apart say wrong argument clearli', 'NN'),
 ('get frustrat reason want way foundat complex problem advanc grade get decent grade sat type test math dont realli understand lot mathemat way get right answer lot time figur common sens work around lot question would ill prepar take colleg level math cours despit averag math score theyr tri bust kid ball',
  'NN'),
 ('far expert tpp would tend agre lot problem understand obama push creat econom bulwark china pacif administr recogn increas strength matur bellicos see south china sea recogn us alli push increas penetr mani emerg market would otherwis natur tend align china alway thought curiou mani critiqu tpp hardli saw mention obama would push hardli track record someon tri railroad worker corpor interest must someth felt huge import use much polit capit someth like tpp mayb tpp c

In [185]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [186]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text)

In [187]:
feature_matrix

<50x633 sparse matrix of type '<class 'numpy.float64'>'
	with 853 stored elements in Compressed Sparse Row format>

In [188]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.16659322, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [189]:
tfidf.vocabulary_


{'well': 611,
 'great': 246,
 'someth': 525,
 'belief': 55,
 'offic': 403,
 'doubt': 164,
 'trump': 586,
 'would': 624,
 'fight': 214,
 'un': 591,
 'im': 287,
 'realli': 465,
 'happi': 261,
 'obama': 402,
 'couldoh': 122,
 'wait': 606,
 'right': 487,
 'mr': 383,
 'presid': 443,
 'given': 237,
 'input': 296,
 'apart': 27,
 'say': 499,
 'wrong': 626,
 'argument': 32,
 'clearli': 95,
 'get': 235,
 'frustrat': 229,
 'reason': 466,
 'want': 607,
 'way': 608,
 'foundat': 225,
 'complex': 105,
 'problem': 449,
 'advanc': 10,
 'grade': 245,
 'decent': 137,
 'sat': 496,
 'type': 588,
 'test': 560,
 'math': 358,
 'dont': 163,
 'understand': 594,
 'lot': 343,
 'mathemat': 359,
 'answer': 23,
 'time': 575,
 'figur': 215,
 'common': 101,
 'sens': 508,
 'work': 621,
 'around': 34,
 'question': 457,
 'ill': 286,
 'prepar': 442,
 'take': 551,
 'colleg': 99,
 'level': 328,
 'cours': 123,
 'despit': 146,
 'averag': 44,
 'score': 502,
 'theyr': 567,
 'tri': 581,
 'bust': 71,
 'kid': 314,
 'ball': 49,
 'f

a) Word count vector can be helpful in a spam model. If we are building a model to detect spam, we would use multiple emails as reference. If certain keywords are continuously seen in emails we assume to be spam, the model can then start filtering emails as being spam based on the presence of these keywords. For example, if we input thousands of spam emails and most of them include the word "prince", the model then can learn that the presence of the word "prince" might entail a future email to be spam.
b) Part of speech tag might be helpful for a program that grammatically checks text. If the program notes that a sentence is lacking the presence of "NN" (noun), it could flag that sentence as being grammatically incorrect. 
c) Term frequency-inverse document frequency could be used to look at sentiment. If a survery were to be sent out to a large portion of the poulation, to get their viewpoint on a concept, we could use this to see what the sentiment tends to be. If the survery were regarding a movie, and the most frequent word was "good", then we could understand that the general consensus towards the movie is positive. 