# Loading data

In [0]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
rawData = pd.read_csv("comments.csv")
rawData.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well ...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyon...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


# Pre-processing text data

In [0]:
data = rawData[["target", "comment_text"]]
data.head()

Unnamed: 0,target,comment_text
0,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well ..."
1,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyon..."
2,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!
3,0.0,Is this something I'll be able to install on my site? When will you be releasing it?
4,0.893617,haha you guys are a bunch of losers.


## 1. remove punctuation

In [0]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
import string
string.punctuation

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data['comment_text_clean'] = data['comment_text'].apply(lambda x: remove_punct(x))

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,target,comment_text,comment_text_clean
0,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well ...",This is so cool Its like would you want your mother to read this Really great idea well done
1,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyon...",Thank you This would make my life a lot less anxietyinducing Keep it up and dont let anyone get ...
2,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,This is such an urgent design problem kudos to you for taking it on Very impressive
3,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,Is this something Ill be able to install on my site When will you be releasing it
4,0.893617,haha you guys are a bunch of losers.,haha you guys are a bunch of losers


## 2. Tokenization

In [0]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens
data['comment_text_tokenized'] = data['comment_text_clean'].apply(lambda x: tokenize(x.lower()))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,target,comment_text,comment_text_clean,comment_text_tokenized
0,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well ...",This is so cool Its like would you want your mother to read this Really great idea well done,"[this, is, so, cool, its, like, would, you, want, your, mother, to, read, this, really, great, i..."
1,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyon...",Thank you This would make my life a lot less anxietyinducing Keep it up and dont let anyone get ...,"[thank, you, this, would, make, my, life, a, lot, less, anxietyinducing, keep, it, up, and, dont..."
2,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,This is such an urgent design problem kudos to you for taking it on Very impressive,"[this, is, such, an, urgent, design, problem, kudos, to, you, for, taking, it, on, very, impress..."
3,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,Is this something Ill be able to install on my site When will you be releasing it,"[is, this, something, ill, be, able, to, install, on, my, site, when, will, you, be, releasing, it]"
4,0.893617,haha you guys are a bunch of losers.,haha you guys are a bunch of losers,"[haha, you, guys, are, a, bunch, of, losers]"


## 3. Remove stopwords

In [0]:
import nltk
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')
stopword[0:100]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once']

In [0]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text
data['comment_text_nostop'] = data['comment_text_tokenized'].apply(lambda x: remove_stopwords(x))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,target,comment_text,comment_text_clean,comment_text_tokenized,comment_text_nostop
0,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well ...",This is so cool Its like would you want your mother to read this Really great idea well done,"[this, is, so, cool, its, like, would, you, want, your, mother, to, read, this, really, great, i...","[cool, like, would, want, mother, read, really, great, idea, well, done]"
1,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyon...",Thank you This would make my life a lot less anxietyinducing Keep it up and dont let anyone get ...,"[thank, you, this, would, make, my, life, a, lot, less, anxietyinducing, keep, it, up, and, dont...","[thank, would, make, life, lot, less, anxietyinducing, keep, dont, let, anyone, get, way]"
2,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,This is such an urgent design problem kudos to you for taking it on Very impressive,"[this, is, such, an, urgent, design, problem, kudos, to, you, for, taking, it, on, very, impress...","[urgent, design, problem, kudos, taking, impressive]"
3,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,Is this something Ill be able to install on my site When will you be releasing it,"[is, this, something, ill, be, able, to, install, on, my, site, when, will, you, be, releasing, it]","[something, ill, able, install, site, releasing]"
4,0.893617,haha you guys are a bunch of losers.,haha you guys are a bunch of losers,"[haha, you, guys, are, a, bunch, of, losers]","[haha, guys, bunch, losers]"


## 4. _Lemmatizing_ or Stemming

In [0]:
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:

wn.lemmatize("ate")


'ate'

In [0]:
ps.stem("ate")

'ate'

In [0]:
import nltk
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

data['comment_text_lemmatized'] = data['comment_text_nostop'].apply(lambda x: lemmatizing(x))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,target,comment_text,comment_text_clean,comment_text_tokenized,comment_text_nostop,comment_text_lemmatized
0,0.0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well ...",This is so cool Its like would you want your mother to read this Really great idea well done,"[this, is, so, cool, its, like, would, you, want, your, mother, to, read, this, really, great, i...","[cool, like, would, want, mother, read, really, great, idea, well, done]","[cool, like, would, want, mother, read, really, great, idea, well, done]"
1,0.0,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyon...",Thank you This would make my life a lot less anxietyinducing Keep it up and dont let anyone get ...,"[thank, you, this, would, make, my, life, a, lot, less, anxietyinducing, keep, it, up, and, dont...","[thank, would, make, life, lot, less, anxietyinducing, keep, dont, let, anyone, get, way]","[thank, would, make, life, lot, le, anxietyinducing, keep, dont, let, anyone, get, way]"
2,0.0,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,This is such an urgent design problem kudos to you for taking it on Very impressive,"[this, is, such, an, urgent, design, problem, kudos, to, you, for, taking, it, on, very, impress...","[urgent, design, problem, kudos, taking, impressive]","[urgent, design, problem, kudos, taking, impressive]"
3,0.0,Is this something I'll be able to install on my site? When will you be releasing it?,Is this something Ill be able to install on my site When will you be releasing it,"[is, this, something, ill, be, able, to, install, on, my, site, when, will, you, be, releasing, it]","[something, ill, able, install, site, releasing]","[something, ill, able, install, site, releasing]"
4,0.893617,haha you guys are a bunch of losers.,haha you guys are a bunch of losers,"[haha, you, guys, are, a, bunch, of, losers]","[haha, guys, bunch, losers]","[haha, guy, bunch, loser]"


In [0]:
#for stemming

#ps = nltk.PorterStemmer()
#def stemming(tokenized_text):
#    text = [ps.stem(word) for word in tokenized_text]
#    return text

#data['comment_text_stemmed'] = data['comment_text_nostop'].apply(lambda x: stemming(x))

## 5. Save pre-processed data

In [0]:
export_csv = data.to_csv (r'pre_processed_comments.csv', index = None, header=True)