Goals:
 - Data pipeline for text classification
 - accuracy metrics

In [1]:
# from google.colab import files
# import io  #No need if running on local runtime
import pandas as pd
import string

In [2]:
# uploaded = files.upload()

In [3]:
# raw_train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
raw_train = pd.read_csv('train.csv')
raw_train.shape

(159571, 8)

In [4]:
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [5]:
raw_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
raw_train.comment_text.values

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

- The text that is to be classified is the comment_text column. This needs to be cleaned and processed.
- By looking at the data punctuation, special characters like new line need to be removed.
- EDA needs to be done on statistics of the toxicity labels

In [7]:
raw_train.toxic.values.sum()

15294

In [9]:
# raw_train[raw_train['toxic'] == 1].clean_comment

In [10]:
raw_comments = raw_train.comment_text
raw_comments.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [11]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

In [13]:
def clean_text(text):
    translator = str.maketrans('','',string.punctuation)
    text = word_tokenize(text.translate(translator))
    commentList = [porter.stem(word) for word in text if not word in stop_words]
    return commentList

In [14]:
test_comments = raw_comments[0:2]
print(test_comments)
print(test_comments.apply(clean_text))

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
Name: comment_text, dtype: object
0    [explan, whi, edit, made, usernam, hardcor, me...
1    [daww, He, match, background, colour, Im, seem...
Name: comment_text, dtype: object


In [15]:
import sys
sys.setrecursionlimit(10**6)
clean_comments = raw_comments.apply(clean_text)

In [16]:
clean_comments

0         [explan, whi, edit, made, usernam, hardcor, me...
1         [daww, He, match, background, colour, Im, seem...
2         [hey, man, Im, realli, tri, edit, war, it, guy...
3         [more, I, cant, make, real, suggest, improv, I...
4          [you, sir, hero, ani, chanc, rememb, page, that]
                                ...                        
159566    [and, second, time, ask, view, complet, contra...
159567    [you, asham, that, horribl, thing, put, talk, ...
159568    [spitzer, umm, there, actual, articl, prostitu...
159569    [and, look, like, actual, put, speedi, first, ...
159570    [and, I, realli, dont, think, understand, I, c...
Name: comment_text, Length: 159571, dtype: object

In [17]:
labels = raw_train.columns[2:]

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [23]:
labels_df = raw_train[['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']]

In [26]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(clean_comments, labels_df, test_size=0.2)

In [28]:
y_train.shape

(127656, 6)

In [35]:
x_train[0:2]

55918                       [Oh, ive, watch, articl, quit]
8883     [there, plenti, place, Im, discuss, editor, hi...
Name: comment_text, dtype: object

In [45]:
x_train_doc = [' '.join(x) for x in x_train]
x_val_doc = [' '.join(x) for x in x_val]

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(max_df=0.8,max_features=100)
vectorizer = TfidfVectorizer(max_df=0.8,max_features=140)
# testlist = [['text','tsder', 'text', 'omne']]
# vectorizer.fit_transform(testlist)
x_train_tfidf = vectorizer.fit_transform(x_train_doc)
x_val_tfidf = vectorizer.fit_transform(x_val_doc)

In [135]:
len(x_val_tfidf.data)

279978

In [136]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)
clf.fit(x_train_tfidf, y_train)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [137]:
y_pred = clf.predict(x_val_tfidf)

In [138]:
y_pred.shape

(31915, 6)

In [139]:
y_val[1000:1011]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
15681,0,0,0,0,0,0
7699,0,0,0,0,0,0
115165,0,0,0,0,0,0
31083,0,0,0,0,0,0
73519,0,0,0,0,0,0
12806,0,0,0,0,0,0
109594,1,0,0,0,0,0
136967,0,0,0,0,0,0
93387,0,0,0,0,0,0
79019,0,0,0,0,0,0


In [140]:
y_pred[1000:1011]

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0]])

In [141]:
f1_score(y_val, y_pred,average='micro')

0.41936869229117457

F1 score 0.41 is not bad at all

In [143]:
 # predict probabilities
y_pred_prob = clf.predict_proba(x_val_tfidf)
y_pred_prob

array([[0.09268251, 0.00374578, 0.03129628, 0.0004298 , 0.03173093,
        0.00704863],
       [0.07834758, 0.00335003, 0.03690267, 0.00072674, 0.02735494,
        0.00928489],
       [0.08008279, 0.00368462, 0.02347896, 0.0005314 , 0.02767431,
        0.00677088],
       ...,
       [0.01891863, 0.00109521, 0.00741822, 0.0004911 , 0.00676202,
        0.00192623],
       [0.01177092, 0.00026847, 0.00285832, 0.000667  , 0.00479053,
        0.00054196],
       [0.01478734, 0.00084674, 0.00749882, 0.00039578, 0.00877332,
        0.00158278]])

In [150]:
t = 0.3 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)
y_pred_new

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [151]:
f1_score(y_val, y_pred_new,average='micro')

0.46010786684024557

F1 score increased to 0.46