# Spam data

In [6]:
import numpy as np
import pandas as pd
import glob

In [6]:
path = r'data/text'
allFiles = glob.glob(path+'/*.csv')
allFiles

['data/text\\Youtube01-Psy.csv',
 'data/text\\Youtube02-KatyPerry.csv',
 'data/text\\Youtube03-LMFAO.csv',
 'data/text\\Youtube04-Eminem.csv',
 'data/text\\Youtube05-Shakira.csv']

### Get files and append them into DataFrame

In [7]:
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col = None, header=0)
    list_.append(df)
frame = pd.concat(list_)
print(df.head())
df = frame.iloc[:,3:5]
print(df.head())

                              COMMENT_ID                              AUTHOR  \
0    z13lgffb5w3ddx1ul22qy1wxspy5cpkz504                          dharma pal   
1      z123dbgb0mqjfxbtz22ucjc5jvzcv3ykj                       Tiza Arellano   
2  z12quxxp2vutflkxv04cihggzt2azl34pms0k  Prìñçeśś Âliś Łøvê Dømíñø Mâđiś™ ﻿   
3      z12icv3ysqvlwth2c23eddlykyqut5z1h                       Eric Gonzalez   
4      z133stly3kete3tly22petvwdpmghrlli                       Analena López   

                         DATE  \
0  2015-05-29T02:30:18.971000   
1  2015-05-29T00:14:48.748000   
2  2015-05-28T21:00:08.607000   
3  2015-05-28T20:47:12.193000   
4  2015-05-28T17:08:29.827000   

                                             CONTENT  CLASS  
0                                         Nice song﻿      0  
1                                      I love song ﻿      0  
2                                      I love song ﻿      0  
3  860,000,000 lets make it first female to reach...      0  
4         

In [13]:
a = df.iloc[:,1:].sum(axis=1)
aa = a.value_counts()
aa

1    1005
0     951
dtype: int64

### Split data to training and testing!

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

x = df['CONTENT']

y = df['CLASS']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=53)

In [None]:
print('train size={} test size={} full data size={}'.format(x_train.shape, y_test.shape, x.shape))

### Generate features from training and testing data

In [None]:
# - create features using countvectorizer
count_vectorizer = CountVectorizer(stop_words='english')

# creates the document term matrix
count_train = count_vectorizer.fit_transform(x_train.values)

count_test = count_vectorizer.transform(x_test.values)

### Train the ML model for a given sensitivity value for naive bayesian classifier

In [None]:
# create Naive Bayes classifier with smoothing parameter
nb_classifier = MultinomialNB(alpha=0.1)

#fit the classifier to training data
nb_classifier.fit(count_train, y_train)

pred = nb_classifier.predict(count_test)
score = metrics.accuracy_score(y_test, pred)

### show some predictions

In [None]:
results = pd.DataFrame({'comment':x_test[:-1].values, 'label':pred})
print(results.head())

# Pipelines

In [4]:
from sklearn.pipeline import Pipeline

# Test of MOR

In [None]:
from sklearn.datasets import load_linnerud
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

linnerud = load_linnerud()

X = linnerud.data
Y = linnerud.target

# to set number of jobs to the number of cores, use n_jobs=-1
a = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=-1).fit(X, Y)

In [None]:
a.predict([[5,3,7]])

In [None]:
a.predict([[5,162,60]])

# MOR for text

In [7]:
data_raw = pd.read_csv('data/data1.csv')
data_raw.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
rowSums = data_raw.iloc[:,2:].sum(axis=1)
clean_comments_count = (rowSums==0).sum(axis=0)
print("Total number of comments = ",len(data_raw))
print("Number of clean comments = ",clean_comments_count)
print("Number of comments with labels =",(len(data_raw)-clean_comments_count))

Total number of comments =  159571
Number of clean comments =  143346
Number of comments with labels = 16225


In [9]:
categories = list(data_raw.columns)
categories = categories[2:]
categories

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [10]:
# Calculating number of comments in each category

counts = []
for category in categories:
    counts.append((category, data_raw[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of comments'])
df_stats

Unnamed: 0,category,number of comments
0,toxic,15294
1,severe_toxic,1595
2,obscene,8449
3,threat,478
4,insult,7877
5,identity_hate,1405


In [11]:
label_data = data_raw.iloc[:,2:]
num_label = label_data.sum(axis=1)
# number of comments wrt number of labels for those commments
num_label.value_counts()

0    143346
1      6360
3      4209
2      3480
4      1760
5       385
6        31
dtype: int64

# clean text data with regex

In [12]:
import re
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

# select subdata

In [13]:
print(data_raw.head())
data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
82034,db71d3e451de8151,"""\nIt was only likely to get worse. And especi...",0,0,0,0,0,0
78652,d2727d97384b3464,"True\nYerrr, that's probably for the best, to ...",0,0,0,0,0,0
158806,f41a7882a35681d3,I thought he was Jasper Cullen. When was he ev...,0,0,0,0,0,0
138021,e29f5ebdfe58bb7e,"REDIRECT Talk:Swiss immigration referendum, Fe...",0,0,0,0,0,0
107970,4124b0c9c8b5b063,August 2010 (UTC)\nAnd this is coming from a S...,0,0,0,0,0,0
151738,81fe99375bb78dee,"""\n\n Dyersburg historic building pics \n\n pi...",0,0,0,0,0,0
100122,17ddbd6b245cdc3b,"""== Thank you for the renames! ==\n\nIs there ...",0,0,0,0,0,0
134082,cd1b41945a774e55,Regarding edits made during November 9 2006 (U...,0,0,0,0,0,0
57606,9a1faa4e3fa4751c,I request that user page,0,0,0,0,0,0
153227,997087d8c30afbd5,"""\n\nIn order to facilitate reference to talk ...",0,0,0,0,0,0


In [14]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
82034,db71d3e451de8151,it was only likely to get worse and especially...,0,0,0,0,0,0
78652,d2727d97384b3464,true yerrr thats probably for the best to be h...,0,0,0,0,0,0
158806,f41a7882a35681d3,i thought he was jasper cullen when was he eve...,0,0,0,0,0,0
138021,e29f5ebdfe58bb7e,redirect talk swiss immigration referendum feb...,0,0,0,0,0,0
107970,4124b0c9c8b5b063,august utc and this is coming from a starwar...,0,0,0,0,0,0


In [15]:
# split to train and test data
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)

print(train.shape)
print(test.shape)

(1400, 8)
(600, 8)


In [16]:
train_text = train['comment_text']
test_text = test['comment_text']


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

feature_t = Pipeline([
    ('feature', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2'))
])
feature_t.fit(train_text)
feature_t.fit(test_text)
x_train = feature_t.transform(train_text)
y_train = train.drop(labels = ['id','comment_text'], axis=1)
model_t = Pipeline([
    ('clf',RandomForestClassifier(max_depth=None,
                                  n_jobs=-1,
                                  n_estimators=82,
                                  min_samples_split=10,
                                  max_features='sqrt'))
    ])
model_t = Pipeline([
    ('clf',OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1))
    ])


In [20]:
# prepare test data
x_test = feature_t.transform(test_text)
y_test = test.drop(labels = ['id','comment_text'], axis=1)
y_test.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
52735,0,0,0,0,0,0
141775,0,0,0,0,0,0
24943,0,0,0,0,0,0
38366,0,0,0,0,0,0
133592,0,0,0,0,0,0


In [21]:
model_t.fit(x_train, y_train['toxic'])
prediction = model_t.predict(x_test)
print('Test accuracy is {}'.format(accuracy_score(y_test['toxic'], prediction)))

Test accuracy is 0.9116666666666666


In [22]:
prediction.sum()

0

In [23]:
# unleash MORS
mor = MultiOutputRegressor(model_t)
mor.fit(x_train, y_train)

prediction = mor.predict(x_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))


Test accuracy is 0.9033333333333333


In [68]:
print(test_text.head())
print(prediction.shape)
print(prediction.sum(axis=0))
print(y_test.sum())

146665    clear river unidentified deposits in regards t...
80568     lol at all of these idiots who dont know about...
25535     well if it is true that they are the only tech...
103573    unfortunately by all accounts china is a commu...
28435     there would need to be a citable source for su...
Name: comment_text, dtype: object
(600, 6)
[0 0 0 0 0 0]
toxic            63
severe_toxic      4
obscene          35
threat            1
insult           32
identity_hate     6
dtype: int64


In [55]:
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
146665,2fbc68e4fddd6221,clear river unidentified deposits in regards t...,0,0,0,0,0,0
80568,d78bd505d7fef41e,lol at all of these idiots who dont know about...,1,0,1,0,1,0
25535,439b6eb7a58d6e9a,well if it is true that they are the only tech...,0,0,0,0,0,0
103573,2a2b8273d89c7939,unfortunately by all accounts china is a commu...,0,0,0,0,0,0
28435,4b48cc346221c9b1,there would need to be a citable source for su...,0,0,0,0,0,0
