## Importando bibliotecas

In [1]:
import sklearn
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
# Make sure NLTK's stop-word corpus is available locally; stopwords.words(...) is
# used later when the TF-IDF vectorizer is configured.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Importando os Dados

In [2]:
# Read both CSV files from the working directory in one pass:
# `train` holds the labelled tweets, `test` the tweets to be scored.
train, test = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

## Explorando o Dataset

In [3]:
# Preview the first 20 rows of the training frame (id, label, tweet)
train.head(20)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [4]:
# Peek at the last rows of the test frame; unlike `train`, it carries no label column
test.tail()

Unnamed: 0,id,tweet
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."
17196,49159,"my song ""so glad"" free download! #shoegaze ..."


In [5]:
# Number of tweets labelled 0 (not hate speech), counted with a vectorised comparison
train["label"].eq(0).sum()

29720

In [6]:
# Number of tweets labelled 1 (hate speech), counted with a vectorised comparison
train["label"].eq(1).sum()

2242

In [14]:
# Rows whose label is anything other than 2; this equals the total row count
# when the column holds only the 0/1 classes counted above.
train["label"].ne(2).sum()

31962

In [7]:
# Count missing values per column; all-zero counts mean no row needs imputing or dropping
train.isnull().sum()
# Alternative that collapses the check into a single boolean: train.isnull().values.any()

id       0
label    0
tweet    0
dtype: int64

## Limpeza dos Dados

In [8]:
#install tweet-preprocessor to clean tweets
!pip install tweet-preprocessor



In [9]:
# Regular expressions used to strip punctuation from the tweets
import re

# Characters that are deleted outright (nothing is left in their place).
# Raw strings hand the backslashes to the regex engine untouched and avoid the
# invalid-escape-sequence warnings that plain string literals trigger.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
# Sequences that are replaced by a single space: a doubled HTML line break,
# "-", "/" and ":". The line-break alternative includes its closing ">" and
# tolerates optional whitespace before "/"; the colon alternative matches the
# colon only, so the character following it is no longer swallowed.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)|(:)")

In [10]:
import preprocessor as p

# Custom cleaning helper combining tweet-preprocessor with the punctuation regexes
def clean_tweets(df):
    """Return a list with one cleaned string per tweet in ``df``.

    Every tweet goes through ``p.clean`` (tweet-preprocessor), is lower-cased,
    has each ``REPLACE_NO_SPACE`` match deleted and finally has each
    ``REPLACE_WITH_SPACE`` match replaced by a single space.

    Args:
        df: iterable of tweet strings (the notebook passes a DataFrame column).

    Returns:
        list of cleaned tweet strings, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", p.clean(tweet).lower()))
        for tweet in df
    ]

In [11]:
# Clean every training tweet and wrap the resulting list in a one-column DataFrame
train_clean_tweet = pd.DataFrame(clean_tweets(train["tweet"]))

In [12]:
# Attach the cleaned text to the training frame as a new `clean_tweet` column
train["clean_tweet"] = train_clean_tweet

# Show raw and cleaned tweets side by side as a quick sanity check
train.head(10)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now
5,6,0,[2/2] huge fan fare and big talking before the...,2 2 huge fan fare and big talking before they ...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so


In [13]:
# Apply the same cleaning to the test tweets and store the result
# as a new `clean_tweet` column of the test frame
test_clean_tweet = pd.DataFrame(clean_tweets(test["tweet"]))
test["clean_tweet"] = test_clean_tweet

# Inspect the last rows to compare raw and cleaned text
test.tail()

Unnamed: 0,id,tweet,clean_tweet
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation &gt3
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in omg &amp used words like assets&ampli...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...
17196,49159,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download


## Vetorizando valores usando TfidfVectorizer

In [14]:
# English stop words from NLTK, de-duplicated
stopset = set(stopwords.words('english'))
# TF-IDF features over lower-cased, accent-stripped text with the stop words removed.
# scikit-learn validates `stop_words` as 'english', a list, or None, and recent
# releases reject a set, so the set is handed over as a sorted (deterministic) list.
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=sorted(stopset))

In [26]:
# Target vector: the label column of the training frame as a NumPy array
y = train["label"].values

# Fit the TF-IDF vocabulary on the cleaned tweets and encode them as a document-term matrix.
# NOTE(review): the vectorizer is fitted on every row before the hold-out split is made,
# so the rows later used for evaluation also shape the vocabulary and IDF weights.
X = vectorizer.fit_transform(train["clean_tweet"].values)

# Both shapes must agree on the number of rows
print(y.shape, X.shape, sep="\n")

(31962,)
(31962, 22640)


## Separando dados para treino e teste

In [16]:
from sklearn.model_selection import train_test_split

# Keep 70% of the rows for fitting and hold out 30% for evaluation;
# the fixed random_state makes the shuffled split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

## Construindo o Modelo

Aplicando Naive Bayes

In [17]:
# Multinomial Naive Bayes with default hyper-parameters, fitted on the training split
clf = naive_bayes.MultinomialNB()
clf.fit(x_train, y_train)

MultinomialNB()

## Verificando o desempenho do Naive Bayes (ROC AUC)

In [18]:
# roc_auc_score over the positive-class probabilities gives the area under the ROC
# curve (scaled to a percentage here), which is not an accuracy, so label it accordingly.
print("ROC AUC score for Naive Bayes is: ", roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) * 100, '%')

Accuracy score for Naive Bayes is:  82.51412735469593 %
