In [1]:
# Importando as bibliotecas utilizadas

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# bibliotecas utilizadas para o pré-processamento do texto

import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [3]:
# Módulos baixados com o download do nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords') 
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /home/ilan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ilan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ilan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ilan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ilan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Bibliotecas utilizadas na build

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

In [5]:
# Para vetorizar as palavras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Utilizada no embeding
import gensim
from gensim.models import Word2Vec

In [7]:
# Base de dados

df_train = pd.read_csv('./dados/train.csv')
df_test = pd.read_csv('./dados/test.csv')

In [8]:
# Remove pontuações, converte letras maiúsculas pra minúsculas
# e faz o strip nas strings
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

In [9]:
# Remove palavras que não possuem relevância 
# para a classificação, como pronomes.

def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

In [10]:
# "LEMMATIZATION"
# Reduz as palavras para formas mais básicas
# ex: information -> inform
wl = WordNetLemmatizer()

# Função auxiliar
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
# Converte para tokens

def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [12]:
# Etapa final do pré-processamento

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
df_train['clean_text'] = df_train['text'].apply(lambda x: finalpreprocess(x))
df_train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get sent photo ruby alaska smoke wildfires pou...


In [13]:
# Separando os dados em dados de treino e teste

X_train, X_test, y_train, y_test = train_test_split(df_train["clean_text"],df_train["target"],test_size=0.2,shuffle=True)

In [14]:
# Etapa do Word2Vec
# Utilizado nas palavras convertidas em tokens

X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

In [15]:
# Criando um modelo Word2Vec

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    
    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

df_train['clean_text_tok']=[nltk.word_tokenize(i) for i in df_train['clean_text']] #convert preprocessed sentence to tokenized sentence
model = Word2Vec(df_train['clean_text_tok'],min_count=1)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))
modelw = MeanEmbeddingVectorizer(w2v)

In [16]:
# Convertendo o texto para formato numérico com Word2Vec
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_test_vectors_w2v = modelw.transform(X_test_tok)

In [17]:
# Parâmetros utilizados a Grid Search
parametros = {
    'penalty':('l2','l1','elasticnet', None),
    'class_weight':('balanced', None)  
}

# Monta a estrutura para a busca
grid = GridSearchCV(estimator=Perceptron(),
                    param_grid=parametros,
                    cv=5,
                    scoring='f1')


grid.fit(X_train_vectors_w2v, y_train)

preds = grid.predict(X_test_vectors_w2v)

pd.DataFrame(grid.cv_results_).sort_values(['rank_test_score'], ignore_index=True)[['params','rank_test_score','mean_test_score']]

Unnamed: 0,params,rank_test_score,mean_test_score
0,"{'class_weight': None, 'penalty': None}",1,0.626583
1,"{'class_weight': None, 'penalty': 'elasticnet'}",2,0.483412
2,"{'class_weight': 'balanced', 'penalty': 'elast...",3,0.410162
3,"{'class_weight': None, 'penalty': 'l2'}",4,0.363404
4,"{'class_weight': 'balanced', 'penalty': 'l1'}",5,0.348824
5,"{'class_weight': 'balanced', 'penalty': None}",6,0.224258
6,"{'class_weight': None, 'penalty': 'l1'}",7,0.153999
7,"{'class_weight': 'balanced', 'penalty': 'l2'}",8,0.120299


In [18]:
for i in grid.best_params_:
    print(f'{i}: {grid.best_params_[i]}')

class_weight: None
penalty: None


In [19]:
confusion_matrix = pd.crosstab(y_test, preds)
report = classification_report(y_test, preds)

print('0 --> Non-disaster tweet')
print('1 --> Disaster tweet')
print(report)

0 --> Non-disaster tweet
1 --> Disaster tweet
              precision    recall  f1-score   support

           0       0.58      1.00      0.73       870
           1       0.96      0.04      0.08       653

    accuracy                           0.59      1523
   macro avg       0.77      0.52      0.41      1523
weighted avg       0.75      0.59      0.45      1523



In [20]:
from joblib import dump

dump(grid.best_estimator_, 'modelo_ala.joblib')

['modelo_ala.joblib']