Antes de executar o notebook, baixe os dados [nesse link](https://kaggle.com/competitions/nlp-getting-started) e os coloque em uma pasta com nome "dados".

In [1]:
# Importando as bibliotecas utilizadas

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# bibliotecas utilizadas para o pré-processamento do texto

import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [3]:
# Módulos baixados com o download do nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords') 
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /home/ilan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ilan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/ilan/nltk_data...
[nltk_data] Downloading package stopwords to /home/ilan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /home/ilan/nltk_data...


True

In [4]:
# Bibliotecas utilizadas na build

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [6]:
# Utilizada no embeding
from gensim.models import Word2Vec

In [7]:
# Base de dados
df_train = pd.read_csv('./dados/train.csv')
df_test = pd.read_csv('./dados/test.csv')

In [8]:
# Remove pontuações, converte letras maiúsculas pra minúsculas
# e faz o strip nas strings
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

In [9]:
# Remove palavras que não possuem relevância 
# para a classificação, como pronomes.

def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

In [10]:
# "LEMMATIZATION"
# Reduz as palavras para formas mais básicas
# ex: information -> inform
wl = WordNetLemmatizer()

# Função auxiliar
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
# Converte para tokens

def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [12]:
# Etapa final do pré-processamento

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
df_train['clean_text'] = df_train['text'].apply(lambda x: finalpreprocess(x))
df_train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get sent photo ruby alaska smoke wildfires pou...


In [13]:
# Separando os dados em dados de treino e teste

X_train, X_test, y_train, y_test = train_test_split(df_train["clean_text"],df_train["target"],test_size=0.2,shuffle=True)

In [14]:
# Etapa do Word2Vec
# Utilizado nas palavras convertidas em tokens

X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

In [15]:
# Criando um modelo Word2Vec

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec

        self.dim = len(next(iter(word2vec.values())))
    
    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

df_train['clean_text_tok']=[nltk.word_tokenize(i) for i in df_train['clean_text']] #convert preprocessed sentence to tokenized sentence
model = Word2Vec(df_train['clean_text_tok'],min_count=1)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))
modelw = MeanEmbeddingVectorizer(w2v)

In [16]:
# Convertendo o texto para formato numérico com Word2Vec
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_test_vectors_w2v = modelw.transform(X_test_tok)

In [None]:
# Parâmetros utilizados a Grid Search
parametros = {
    'loss'              : ('hinge', 'modified_huber', 'perceptron', 'squared_error', 'huber'),
    'penalty'           : ('l2','l1','elasticnet'),
    'class_weight'      : ('balanced', None)  
}

# Monta a estrutura para a busca
grid = GridSearchCV(estimator=SGDClassifier(),
                    param_grid=parametros,
                    cv=5,
                    scoring='f1')


grid.fit(X_train_vectors_w2v, y_train)

preds = grid.predict(X_test_vectors_w2v)

pd.DataFrame(grid.cv_results_).sort_values(['rank_test_score'], ignore_index=True)[['params','rank_test_score','mean_test_score']]

In [58]:
for i in grid.best_params_:
    print(f'{i}: {grid.best_params_[i]}')

class_weight: None
loss: modified_huber
penalty: elasticnet


In [66]:
report = classification_report(y_test, preds, target_names=['Non-Disaster Tweet', 'Disaster Tweet'])

print(report)

                    precision    recall  f1-score   support

Non-Disaster Tweet       0.57      1.00      0.73       860
    Disaster Tweet       0.91      0.03      0.06       663

          accuracy                           0.58      1523
         macro avg       0.74      0.51      0.39      1523
      weighted avg       0.72      0.58      0.44      1523



In [39]:
from joblib import dump

dump(grid.best_estimator_, 'modelo_treinado.joblib')

['modelo_treinado.joblib']