<a href="https://colab.research.google.com/github/jonathansantoss/portfolio-machine-learning/blob/master/Prever_ocorrencia_desastre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports
import numpy as np
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import sklearn
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm, tqdm_notebook

# Importa os dados de treino

In [2]:
dados_treino = pd.read_csv('dados_treino.csv')
dados_treino.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Verifica se contem dados outliers

In [3]:
dados_treino.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

# Remove colunas com dados outliers

In [4]:
colunms = ['text', 'target']
dados_treino_copy = dados_treino[colunms]
dados_treino_copy.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# Manipulação do texto

In [5]:
class Sequences():
    def __init__(self):
        self.vectorizer = CountVectorizer(stop_words = 'english', max_df = 0.99, min_df = 0.005)
        self.sequences = self.vectorizer.fit_transform(dados_treino_copy.text.tolist())
        self.labels = dados_treino_copy.target.tolist()
        self.token2idx = self.vectorizer.vocabulary_
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
        
    def __getitem__(self, i):
        return self.sequences[i, :].toarray(), self.labels[i]
    
    def __len__(self):
        return self.sequences.shape[0]

dados_frases = Sequences()

train_loader = DataLoader(dados_frases, batch_size = 240)
train_loader

# Define o device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Criando Modelo

In [6]:
# Classificador
class BiLSTM(nn.Module):
    
    # Método construtor para inicializar os atributos
    def __init__(self, vocab_size, hidden_size, embedding_dim):
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size, bidirectional=True)
        self.predictor = nn.Linear(hidden_size, 1)
    
    # Método para a passada para a frente (forward)
    def forward(self, seq):
      output, (hidden,_) = self.encoder(self.embedding(seq))
      preds = self.predictor(hidden.squeeze())
      return preds

In [82]:
modelo = BiLSTM(len(dados_frases.token2idx), 128, 256)

In [83]:
modelo

BiLSTM(
  (embedding): Embedding(240, 256)
  (encoder): LSTM(256, 128, bidirectional=True)
  (predictor): Linear(in_features=128, out_features=1, bias=True)
)

In [84]:
criterion = nn.BCEWithLogitsLoss()

In [85]:
# Adam dinamicamente altera a taxa de aprendizagem
optimizer = optim.AdamW([p for p in modelo.parameters() if p.requires_grad], lr = 0.001)

# Treinando

In [86]:
# Instância de treinamento do modelo
modelo.train()

# Lista para armazenar os erros a cada passada de treinamento
train_losses = []

# Número de épocas
epochs = 20

# Loop de treinamento
for epoch in range(epochs): 
    
    # Barra de progresso
    progress_bar = tqdm_notebook(train_loader, leave = False)
    
    # Listas de controle
    losses = []
    total = 0
    
    # Loop
    for inputs, target in progress_bar:
      
        # Modelo
      modelo.zero_grad()

        # Saída (previsão do modelo)
      output = modelo(inputs.squeeze())
      output = output[0].squeeze()

        # Cálculo do erro

      if len(target) == 240:
        loss = criterion(output, target.float())
      else:
        loss = criterion(output[0:173], target.float())

        # Instância do Backpropagation
        loss.backward()
        
        # Prepara atualização dos parâmetros (coeficientes)    
        nn.utils.clip_grad_norm_(modelo.parameters(), 3)

        # Executa o otimizador
        optimizer.step()
        
        # Atualiza a barra de progresso
        progress_bar.set_description(f'\nErro do Modelo: {loss.item():.3f}')
        
        # Erros e total
      losses.append(loss.item())
      total += 1
    
    # Erro da epoch
    epoch_loss = sum(losses) / total
    
    # Erro de treinamento
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tErro em Treinamento: {epoch_loss:.3f}')

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #1	Erro em Treinamento: 0.707


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #2	Erro em Treinamento: 0.692


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #3	Erro em Treinamento: 0.684


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #4	Erro em Treinamento: 0.692


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #5	Erro em Treinamento: 0.707


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #6	Erro em Treinamento: 0.709


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #7	Erro em Treinamento: 0.702


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #8	Erro em Treinamento: 0.694


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #9	Erro em Treinamento: 0.688


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #10	Erro em Treinamento: 0.686


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #11	Erro em Treinamento: 0.685


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #12	Erro em Treinamento: 0.685


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #13	Erro em Treinamento: 0.686


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #14	Erro em Treinamento: 0.687


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #15	Erro em Treinamento: 0.689


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #16	Erro em Treinamento: 0.692


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #17	Erro em Treinamento: 0.695


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #18	Erro em Treinamento: 0.697


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #19	Erro em Treinamento: 0.697


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Epoch #20	Erro em Treinamento: 0.696


# Prever se o desastre é real ou não

In [87]:
# Função para prever o desastre
def predict_disaster(dados):
    
    # Carrega o modelo
    modelo.eval()

    target = dados['text']

    previsoes = []

    for pred in target:
      with torch.no_grad():
        test_vector = torch.LongTensor(dados_frases.vectorizer.transform([pred]).toarray())

        # Previsão
        output = modelo(test_vector)
        output = output[0:1, 0:1]
        
        # Gera a previsão final como probabilidade
        prediction = torch.sigmoid(output).item()

        # Checa a probabilidade com limite de 0.5
        if prediction >= 0.5:
            previsoes.append(1)
        else:
          previsoes.append(0)
    
    return previsoes

In [88]:
dados_teste = pd.read_csv('dados_teste.csv')
dados_teste.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [89]:
y_pred = predict_disaster(dados_teste)

In [90]:
colunms = ['id', 'text']
submission = dados_teste[colunms]
submission['disaster'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [91]:
submission.to_csv('subimission', index=False)