In [3]:
import numpy as np
import pandas as pd
import sklearn
import torch
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from nltk import TweetTokenizer
from nltk.corpus import stopwords
import os

In [4]:
# Sanity check because this notebook runs on my own machine: pick the compute
# device and report which GPU (if any) PyTorch can see.
import tensorflow as tf

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when no CUDA device is present, so only query it
# when CUDA is available and show a plain "cpu" label otherwise.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'GeForce GTX 1060 3GB'

In [5]:
# Import the BERT tokenizer/model classes from pytorch_pretrained_bert.
# If the package is missing, install it with pip3 and retry the import.
try:
  from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
except ImportError:
  # IPython shell escape: runs pip3 as a shell command from the notebook.
  !pip3 install pytorch-pretrained-bert
  from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

## Chargement des Données

In [6]:
# The data files live under data_dir (this notebook runs locally, not on Kaggle).
# List every file found in that directory and its sub-directories.
data_dir = './data/'

# Walk data_dir itself rather than the hard-coded '/kaggle/input', which does
# not exist on a local machine and would silently list nothing.
for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [7]:
# Read the training split from data_dir and preview the first rows.
train_csv_path = os.path.join(data_dir, 'snli_train.csv')
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,sentence1,sentence2,label1,id
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral,0
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction,1
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment,2
3,Children smiling and waving at camera,They are smiling at their parents,neutral,3
4,Children smiling and waving at camera,There are children present,entailment,4


In [29]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split reproducible across kernel restarts.
# NOTE(review): this cell rebinds train_data, so running it twice splits the
# already-reduced training set again; re-run the CSV loading cell first.
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)
train_data.head()

Unnamed: 0,sentence1,sentence2,label1,id
179274,A man in suspenders bales crops near a body of...,A man is outside.,entailment,179277
94344,An old man in a black trench coat standing in ...,THe man is outside.,entailment,94347
273955,"Two dogs, one with its mouth open.",One of the 2 dogs opened their mouth to catch ...,neutral,273958
356396,One black female and one white female plus thr...,The avant garde group is performing in public.,neutral,356402
200840,A young woman dressed as angel passes out flie...,A person is dressed as an angel.,entailment,200843


## Prétraitement

In [8]:
# Width of one embedding row; train_bert allocates its output with this size.
nb_features = 768
nd_documents = len(train_data)

# Pretrained 'bert-base-uncased' vocabulary and weights. eval() switches the
# model to evaluation mode and returns the model itself.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').eval()

# NOTE(review): np_array is not read anywhere in the visible notebook, yet it
# allocates one row per training document — confirm it is still needed.
np_array = torch.zeros(nd_documents, nb_features)

In [9]:
# One "[CLS] premise [SEP] hypothesis" string per training pair.
sentences = [
    "[CLS] " + premise + " [SEP] " + hypothesis
    for premise, hypothesis in zip(train_data['sentence1'], train_data['sentence2'])
]
# Length of the longest joined string, counted in characters (not BERT tokens).
max_len = max(map(len, sentences))

In [None]:
import time
from sklearn import preprocessing

def train_bert(data, label_encoder=None):
    """Embed every sentence pair of `data` with BERT and encode its labels.

    Despite the name, nothing is trained here: the module-level `model` is
    only run forward (under `torch.no_grad()`) to turn each
    (sentence1, sentence2) pair into one fixed-size vector.

    For each pair the text "[CLS] sentence1 [SEP] sentence2 [SEP]" is
    tokenized with the module-level `tokenizer`. Segment id 0 covers the
    tokens up to and including the first "[SEP]", segment id 1 covers the
    rest. The pair embedding is the mean over tokens of the encoder layers
    with indices 7 to 11 (the last five when the model has 12 layers),
    averaged again across those layers.

    NOTE: embeddings cached by an earlier version of this function (no closing
    "[SEP]", first "[SEP]" in segment 1) are not comparable with the ones
    produced here; regenerate train and validation caches together.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'sentence1', 'sentence2' and 'label1'.
        Missing sentences (NaN) are treated as empty strings.
    label_encoder : object with a `transform` method, optional
        An already fitted encoder (e.g. sklearn's LabelEncoder). Pass the same
        instance for every split so the label -> integer mapping is identical
        across splits. When omitted, a fresh LabelEncoder is fitted on
        `data["label1"]`, which is the historical behaviour.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Embeddings of shape (len(data), nb_features) and the encoded labels.
    """
    n_documents = len(data)
    start = time.time()
    embedded = torch.zeros([n_documents, nb_features])
    # Build the inputs on whatever device the model lives on, so the function
    # keeps working if the model has been moved to a GPU.
    model_device = next(model.parameters()).device

    with torch.no_grad():
        pairs = zip(data['sentence1'], data['sentence2'])
        for index, (sent1, sent2) in enumerate(pairs):
            # A NaN cell would make the string concatenation below raise.
            premise = "" if pd.isna(sent1) else str(sent1)
            hypothesis = "" if pd.isna(sent2) else str(sent2)

            sentence = "[CLS] " + premise + " [SEP] " + hypothesis + " [SEP]"
            tokenized_sentence = tokenizer.tokenize(sentence)

            # The first [SEP] closes the first segment, so it belongs to it.
            first_segment_len = tokenized_sentence.index("[SEP]") + 1
            second_segment_len = len(tokenized_sentence) - first_segment_len

            # Both inputs carry an explicit batch dimension of size 1.
            ids = torch.tensor(
                [tokenizer.convert_tokens_to_ids(tokenized_sentence)],
                device=model_device,
            )
            segments = torch.tensor(
                [[0] * first_segment_len + [1] * second_segment_len],
                device=model_device,
            )
            encoded_layers, _ = model(ids, segments)

            # Mean over the token axis for each selected layer, then mean
            # across the selected layers.
            per_layer = [layer.mean(dim=1) for layer in encoded_layers[7:12]]
            pooled = torch.stack(per_layer).mean(dim=0)
            embedded[index] = pooled.squeeze(0).cpu()

            if index % 50 == 0:
                print(f"Document: {index}/{n_documents}")

    print(time.time() - start)

    if label_encoder is None:
        label_encoder = preprocessing.LabelEncoder()
        encoded_labels = label_encoder.fit_transform(data["label1"])
    else:
        encoded_labels = label_encoder.transform(data["label1"])
    return embedded.numpy(), np.array(encoded_labels)

In [58]:
# Pause before the expensive embedding cells below. The typed answer is only
# echoed as the cell output; it is not checked.
input(
    "Are you sure you want to re-embed the whole corpus?? this takes a loooot of time. "
    "Jump to Plongement to load pre-made data"
)

Are you sure you want to re-embed the whole corpus?? this takes a loooot of time. Jump to Plongement to load pre-made data


''

In [57]:
# Embed the first 2000 training pairs, then write embeddings and labels to disk.
# NOTE: np.savetxt writes plain text despite the '.npy' suffix; the loading
# cells read these files back with np.loadtxt.
train_emb, train_labels = train_bert(train_data.head(2000))
np.savetxt('train_emb_data.npy', train_emb)
np.savetxt('train_emb_labels.npy', train_labels)

Are you sure you want to re-embed the whole corpus?? this takes a loooot of time. Jump to Plongement to load pre-made data


KeyboardInterrupt: 

In [52]:
# Embed the first 200 validation pairs, then write embeddings and labels to disk.
# NOTE: np.savetxt writes plain text despite the '.npy' suffix; the loading
# cells read these files back with np.loadtxt.
valid_emb, valid_labels = train_bert(validation_data.head(200))
np.savetxt('valid_emb_data.npy', valid_emb)
np.savetxt('valid_emb_labels.npy', valid_labels)

Document: 0/200
Document: 50/200
Document: 100/200
Document: 150/200
12.607025384902954


## Plongement

In [53]:
# Load the cached training embeddings and labels (text files written by np.savetxt).
train_emb = np.loadtxt('train_emb_data.npy')
# The labels are integer class ids, but the text round-trip brings them back
# as float64; cast them to int again.
train_labels = np.loadtxt('train_emb_labels.npy').astype(int)

In [54]:
# Load the cached validation embeddings and labels (text files written by np.savetxt).
valid_emb = np.loadtxt('valid_emb_data.npy')
# The labels are integer class ids, but the text round-trip brings them back
# as float64; cast them to int again.
valid_labels = np.loadtxt('valid_emb_labels.npy').astype(int)

## Entraînement du modèle

In [41]:
# Ici, vous devez écrire votre modèle

# Le format idéalisé ressemblerait à :
#
# model = NLPModel
# model.learn(train_data)
# submission['score'] = model.predict(test_data)


## Validation du modèle

In [44]:
# Accuracy computation.
# Placeholder until a real model exists: score a model that picks one of the
# training labels uniformly at random for every validation row.
import sklearn.metrics  # `import sklearn` alone does not guarantee the metrics submodule is loaded

labels = train_data.label1.unique()
validation_copy = validation_data.copy()
# Seeded generator so the baseline score is the same on every run.
validation_copy["label1"] = np.random.RandomState(42).choice(labels, size=len(validation_copy))

sklearn.metrics.accuracy_score(validation_data["label1"], validation_copy["label1"])

0.33427321056187037

## Prédiction des données de test

In [33]:
# In this notebook the test predictions are random placeholders.
# `labels` comes from the validation cell above, which must have been run first.
test_data = pd.read_csv(os.path.join(data_dir, 'snli_test.csv'))
# Seeded generator so the generated submission is reproducible.
test_data["label1"] = np.random.RandomState(42).choice(labels, size=len(test_data))

## Enregistrement de la soumission

In [45]:
# Save the predictions to a file that can then be submitted to Kaggle for
# evaluation.
submission = test_data[["id", "label1"]]
# to_csv does not create missing directories, so make sure './out' exists.
os.makedirs('./out', exist_ok=True)
submission.to_csv('./out/submission.csv', index=False)
submission.head()

Unnamed: 0,id,label1
0,1000000,contradiction
1,1000001,neutral
2,1000002,neutral
3,1000003,entailment
4,1000004,contradiction
