In [None]:
# Colab only: mount Google Drive so files under /content/drive/MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U 'scikit-learn<0.24'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [None]:
# Keep DataFrame displays short: frames longer than 10 rows are rendered truncated.
pd.set_option('display.max_rows', 10)

In [None]:
# Load the raw contact table (company / address / name / phone) from Drive.
# NOTE(review): the file is decoded as ISO-8859-1, yet the preview shows words that look
# like they lost their accented letters (e.g. "Sablonnire", "Andr") — confirm the file's
# real encoding.
df_pre = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NER_dataset2_francais.csv", encoding = "ISO-8859-1")
df_pre.head()



Unnamed: 0.1,Unnamed: 0,company,address,name,phone
0,0,TRANSPORT HUCK,3 Rue Adolphe Moehler - ZI Nord 67210 OBERNAI ...,Mme Denise Scherer,307647897
1,1,BORDEAUX SELF STOCKAGE,15 RUE FRANCIS GARNIER 33300 BORDEAUX France,M. Vincent Leygonie,369587159
2,2,RIOLS,4 rue Charles Coulomb-Zac La Chartreuse 81100 ...,Mme Amandine Riols,456851367
3,3,PLANET'TRANSPORT,1 chemin La Sablonnire 45230 CHATILLON COLIGNY...,M. Didier Huck,542057743
4,4,LOIRET EXPRESS,111 r Andr Gide 45120 CHALETTE SUR LOING France,M. Philippe Wahl,641228510


In [None]:
# Preview the first rows of the raw table (repeats the preview of the loading cell).
df_pre.head()

Unnamed: 0.1,Unnamed: 0,company,address,name,phone
0,0,TRANSPORT HUCK,3 Rue Adolphe Moehler - ZI Nord 67210 OBERNAI ...,Mme Denise Scherer,307647897
1,1,BORDEAUX SELF STOCKAGE,15 RUE FRANCIS GARNIER 33300 BORDEAUX France,M. Vincent Leygonie,369587159
2,2,RIOLS,4 rue Charles Coulomb-Zac La Chartreuse 81100 ...,Mme Amandine Riols,456851367
3,3,PLANET'TRANSPORT,1 chemin La Sablonnire 45230 CHATILLON COLIGNY...,M. Didier Huck,542057743
4,4,LOIRET EXPRESS,111 r Andr Gide 45120 CHALETTE SUR LOING France,M. Philippe Wahl,641228510


In [None]:
# (rows, columns) of the raw table.
df_pre.shape

(2818, 5)

In [None]:
# Spot-check a single cell: the company value stored under index label 0.
df_pre['company'][0]

'TRANSPORT HUCK'

In [None]:
import random

# Column of df_pre -> suffix of the BIO tag emitted for tokens of that column.
FIELD_TAGS = {'company': 'C', 'address': 'Add', 'name': 'N', 'phone': 'T'}

# Dedicated, seeded generator: the field order is shuffled per row, and without a fixed
# seed the generated dataset (and every metric computed from it) would change on each run.
rng = random.Random(0)

# temp_data[0] = sentence ids, temp_data[1] = tokens, temp_data[2] = BIO tags.
temp_data = [[], [], []]

for i in range(len(df_pre)):
    # Each row becomes one "sentence": its four fields concatenated in a random order.
    for champs in rng.sample(list(FIELD_TAGS), k=len(FIELD_TAGS)):
        tag = FIELD_TAGS[champs]
        # str() turns non-string cells (e.g. the integer phone number) into text before
        # whitespace tokenisation.
        phrase = str(df_pre.at[i, champs]).split()
        for j, token in enumerate(phrase):
            temp_data[0].append(str(i))
            # Every stored token keeps a trailing space, exactly as before; note that
            # string predicates such as str.isdigit() are False on such a token.
            temp_data[1].append(token + ' ')
            # First token of a field opens the entity (B-), the rest continue it (I-).
            temp_data[2].append(("B-" if j == 0 else "I-") + tag)

df = pd.DataFrame({'Sentence #': temp_data[0], 'Word': temp_data[1], 'Tag': temp_data[2]})

In [None]:
# Show the generated token-level frame: one row per token with its sentence id and BIO tag.
df

Unnamed: 0,Sentence #,Word,Tag
0,0,Mme,B-N
1,0,Denise,I-N
2,0,Scherer,I-N
3,0,3,B-Add
4,0,Rue,I-Add
...,...,...,...
38024,2817,HENRI,I-Add
38025,2817,GUILBERT,I-Add
38026,2817,94110,I-Add
38027,2817,ARCUEIL,I-Add


In [None]:
# Inputs for the token-level representation: every column except the label column.
X = df.drop('Tag', axis=1)
X.head()

Unnamed: 0,Sentence #,Word
0,0,Mme
1,0,Denise
2,0,Scherer
3,0,3
4,0,Rue


In [None]:
# Column names of the token-level frame.
df.columns.to_numpy()

array(['Sentence #', 'Word', 'Tag'], dtype=object)

In [None]:
# Vectorise each row ({'Sentence #': ..., 'Word': ...}) with DictVectorizer; both values
# are strings, so each distinct value becomes its own indicator column.
# NOTE(review): sparse=False materialises the full dense matrix (shape shown below),
# which is large — consider sparse output if this X is actually needed.
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(38029, 16845)

In [None]:
# Label array taken from the Tag column, one entry per row of df.
y = df.Tag.values

In [None]:
# Sorted distinct tag values present in the data.
classes = np.unique(y)

In [None]:
# Convert the NumPy array of tags into a plain Python list and display it.
classes = classes.tolist()
classes

['B-Add', 'B-C', 'B-N', 'B-T', 'I-Add', 'I-C', 'I-N']

In [None]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((38029, 16845), (38029,))

In [None]:
# Token-level split: individual rows (tokens), not whole sentences, are assigned to
# train/test. X_train/X_test/y_train/y_test are reassigned further down by the
# sentence-level split used for the CRF.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [None]:
# Shapes of the training part of the token-level split.
X_train.shape, y_train.shape

((25479, 16845), (25479,))

In [None]:
# Label list handed to the CRF metrics further down; a plain copy of classes
# (no tag is removed here).
new_classes = classes.copy()
new_classes

['B-Add', 'B-C', 'B-N', 'B-T', 'I-Add', 'I-C', 'I-N']

In [None]:
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Conditional Random Fields (CRFs)

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

#### Get sentences

In [None]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(word, tag)`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``'Sentence #'``, ``'Word'`` and ``'Tag'``.

    Attributes
    ----------
    grouped : result of ``groupby('Sentence #').apply(...)``; one list of
        ``(word, tag)`` tuples per sentence id, ordered by groupby's default key
        sort (string ids therefore sort lexicographically: '0', '1', '10', ...).
    sentences : list of those per-sentence lists, in the same order.
    n_sent : integer cursor used by :meth:`get_next`.
    """

    def __init__(self, data):
        # NOTE(review): the cursor starts at 1, so a sentence whose id is '0' is never
        # returned by get_next() (it is still present in self.sentences).
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of the group with its tag, preserving row order.
            return list(zip(s['Word'].values.tolist(), s['Tag'].values.tolist()))

        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence whose id equals the cursor, then advance the cursor.

        The id is looked up as a string (``'1'``, ``'2'``, ...). Returns ``None``
        when no sentence has that id; the cursor is left unchanged in that case.
        """
        key = '{}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing id means "no more sentences"; any other error should
            # surface instead of being silently turned into None.
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the token-level frame into per-sentence lists of (word, tag) pairs.
getter = SentenceGetter(df)

In [None]:
# Fetch one sentence and show it both as (word, tag) pairs and as the marked-up text
# later fed to BERT. Tokens already end with a space, so joining them with " "
# produces the double spaces visible in the output.
sent = getter.get_next()
print(sent)

print("[CLS] " + " ".join([w[0] for w in sent]) + " [SEP]")

[('15 ', 'B-Add'), ('RUE ', 'I-Add'), ('FRANCIS ', 'I-Add'), ('GARNIER ', 'I-Add'), ('33300 ', 'I-Add'), ('BORDEAUX ', 'I-Add'), ('France ', 'I-Add'), ('BORDEAUX ', 'B-C'), ('SELF ', 'I-C'), ('STOCKAGE ', 'I-C'), ('M. ', 'B-N'), ('Vincent ', 'I-N'), ('Leygonie ', 'I-N'), ('369587159 ', 'B-T')]
[CLS] 15  RUE  FRANCIS  GARNIER  33300  BORDEAUX  France  BORDEAUX  SELF  STOCKAGE  M.  Vincent  Leygonie  369587159  [SEP]


In [None]:
# All sentences as lists of (word, tag) pairs; input of the feature extraction below.
sentences = getter.sentences

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to the sklearn-crfsuite format: each sentence is converted to a list of dicts.

### Word embeddings with BERT

In [None]:
!pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM


# Load the pretrained 'bert-base-cased' tokenizer and encoder once, at module level;
# get_word_embedings() below reads both as globals.
# NOTE(review): presumably this is the English checkpoint while the data is French —
# confirm that this is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

model = BertModel.from_pretrained('bert-base-cased')
# Switch to evaluation mode: the model is only used for inference here.
model.eval()

def get_word_embedings(sent):
    """Return one vector per BERT token of ``sent``.

    ``sent`` is a sequence of ``(word, tag)`` pairs; only the words are used. They
    are joined with spaces between ``[CLS]`` and ``[SEP]`` markers, tokenized and
    run through the module-level ``model`` without gradient tracking. For every
    token the last four encoder layers are summed.

    Returns a list of NumPy arrays, one per token produced by the tokenizer
    (including the ``[CLS]`` and ``[SEP]`` markers).
    NOTE(review): the tokenizer may split one word into several pieces, so the
    returned list is presumably not aligned one-to-one with ``sent`` — confirm
    before indexing it by word position.
    """
    words = [pair[0] for pair in sent]
    marked_text = "[CLS] " + " ".join(words) + " [SEP]"

    pieces = tokenizer.tokenize(marked_text)
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)

    # Single-sentence input: every token gets segment id 1.
    tokens_tensor = torch.tensor([piece_ids])
    segments_tensors = torch.tensor([[1] * len(pieces)])

    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # Stack the per-layer outputs, drop the batch axis (dim 1) and move the token
    # axis first, so iterating yields one (layers x hidden) block per token.
    per_token = torch.stack(encoded_layers, dim=0).squeeze(dim=1).permute(1, 0, 2)

    # Sum the last four layers of every token.
    return [torch.sum(layers[-4:], dim=0).numpy() for layers in per_token]




The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [None]:
import datetime

def is_date(date_text):
    """Return 1 if ``date_text`` parses as a dash-separated date, else 0.

    The formats are tried in this order: ``%Y-%m-%d``, ``%d-%m-%Y``, ``%m-%d-%Y``.
    Only ``ValueError`` from ``strptime`` is treated as "does not match".
    """
    for fmt in ('%Y-%m-%d', '%d-%m-%Y', '%m-%d-%Y'):
        try:
            datetime.datetime.strptime(date_text, fmt)
        except ValueError:
            # Not this layout; try the next one.
            continue
        return 1
    return 0

def word2features(sent, i, embedings):
    """Build the CRF feature dict for the ``i``-th token of ``sent``.

    Parameters
    ----------
    sent : sequence of ``(word, tag)`` pairs.
    i : index of the token to describe.
    embedings : per-token BERT vectors; currently unused (the code that would add
        them is disabled below) and kept so that callers need not change.

    Returns
    -------
    dict with features of the token itself (lower-cased form, 3- and 2-character
    suffixes, upper/title/digit flags), the same casing features for the previous
    and next token when they exist, and ``'BOS'`` / ``'EOS'`` markers at the
    sentence boundaries.

    Surrounding whitespace is stripped from every word before the features are
    computed: a token stored with a trailing space would otherwise make
    ``word.isdigit()`` always False and put the space inside the suffix features.
    """
    word = sent[i][0].strip()

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    # Disabled: add the BERT vector components as features. The leading [CLS] token
    # shifts the embeddings list by one relative to the word list, hence the i+1.
    #for j in range(0,len(embedings)):
       #features['bert_v_'+str(j)] = embedings[i+1][j]

    if i > 0:
        # Context: previous token.
        word1 = sent[i-1][0].strip()
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        # First token of the sentence.
        features['BOS'] = True

    if i < len(sent)-1:
        # Context: next token.
        word1 = sent[i+1][0].strip()
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        # Last token of the sentence.
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return the list of feature dicts for ``sent``, one per token.

    The BERT vectors of the sentence are computed first and handed to
    ``word2features`` for every token position.
    """
    embedings = get_word_embedings(sent)
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position, embedings))
    return per_token

def sent2labels(sent):
    """Return the tags of ``sent`` (a sequence of ``(token, label)`` pairs), in order."""
    labels = []
    for _token, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the tokens of ``sent`` (a sequence of ``(token, label)`` pairs), in order."""
    tokens = []
    for token, _label in sent:
        tokens.append(token)
    return tokens



The above code was taken from the official sklearn-crfsuite site.

Split train and test sets.

In [None]:
# Sequence-level dataset for the CRF: one list of feature dicts and one list of tags
# per sentence. This rebinds X and y, replacing the token-level matrix/labels built
# earlier in the notebook.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [None]:
# Sentence-level split: whole sentences go to either train or test (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
# CRF trained with L-BFGS. Per the sklearn-crfsuite documentation, c1 and c2 are the
# L1 and L2 regularisation coefficients, and all_possible_transitions also generates
# transition features for label pairs that do not occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [None]:
# Predict a tag sequence for every test sentence, then compute the weighted F1 over
# the flattened tokens, restricted to the labels in new_classes.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.9512914351683573

In [None]:
# Per-label precision / recall / F1 on the flattened test tokens.
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

       B-Add       0.90      0.91      0.90       930
         B-C       0.90      0.86      0.88       930
         B-N       0.89      0.88      0.89       930
         B-T       0.97      0.96      0.96       930
       I-Add       0.99      0.99      0.99      6453
         I-C       0.92      0.90      0.91      1170
         I-N       0.90      0.94      0.92      1153

    accuracy                           0.95     12496
   macro avg       0.92      0.92      0.92     12496
weighted avg       0.95      0.95      0.95     12496





In [None]:

# Dump the test predictions to Drive: one "sentence|word|gold|pred" line per token,
# with blank lines separating consecutive sentences.
with open('/content/drive/MyDrive/Colab Notebooks/resultats_CRF_BERT_francais.txt', 'w') as f:
    f.write("sentence|    word                   | y        | pred\n\n\n\n")
    for i, sent_pred in enumerate(y_pred):
        # The word is recovered from the lower-cased word feature of the same token.
        f.writelines(
            "|".join((str(i), X_test[i][j]["word.lower()"], y_test[i][j], pred_tag)) + "\n"
            for j, pred_tag in enumerate(sent_pred)
        )
        f.write("\n\n\n\n")

In [None]:
# Inspect the gold tag sequences of the test sentences.
y_test

In [None]:
import ast

# Each line of the two text files is parsed as a Python literal with
# ast.literal_eval (no arbitrary code is evaluated).

# Gold labels.
with open('/content/drive/MyDrive/Colab Notebooks/test_labels.txt') as f:
    lines = list(f)
test_labels_list_file = list(map(ast.literal_eval, lines))

# Predicted labels.
with open('/content/drive/MyDrive/Colab Notebooks/pred_labels.txt') as f:
    lines = list(f)
test_preds_file = list(map(ast.literal_eval, lines))

In [None]:
# Position -> tag lookup, including a 'PAD' entry. Presumably the order matches the
# integer ids stored in the label files loaded above — TODO confirm.
tag2idx_list = ['I-C', 'I-Add', 'B-C', 'B-Add', 'B-T', 'B-N', 'I-N', 'PAD']

In [None]:
# Inspect the predictions parsed from pred_labels.txt. The notebook only defines
# test_preds_file; the bare name test_preds raises NameError on a fresh kernel.
test_preds_file

In [None]:
# Disabled: map the integer ids of the loaded labels back to tag strings.
#test_labels_list = [[tag2idx_list[e] for e in arr] for arr in test_labels_list_file]
# Number of lines parsed from test_labels.txt.
len(test_labels_list_file)

1