In [1]:
!pip install sklearn_crfsuite



In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [8]:
#Reading the csv file
df = pd.read_csv('Datasets/ptbr_tweets.csv', 
                 sep="\t")

In [9]:
#Display first 10 rows
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence 1,eu,PRON,O
1,Sentence 1,sei,VERB,O
2,Sentence 1,que,SCONJ,O
3,Sentence 1,não,ADV,O
4,Sentence 1,sou,VERB,O
5,Sentence 1,escritor,ADJ,O
6,Sentence 1,e,CCONJ,O
7,Sentence 1,so,VERB,O
8,Sentence 1,uma,DET,O
9,Sentence 1,carta,NOUN,O


In [10]:
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,26875,26868,26875,26875
unique,2308,7958,17,7
top,Sentence 1707,",",PROPN,O
freq,50,730,5342,26143


In [11]:
#Displaying the unique Tags
df['Tag'].unique()

array(['O', 'B-person', 'B-location', 'I-location', 'I-person',
       'B-organization', 'I-organization'], dtype=object)

In [12]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-location,114
1,B-organization,125
2,B-person,291
3,I-location,69
4,I-organization,47
5,I-person,86
6,O,26143


In [13]:
#Checking null values, if any.
df.isnull().sum()

Sentence #    0
Word          7
POS           0
Tag           0
dtype: int64

In [14]:
df = df.fillna(method = 'ffill')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence 1,eu,PRON,O
1,Sentence 1,sei,VERB,O
2,Sentence 1,que,SCONJ,O
3,Sentence 1,não,ADV,O
4,Sentence 1,sou,VERB,O


In [15]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [16]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'eu sei que não sou escritor e so uma carta de amor de alguém que te quer de verdade 🎵 ❤'

In [17]:
sentences = getter.sentences
sentences[0]

[('eu', 'PRON', 'O'),
 ('sei', 'VERB', 'O'),
 ('que', 'SCONJ', 'O'),
 ('não', 'ADV', 'O'),
 ('sou', 'VERB', 'O'),
 ('escritor', 'ADJ', 'O'),
 ('e', 'CCONJ', 'O'),
 ('so', 'VERB', 'O'),
 ('uma', 'DET', 'O'),
 ('carta', 'NOUN', 'O'),
 ('de', 'ADP', 'O'),
 ('amor', 'NOUN', 'O'),
 ('de', 'ADP', 'O'),
 ('alguém', 'PRON', 'O'),
 ('que', 'PRON', 'O'),
 ('te', 'VERB', 'O'),
 ('quer', 'VERB', 'O'),
 ('de', 'ADP', 'O'),
 ('verdade', 'NOUN', 'O'),
 ('🎵', 'PROPN', 'O'),
 ('❤', 'PROPN', 'O')]

In [37]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'postag': postag, # 1) Tag
        'word.lower()': word.lower(), # 2) Word
        'word': word, # 13) cap
        'word.istitle()': word.istitle(), # 14) ini
        'word.isdigit()': word.isdigit(), # 15) digit
        'word.isupper()': word.isupper() # extra
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(), # 3) prevW
            '-1:postag': postag1, # 4) prevT
            '-1:word': word1, # 5) prevCap

        })
    elif i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:word.lower()' : word2.lower(), # 6) prev2W
            '-2:postag': postag2, # 7) prev2T
            '-2:word': word2, # 8) prev2Cap
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(), # 9) nextW
            '+1:postag': postag1, # 10) nextT
            '+1:word': word, # 11) nextCap
        })
    
    elif i < len(sent)-2:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+2:word.lower()': word1.lower(), # 9) nextW
            '+2:postag': postag1, # 10) nextT
            '+2:word': word, # 11) nextCap
        })
    
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]
  

In [38]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [40]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [41]:
#Predicting on the test set.
y_pred = crf.predict(X_test)

In [42]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.9629829240964188


In [43]:
report = flat_classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

    B-location       0.67      0.17      0.28        23
B-organization       0.75      0.14      0.24        21
      B-person       0.62      0.17      0.27        75
    I-location       0.40      0.14      0.21        14
I-organization       1.00      0.07      0.13        14
      I-person       0.60      0.15      0.24        20
             O       0.97      1.00      0.99      5244

     micro avg       0.97      0.97      0.97      5411
     macro avg       0.72      0.26      0.34      5411
  weighted avg       0.96      0.97      0.96      5411

