<a href="https://colab.research.google.com/github/heriswn/LatihanDTS/blob/master/0908_heri002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PreProsessing
# Feature Extraction

In [55]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [56]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [0]:
def features(sentence, index):
  """ sentence: [w1, w2, ...], index: the index of the word """
  return {
      'word': sentence[index],
      'is_first': index == 0,
      'is_last': index==len(sentence) - 1,
      'is_capitalized': sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'prefix-1': sentence[index][0],
      'prefix-2': sentence[index][:2],
      'prefix-3': sentence[index][:3],
      'suffix-1': sentence[index][-1],
      'suffix-2': sentence[index][-2:],
      'suffix-3': sentence[index][-3:],
      'prev_word': '' if index == 0 else sentence[index -1],
      'prev_2word': '' if (index == 0) or (index == 1) else sentence[index -2],
      'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
      'next_2word': '' if index >= len(sentence) - 2 else sentence[index +2],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
  }

In [0]:
from nltk.tag.util import untag

cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

In [0]:
def transform_to_dataset(tagged_sentences):
  X, y = [], []
  
  for tagged in tagged_sentences:
    X.append([features(untag(tagged), index) for index in range(len(tagged))])
    y.append([tag for _, tag in tagged])
    
  return X, y

In [0]:
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

In [99]:
print(len(X_train))
print(len(X_test))
print(X_train[0])
print(y_train[0])

2935
979
[{'word': 'Pierre', 'is_first': True, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'P', 'prefix-2': 'Pi', 'prefix-3': 'Pie', 'suffix-1': 'e', 'suffix-2': 're', 'suffix-3': 'rre', 'prev_word': '', 'prev_2word': '', 'next_word': 'Vinken', 'next_2word': ',', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'Vinken', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'V', 'prefix-2': 'Vi', 'prefix-3': 'Vin', 'suffix-1': 'n', 'suffix-2': 'en', 'suffix-3': 'ken', 'prev_word': 'Pierre', 'prev_2word': '', 'next_word': ',', 'next_2word': '61', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': ',', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': True, 'is_all_lower': True, 'prefix-1': ',', 'prefix-2': ',', 'prefix-3': ',', 'suffix-1': ',', 'suffix-2': ',', 'suffix-3': ',', 'prev_word'

In [0]:
#!pip install sklearn_crfsuite

In [0]:
from sklearn_crfsuite import CRF

In [0]:
model = CRF()
model.fit(X_train, y_train)
from sklearn_crfsuite import metrics

## Stop disini

In [76]:
y_pred = model.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9602683593122289


### Delete prefix feature

In [85]:
y_pred = model.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9541619797525309


### Add prev_2word and next_2word

In [103]:
y_pred = model.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))

0.9611120038566607


# NER with ML

In [0]:
import pandas as pd
import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter
from sklearn.model_selection import train_test_split

In [123]:
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df = df[:1000000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [124]:
df = df.fillna(method="ffill")
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(45729, 34370, 17)

In [0]:
class SentenceGetter(object):
  
  def __init__(self, data):
    self.n_sent = 1
    self.data = data
    self.empy = False
    agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                      s['POS'].values.tolist(),
                                                      s['Tag'].values.tolist())]
    self.grouped = self.data.groupby('Sentence #').apply(agg_func)
    self.sentences = [s for s in self.grouped]
    
  def get_next(self):
    try:
      s = self.grouped['Sentence: {}'.format(self.n_sent)]
      self.n_sent += 1
      return s
    except:
      return None

getter = SentenceGetter(df)
sentences = getter.sentences

In [0]:
def word2features(sent, i):
  word = sent[i][0]
  postag = sent[i][0]
  
  features = {
      'bias': 1.0,
      'word.lower()': word.lower(),
      'word[-3:]': word[-3:],
      'word[-2:]': word[-2:],
      'word.isupper()': word.isupper(),
      'word.istitle()': word.istitle(),
      'word.isdigit()': word.isdigit(),
      'postag': postag,
      'postag[:2]': postag[:2],
  }
  if i > 0:
    word1 = sent[i-1][0]
    postag1 = sent[i-1][1]
    features.update({
        '-1:word.lower()': word1.lower(),
        '-1:word.istitle()': word1.istitle(),
        '-1:word.isupper()': word1.isupper(),
        '-1:postag': postag1,
        '-1:postag[:2]': postag1[:2],
    })
  else:
    features['BOS'] = True
  if i < len(sent)-1:
    word1 = sent[i+1][0]
    postag1 = sent[i+1][1]
    features.update({
        '-1:word.lower()': word1.lower(),
        '-1:word.istitle()': word1.istitle(),
        '-1:word.isupper()': word1.isupper(),
        '-1:postag': postag1,
        '-1:postag[:2]': postag1[:2],        
    })
  else:
    features['EOS'] = True
  return features

def sent2features(sent):
  return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
  return [label for token, postag, label in sent]

def sent2tokens(sent):
  return [token for token, postag, label in sent]

In [0]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [128]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [129]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-art       0.48      0.13      0.20       126
       B-eve       0.62      0.45      0.52       100
       B-geo       0.84      0.90      0.87     11755
       B-gpe       0.96      0.93      0.94      5062
       B-nat       0.59      0.38      0.46        58
       B-org       0.78      0.70      0.74      6303
       B-per       0.84      0.80      0.82      5341
       B-tim       0.92      0.87      0.89      6329
       I-art       0.23      0.03      0.05       101
       I-eve       0.54      0.34      0.41        86
       I-geo       0.80      0.78      0.79      2356
       I-gpe       0.84      0.47      0.60        68
       I-nat       1.00      0.35      0.52        23
       I-org       0.77      0.79      0.78      5321
       I-per       0.83      0.88      0.86      5351
       I-tim       0.83      0.73      0.78      2031
           O       0.99      0.99      0.99    278297

    accuracy              

In [134]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron

X = df.drop('Tag', axis=1)
v = DictVectorizer()
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
classes = np.unique(y)
classes = classes.tolist()
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train2.shape, y_train2.shape

((670000, 80141), (670000,))

In [136]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train2, y_train2, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1
-- Epoch 1
Norm: 19.95, NNZs: 371, Bias: -0.400000, T: 670000, Avg. loss: 0.000468
Total training time: 0.37 seconds.
Norm: 25.87, NNZs: 627, Bias: -0.370000, T: 670000, Avg. loss: 0.000571
Total training time: 0.40 seconds.
-- Epoch 1
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s


Norm: 58.25, NNZs: 2545, Bias: -0.570000, T: 670000, Avg. loss: 0.002963
Total training time: 0.29 seconds.
Norm: 128.81, NNZs: 13303, Bias: -0.640000, T: 670000, Avg. loss: 0.021088
Total training time: 0.31 seconds.
-- Epoch 1
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.8s


Norm: 16.25, NNZs: 242, Bias: -0.300000, T: 670000, Avg. loss: 0.000317
Total training time: 0.24 seconds.
-- Epoch 1
Norm: 119.58, NNZs: 11971, Bias: -0.640000, T: 670000, Avg. loss: 0.021285
Total training time: 0.33 seconds.
-- Epoch 1
Norm: 103.74, NNZs: 8981, Bias: -0.570000, T: 670000, Avg. loss: 0.012600
Total training time: 0.24 seconds.
-- Epoch 1
Norm: 87.74, NNZs: 6362, Bias: -0.640000, T: 670000, Avg. loss: 0.012379
Total training time: 0.30 seconds.
-- Epoch 1
Norm: 21.47, NNZs: 416, Bias: -0.430000, T: 670000, Avg. loss: 0.000435
Total training time: 0.28 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.6s


Norm: 19.60, NNZs: 333, Bias: -0.340000, T: 670000, Avg. loss: 0.000401
Total training time: 0.36 seconds.
Norm: 73.51, NNZs: 4610, Bias: -0.590000, T: 670000, Avg. loss: 0.006807
Total training time: 0.25 seconds.
-- Epoch 1
-- Epoch 1
Norm: 15.39, NNZs: 217, Bias: -0.270000, T: 670000, Avg. loss: 0.000269
Total training time: 0.27 seconds.
-- Epoch 1
Norm: 9.54, NNZs: 85, Bias: -0.190000, T: 670000, Avg. loss: 0.000084
Total training time: 0.32 seconds.
-- Epoch 1
Norm: 115.93, NNZs: 10715, Bias: -0.650000, T: 670000, Avg. loss: 0.012526
Total training time: 0.27 seconds.
Norm: 110.90, NNZs: 9440, Bias: -0.710000, T: 670000, Avg. loss: 0.016277
Total training time: 0.36 seconds.
-- Epoch 1
-- Epoch 1


[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.5s


Norm: 76.79, NNZs: 4863, Bias: -0.720000, T: 670000, Avg. loss: 0.010360
Total training time: 0.27 seconds.
Norm: 145.70, NNZs: 15506, Bias: 0.600000, T: 670000, Avg. loss: 0.031883
Total training time: 0.29 seconds.


[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:    2.9s finished


Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=5, n_iter_no_change=5, n_jobs=-1,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=10, warm_start=False)