In [58]:
from nltk.tag import StanfordNERTagger
# from nltk.tag import corenlp
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import pandas as pd

In [59]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Download and import the Stanford NER 4 classes model.

In [60]:
st = StanfordNERTagger('/content/drive/MyDrive/Colab Notebooks/Capstone_govt_of_canada/StanfordNER/english.conll.4class.distsim.crf.ser.gz',
					   '/content/drive/MyDrive/Colab Notebooks/Capstone_govt_of_canada/StanfordNER/stanford-ner-4.2.0.jar',
					   encoding='utf-8')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [61]:
NP_POS = {"DT", "NN", "JJ", "PR"}  # these are the first two letters of the POS that you should consider potential parts of nouns 
NP_HEAD_POS = {"NN", "PR"}  # each chunk must have at least one of these

def get_chunks(sentence):
    '''Extracts noun phrases from a sentence corresponding to the part-of-speech tags in optional_POS,
    requiring at least one of the POS tags in required_POS. Returns the chunks as a list of strings'''

    chunks = []
    tagged = pos_tag(word_tokenize(sentence))
    start = -1
    seen_required = False
    for i in range(len(tagged)):
        if tagged[i][1][:2] in NP_POS:
            if start == -1:
                start = i
            if tagged[i][1][:2] in NP_HEAD_POS:
                seen_required = True
        else:
            if start != -1:
                if seen_required:
                    chunks.append(" ".join([pair[0] for pair in tagged[start:i]]))
                start = -1
                seen_required = False
    if start != -1:
        if seen_required:
            chunks.append(" ".join([pair[0] for pair in tagged[start:]]))

    return chunks

In [62]:
NER_CLASSES = {"ORGANIZATION", "PERSON", "LOCATION", "MISC"}
def get_ner_chunks(text):
  '''Extracts name entities from a sentence corresponding to the NER tags in Stanford 4 classies,
  return the chunks as a list of tuples'''
  chunks = []
  tokenized_text = word_tokenize(text)
  classified_text = st.tag(tokenized_text)
  i = 0
  # print(classified_text)
  while i < len(classified_text):
    # print(i)
    if classified_text[i][1] in NER_CLASSES:
      chunk = []
      # print(classified_text[i][1])
      for j in range(i, len(classified_text)):
        if classified_text[i][1] == classified_text[j][1]:
          chunk.append(classified_text[j][0])
          # print(chunk)
        else:
          chunks.append((" ".join(chunk), classified_text[i][1]))
          i=j
          break
    else:
      i+=1
  return chunks

In [63]:
data_path = "/content/drive/MyDrive/Colab Notebooks/Capstone_govt_of_canada/data/Page feedback-Vaccine pages-May17.csv"
df = pd.read_csv(data_path, encoding="utf-8")
df = df[df['Tags confirmed']=='checked'][df['Lang'] == 'EN']
train_df, dev_df = train_test_split(df,  test_size=0.2, random_state=11)
# train_df

In [64]:
text_lst = train_df.Comment.values
chunks_lst = []
ner_lst = []
for text in text_lst:
  chunks_lst.append(" ".join(get_chunks(text)))
  # ner_lst.append(get_ner_chunks(text))


In [65]:
print(chunks_lst[1])

Which type vaccine Canada


In [66]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score
from collections import defaultdict
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

In [67]:
ner_vectorizer = CountVectorizer(ngram_range=(1,1),min_df=2)

In [68]:
NER_vec = ner_vectorizer.fit_transform(chunks_lst)

In [69]:
NER_vec.shape

(20660, 3507)

In [70]:
vectorizer = CountVectorizer(ngram_range=(1,2),min_df=2)
train_texts = train_df.Comment.values

X_train = vectorizer.fit_transform(train_texts)

In [49]:
from scipy.sparse import coo_matrix, hstack
X_train = hstack([X_train, NER_vec])

X_train.shape

(20660, 29718)

In [71]:
def prepare_for_classification(train,dev,max_n=2):
  '''convert lists of reviews train and dev to spare feature matrices X_train and X_test,
  and lists of polarity classifications train_class and dev_class'''
  vectorizer = CountVectorizer(ngram_range=(1,max_n),min_df=2)
  ner_vectorizer = CountVectorizer(ngram_range=(1,1),min_df=2)
  ner_train = ner_vectorizer.fit_transform(chunks_lst)
  train_texts = train.Comment.values
  train_class = train.Tags.values
  dev_texts = dev.Comment.values
  dev_class = dev.Tags.values
  X_train = vectorizer.fit_transform(train_texts)
  X_train = hstack([X_train, ner_train])
  X_dev = vectorizer.transform(dev_texts)
  ner_dev = ner_vectorizer.transform(dev_texts)
  X_dev = hstack([X_dev, ner_dev])
  return X_train,train_class, X_dev,dev_class

In [72]:
def evluate(train, test, n = 2):
    """Calculate the the kendalltau score from given train test data set, and n grams"""
    
    X_train,train_class, X_test,test_class = prepare_for_classification(train,test,max_n=n)
    clf = LinearSVC()
    clf.fit(X_train,train_class)
    
    fscore = f1_score(test_class, clf.predict(X_test), average='macro')
    acc = accuracy_score(test_class, clf.predict(X_test))
    
    print(f"{n}-gram: The shape of training set is {X_train.shape}, the fscore is {fscore}, the accuracy is {acc}")

    return fscore
    
 
print("Vaccine:")
for n in range(1, 6):
    evluate(train_df,dev_df, n)

Vaccine:




1-gram: The shape of training set is (20660, 8462), the fscore is 0.6199512593917744, the accuracy is 0.7473378509196515
2-gram: The shape of training set is (20660, 29718), the fscore is 0.6617908517897326, the accuracy is 0.7864472410454986
3-gram: The shape of training set is (20660, 49829), the fscore is 0.6664901825064218, the accuracy is 0.7930300096805422
4-gram: The shape of training set is (20660, 61759), the fscore is 0.670085611169692, the accuracy is 0.7961277831558567
5-gram: The shape of training set is (20660, 67761), the fscore is 0.6702186034348959, the accuracy is 0.7955469506292352
