<a href="https://colab.research.google.com/github/jnlinao/NLP/blob/main/Wk5_Features_and_Errors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import packages
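Make sure you have installed ***eli5***, ***tabulate***, ***spacy*** (plus its `en_core_web_sm` model: `python -m spacy download en_core_web_sm`), ***sklearn*** (the `scikit-learn` package), ***matplotlib*** and ***numpy*** if you are running this notebook on your local machine.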

In [None]:
!pip install eli5
!pip install tabulate
!pip install spacy



In [None]:
import eli5
import tabulate
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import spacy
from sklearn import datasets
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Fun with Spacy
NER/Part of speech tagging

In [None]:
from spacy import displacy

# Load the small English pipeline and annotate one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, word shape, and the is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

# jupyter=True makes displaCy display the visualisation itself. Without it,
# when the notebook front end is not auto-detected (e.g. on Colab), render()
# just returns the HTML markup as a string: the first call's result is then
# silently dropped and the second is echoed as raw text.
displacy.render(doc, style="dep", jupyter=True)
displacy.render(doc, style="ent", jupyter=True)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Apple\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is looking at buying \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    U.K.\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">GPE</span>\n</mark>\n startup for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $1 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35e

In [None]:
# Named-entity recognition on a longer news sentence.
doc = nlp("Thuan Pham, hired as Uber’s chief technology officer by former CEO Travis Kalanick back in 2013, is leaving the company in three weeks, the ride-share giant revealed today in an SEC filing that came out just as The Information reported that massive layoffs at Uber are being proposed to preserve some of the company’s dwindling capital reserves.")
# jupyter=True renders the highlighted entities in the notebook instead of
# returning the HTML markup as a plain string.
displacy.render(doc, style="ent", jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Thuan Pham\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n, hired as \n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Uber\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n’s chief technology officer by former CEO \n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Travis Kalanick\n    <span style="font-size: 0.8em; font-weight: bold; li

# Feature Engineering

## Prepare dataset and Pick two classes
Your two classes should be similar, but opposite in some sense

In [None]:
# Binary task: exactly one `categories` assignment should be active. The
# commented-out assignments are alternative class pairs, and the comment lines
# after them list every available 20 Newsgroups label.
# categories = ['alt.atheism', 'soc.religion.christian']
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
# categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
# remove=(...) strips headers, footers and quoted replies from every post, so
# only the body text is left for feature extraction.
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'),)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'),)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

train data size: 1168
test data size: 777


## Design your own features
This is also warm-up for HW2 :)

In [None]:
class CustomFeats(BaseEstimator, TransformerMixin):
    """Turn raw documents into dictionaries of hand-written features.

    ``transform`` returns one ``{feature_name: value}`` dict per document, so
    it must be followed by a ``DictVectorizer`` before the output can be fed
    to a classifier.
    """

    def __init__(self):
        # Names of every feature emitted so far by transform().
        self.feat_names = set()

    def fit(self, x, y=None):
        """Nothing is learned; returns ``self`` to satisfy the transformer API."""
        return self

    @staticmethod
    def features(review):
        """Compute the feature dict for a single document.

        The helper functions are looked up when this method is called, so they
        only have to be defined before the first ``transform`` call.
        """
        return {
            'bias': 4.0,  # constant feature, identical for every document
            'RAM': test_binary_feature(review),
            'mac': mac_binary_feature(review),
            'apple': apple_binary_feature(review),
            'IBM': ibm_feature(review),
        }

    def get_feature_names(self):
        """Return the feature names seen so far, in sorted order.

        A set iterates in arbitrary order, so the names are sorted to make the
        result deterministic. A default ``DictVectorizer`` also sorts its
        feature names, so the sorted list lines up with its columns when this
        object is passed as ``vec`` to eli5.
        """
        return sorted(self.feat_names)

    def transform(self, reviews):
        """Return a list with one feature dict per document in ``reviews``."""
        rows = []
        for review in reviews:
            row = self.features(review)
            # Record the keys so get_feature_names() can report them later.
            self.feat_names.update(row)
            rows.append(row)
        return rows


# This pipeline outputs the raw feature dicts. Use the DictVectorizer variant
# to obtain a numeric matrix, or the FeatureUnion variant to combine the custom
# features with bag-of-words counts.
feats = make_pipeline(CustomFeats())
#feats = make_pipeline(CustomFeats(), DictVectorizer())
#feats = FeatureUnion([
#     ('custom', make_pipeline(CustomFeats(), DictVectorizer())),
#     ('bag_of_words', CountVectorizer())
# ])

In [None]:
def test_binary_feature(review):
  target_word = 'RAM'
  threshold = 0
  words = filter(lambda r: r.find(target_word) is not -1, review.split(' '))
  count = len(list(words))
  return count > threshold

def mac_binary_feature(review, threshold=0):
    """Return 1 when more than ``threshold`` tokens of ``review`` contain 'mac', else 0.

    Tokens are obtained by splitting on single spaces; the match is a
    case-sensitive substring test, so 'machine' counts but 'Mac' does not.

    review: document text (str)
    threshold: minimum count that must be exceeded (default 0)
    """
    target_word = 'mac'
    # `in` replaces `find(...) is not -1`, which compared an int by identity.
    count = sum(target_word in token for token in review.split(' '))
    return 1 if count > threshold else 0

def ibm_feature(review):
    """Return how many space-separated tokens of ``review`` contain 'IBM'.

    The match is a case-sensitive substring test, and a token is counted once
    even if 'IBM' occurs in it several times.

    review: document text (str)
    """
    target_word = 'IBM'
    # `in` replaces `find(...) is not -1`, which compared an int by identity.
    return sum(target_word in token for token in review.split(' '))

def apple_binary_feature(review, threshold=0):
    """Return 1 when more than ``threshold`` tokens of ``review`` contain 'apple', else 0.

    Tokens are obtained by splitting on single spaces; the match is a
    case-sensitive substring test, so 'pineapples' counts but 'Apple' does not.

    review: document text (str)
    threshold: minimum count that must be exceeded (default 0)
    """
    target_word = 'apple'
    # `in` replaces `find(...) is not -1`, which compared an int by identity.
    count = sum(target_word in token for token in review.split(' '))
    return 1 if count > threshold else 0

def show_table(train, Ω):
    """Print a 2x2 table of a boolean document test against the class label.

    train: dataset object exposing ``data``, ``target`` and ``target_names``
    Ω: callable applied to each document; a truthy result goes to the 'True'
       row, a falsy one to the 'False' row

    Columns are the first two entries of ``train.target_names``.
    """
    counts = np.zeros((2, 2))
    for doc_idx, document in enumerate(train.data):
        # Row 0 holds documents for which the test fires, row 1 the rest.
        row = 1
        if Ω(document):
            row = 0
        counts[row][train.target[doc_idx]] += 1
    header = ['', train.target_names[0], train.target_names[1]]
    body = [
        ['True', counts[0][0], counts[0][1]],
        ['False', counts[1][0], counts[1][1]],
    ]
    print(tabulate.tabulate(body, headers=header))


# How often does the substring 'apple' occur in each class? `in` replaces
# `find(...) is not -1`, which compared an int by identity.
show_table(train, lambda r: 'apple' in r)

         comp.sys.ibm.pc.hardware    comp.sys.mac.hardware
-----  --------------------------  -----------------------
True                            2                       24
False                         588                      554


In [None]:
# Run the feature pipeline on the training documents and keep its output on
# the dataset object as `custvector`.
train.custvector = feats.fit_transform(train.data)

In [None]:
# Inspect the features extracted for the second training document.
train.custvector[1]

{'IBM': 0, 'RAM': False, 'apple': 0, 'bias': 4.0, 'mac': 0}

## Number of Features
(#samples, #features)  

In [None]:
# A classifier needs a numeric matrix with a `.shape`, not a list of dicts, so
# rebuild the pipeline with a DictVectorizer after the custom feature extractor.
feats = make_pipeline(CustomFeats(), DictVectorizer())
# Fit on the training split only; the test split is transformed with the
# feature-to-column mapping learned from the training data.
train.vecs = feats.fit_transform(train.data)
test.vecs = feats.transform(test.data)
train.vecs.shape, test.vecs.shape

In [None]:
# Ask the first pipeline step (the CustomFeats instance) which feature names
# it has emitted.
feats.steps[0][1].get_feature_names()

In [None]:
# Show the feature vector of the second training document.
print(train.vecs[1])

What if we add
- a number-based feature with a threshold
- a number-based feature  

to the ***features*** function?

In [None]:
# Logistic regression on the custom feature vectors (C is the inverse
# regularization strength).
lr_model = LogisticRegression(C=1)
lr_model.fit(train.vecs,train.target)
# Alternative: skip the custom features and train directly on bag-of-words counts.
#lr_model = make_pipeline(CountVectorizer(), LogisticRegression())
#lr_model.fit(train.data, train.target)

# Micro-averaged F1 on both splits; train_preds / test_preds stay available as
# notebook globals for the cells below.
train_preds = lr_model.predict(train.vecs)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = lr_model.predict(test.vecs)
test_f1 = f1_score(test.target, test_preds, average='micro')
print(train_f1, test_f1)

In [None]:
# Display the 10 largest learned weights; the feature names come from the
# first pipeline step passed as `vec`.
eli5.show_weights(lr_model, top=10, vec=feats.steps[0][1], target_names=test.target_names)

## False negative and positive examples

In [None]:
def show_false_negative(test_preds, test):
    """Return the text of one randomly chosen false negative, or None.

    A false negative is an index where the prediction is 0 but differs from
    ``test.target``.

    test_preds: predicted labels, indexable by position
    test: dataset object exposing ``target`` and ``data``
    """
    candidates = [
        idx for idx in range(len(test_preds))
        if test_preds[idx] != test.target[idx] and test_preds[idx] == 0
    ]
    if not candidates:
        return None
    pick = np.random.randint(len(candidates))
    return test.data[candidates[pick]]

def show_false_positive(test_preds, test, size=2):
    """Return the text of one randomly chosen false positive, or None.

    A false positive is an index where the prediction is 1 but differs from
    ``test.target``.

    test_preds: predicted labels, indexable by position
    test: dataset object exposing ``target`` and ``data``
    size: accepted but not used by this function
    """
    candidates = [
        idx for idx in range(len(test_preds))
        if test_preds[idx] != test.target[idx] and test_preds[idx] == 1
    ]
    if not candidates:
        return None
    pick = np.random.randint(len(candidates))
    return test.data[candidates[pick]]

In [None]:
# Sample one misclassified test document whose predicted label is 0
# (re-run the cell for another one).
show_false_negative(test_preds, test)

In [None]:
# Sample one misclassified test document whose predicted label is 1
# (re-run the cell for another one).
show_false_positive(test_preds, test)

# Error Analysis

In [None]:
# Reload the same two categories, this time without `remove=`, so the posts
# are no longer stripped of headers, footers and quoted replies.
# This rebinds the notebook globals `train` and `test`.
# categories = ['alt.atheism', 'soc.religion.christian']
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
# categories = ['rec.sport.baseball', 'rec.sport.hockey']
# 'alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware',
# 'comp.sys.mac.hardware','comp.windows.x', 'misc.forsale', 'rec.autos',  
# 'rec.motorcycles',  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
# 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns',
# 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
train = sklearn.datasets.fetch_20newsgroups(subset='train', categories=categories)
test = sklearn.datasets.fetch_20newsgroups(subset='test', categories=categories)
print('train data size:', len(train.data))
print('test data size:', len(test.data))

In [None]:
# Bag-of-words counts feeding a logistic regression, chained in one pipeline
# so that raw text can be passed straight to fit/predict.
lr_model = LogisticRegression(C=0.1)
vec = CountVectorizer()
pipe = make_pipeline(vec, lr_model)
pipe.fit(train.data, train.target)
# Micro-averaged F1 on the training and test splits.
train_preds = pipe.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = pipe.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1  

In [None]:
# Display the 10 largest weights of the fitted bag-of-words pipeline.
eli5.show_weights(pipe, top=10, target_names=test.target_names)

In [None]:
# Explain the logistic regression's prediction for one test document.
idx = 10
x = test.data[idx]
#print(test.data[idx])
# Print the true class name first so it can be compared with the explanation.
print(test.target_names[test.target[idx]])
eli5.show_prediction(lr_model, test.data[idx], vec=vec, target_names=test.target_names)

In [None]:
# Same bag-of-words pipeline as for logistic regression, with a random forest.
# random_state pins the forest's internal randomness so the scores and the
# explanations in the cells below are reproducible across runs.
rf_model = RandomForestClassifier(random_state=0)
vec = CountVectorizer()
pipe = make_pipeline(vec, rf_model)
pipe.fit(train.data, train.target)
# Micro-averaged F1 on the training and test splits.
train_preds = pipe.predict(train.data)
train_f1 = f1_score(train.target, train_preds, average='micro')
test_preds = pipe.predict(test.data)
test_f1 = f1_score(test.target, test_preds, average='micro')
train_f1, test_f1

In [None]:
# Display the top 10 features of the fitted random forest pipeline.
eli5.show_weights(pipe, top=10, target_names=test.target_names)

In [None]:
# Explain the random forest's prediction for one test document.
idx = 1
x = test.data[idx]
# Print the true class name first so it can be compared with the explanation.
print(test.target_names[test.target[idx]])
eli5.show_prediction(rf_model, test.data[idx], vec=vec, target_names=test.target_names, top=10)