Notes:
- Skip polyglot

# NLP pipeline
Objective: create a reusable NLP pipeline

Other things to check
- n-grams - i.e. word pairs for 2-grams


# Example: Amazon product reviews
Data source:
- https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M
- amazon_review_full_csv.tar.gz

Data overview:
- Amazon reviews full score dataset is constructed by randomly taking 600,000 training samples and 130,000 testing samples for each review score from 1 to 5. 
- In total there are 3,000,000 training samples and 650,000 testing samples.
- The files train.csv and test.csv contain all the training samples as comma-separated values. 
- There are 3 columns in them, corresponding to class index (1 to 5), review title and review text. 
- The review title and text are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes (""). New lines are escaped by a backslash followed with an "n" character, that is "\n".

# Import Packages

In [7]:
import os
import re
from collections import Counter

import nltk
from nltk.tokenize import (word_tokenize, sent_tokenize,
                           regexp_tokenize)
# for tweets
# from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
import spacy

from matplotlib import pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfVectorizer)
# NB works well with CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Optional: show a figure stored on disk inside the notebook.
from IPython.display import Image, SVG

# Building the SVG object as the cell's last expression lets the
# notebook render it as the cell output.
svg_path = 'Images/nlp_linelength.svg'
SVG(filename=svg_path)

# Load files

In [8]:
# Directory that holds the extracted Amazon review archive, plus the names
# of the two CSV files inside it.
# Keep the directory string free of invisible formatting characters (e.g.
# U+2069, which can sneak in when a path is copied from a file browser):
# they look like nothing on screen but make the path impossible to resolve.
path = 'Documents/GitHub/portfolio/support files/amazon_review_full_csv'
trainfile = 'train.csv'
testfile = 'test.csv'


# Text preprocessing
Steps
- remove stop words
- remove non-alpha characters
- lemmatize
- perform bag-of-words

In [None]:
# NOTE(review): lower_tokens and english_stops are not created in this cell;
# presumably an earlier cell tokenizes/lower-cases the text and loads the
# English stop-word list - confirm before running.

# Keep only the purely alphabetic tokens: alpha_only
alpha_only = [tok for tok in lower_tokens if tok.isalpha()]

# Drop every token that is a stop word: no_stops
no_stops = [tok for tok in alpha_only if not (tok in english_stops)]

# Lemmatizer used to reduce the remaining tokens to their lemmas
wordnet_lemmatizer = WordNetLemmatizer()

# Apply the lemmatizer to each remaining token: lemmatized
lemmatized = list(map(wordnet_lemmatizer.lemmatize, no_stops))

# Bag-of-words, i.e. lemma -> number of occurrences: bow
bow = Counter()
bow.update(lemmatized)

# Show the ten most frequent lemmas together with their counts
top_n = 10
print(bow.most_common(top_n))

# gensim
Advantages
- uses top academic models to perform complex tasks
    - building document or word vectors
    - performing topic identification and document comparison
- LDA used for topic analysis and modeling
- corpus/corpora = set of texts used to perform NLP tasks
- gensim models can be easily saved, updated, and reused
- dictionary can also be updated
    - with new texts
    - words that meet certain thresholds
    - then use for feature exercises

In [None]:


# NOTE(review): corpus, doc and dictionary are not created in this cell;
# presumably a gensim Dictionary and its bag-of-words output from an
# earlier cell - confirm before running.

# Build the tf-idf model from the bag-of-words corpus
tfidf = TfidfModel(corpus)

# The model is indexed with a bag-of-words document and
# gives back [(token_id, token_weights)] for it
tfidf[corpus[1]]

# tf-idf weights of the document under inspection: tfidf_weights
tfidf_weights = tfidf[doc]

# Peek at the first five (token_id, weight) pairs
print(tfidf_weights[:5])

# Order the pairs by weight, heaviest first: sorted_tfidf_weights
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key=lambda pair: pair[1], reverse=True)

# Map the five heaviest token ids back to their words and print them
top_five = sorted_tfidf_weights[:5]
for term_id, weight in top_five:
    print(dictionary.get(term_id), weight)
    

# Named-entity recognition (NER)
- Who? What? When? Where?


# spaCy
Advantages using spaCy for NER
- focus on creating NLP pipelines to generate model and corpora
- informal language corpora
    - easily find entities in Tweets and chat messages

- NLP library similar to gensim, with different implementations
- additional NER compared to nltk
    - NORP, CARDINAL, MONEY, WORK_OF_ART, LANGUAGE, EVENT
- displaCy
    - entity recognition visualization tool to view parse trees
    - which uses Node.js to create interactive text


In [None]:
# Instantiate the English model once: nlp
# Tagger, parser and matcher are switched off to improve execution time;
# only the named entities (doc.ents) are used below.
# NOTE(review): the 'en' shortcut and these keyword flags look like the
# early spaCy API; newer releases expect a full model name and a
# `disable=[...]` list - confirm against the installed version.
nlp = spacy.load('en', tagger=False, parser=False,
                 matcher=False)

# load new document
doc = nlp("""Berlin is the capital of Germany;
and the residence of Chancellor Angela Merkel.""")

# named entities are stored in .ents
print(doc.ents)

# check out each label (.label_) using indexing
print(doc.ents[0], doc.ents[0].label_)

# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)

# ML pipeline

In [None]:
# Load the data set into a DataFrame
# NOTE(review): the Ellipsis is a placeholder - the actual load still has
# to be filled in before the lines below can run.
df = ...
# Target label: the 'Sci-Fi' column
y = df['Sci-Fi']
# Features: the 'plot' column; 33% of the rows are held out for testing
# and the split is made reproducible with a fixed random_state
plots = df['plot']
X_train, X_test, y_train, y_test = train_test_split(
    plots, y, test_size=0.33, random_state=53)


## 1. countvectorizer

In [None]:
# CountVectorizer - bag-of-words

# create a count vectorizer and remove stop_words
count_vectorizer = CountVectorizer(stop_words='english')
# create bag-of-words vectors on train/test sets
# fit_transform will create a bag-of-words dictionary
#  and vectors for each document; the test set is only transformed,
#  so it is encoded with the vocabulary learned from the training set
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)
# note: if you have unknown words in test set only, may
#  need more data or remove those words from the test dataset

# Print the first 10 features of the count_vectorizer
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_features = count_vectorizer.get_feature_names_out().tolist()
else:
    count_features = count_vectorizer.get_feature_names()
print(count_features[:10])

## 2. tf-idf vectors

In [None]:
# tfidf vectors for documents
# Initialize a TfidfVectorizer object: tfidf_vectorizer
# English stop words are removed; max_df=0.7 additionally drops terms
# that occur in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                   max_df=0.7)

# Fit on and transform the training data: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# Transform the test data with the fitted vectorizer: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# Print the first 10 features
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_features = tfidf_vectorizer.get_feature_names()
print(tfidf_features[:10])

# Print the first 5 vectors of the tfidf training data
# Slice the sparse matrix first so that only these five rows are
# converted to a dense array instead of the whole training matrix
print(tfidf_train[:5].toarray())

In [None]:
# inspect vectors in pandas dataframe

def _feature_names(vectorizer):
    """Return the fitted vectorizer's vocabulary terms as a plain list.

    get_feature_names() was removed in scikit-learn 1.2 in favour of
    get_feature_names_out(); use whichever the installed version provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


# Create the CountVectorizer DataFrame: count_df
# toarray() converts the sparse matrix to a dense array (the explicit
# spelling of the .A shorthand); one column per vocabulary term
count_df = pd.DataFrame(count_train.toarray(),
                        columns=_feature_names(count_vectorizer))

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.toarray(),
                        columns=_feature_names(tfidf_vectorizer))

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
# i.e. the terms present in count_df but absent from tfidf_df
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))


# Tweaking alpha
- try GridSearchCV?

In [None]:
# Candidate alpha values, from 0 up to (but not including) 1
# in steps of 0.1: alphas
alphas = np.arange(start=0, stop=1, step=0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    """Train a MultinomialNB with the given alpha and return its accuracy.

    The classifier is fitted on tfidf_train / y_train and evaluated on
    tfidf_test / y_test, all of which are read from the surrounding
    notebook scope.
    """
    # Classifier configured with the requested alpha
    model = MultinomialNB(alpha=alpha)
    # Learn from the training vectors and labels
    model.fit(tfidf_train, y_train)
    # Labels predicted for the held-out vectors
    predicted = model.predict(tfidf_test)
    # Accuracy of the predictions against the true test labels
    return metrics.accuracy_score(y_test, predicted)

# Try every candidate alpha and report the accuracy it achieves
for alpha in alphas:
    print('Alpha: ', alpha)
    accuracy = train_and_predict(alpha)
    print('Score: ', accuracy)
    # blank line between consecutive results
    print()

# Inspect model

In [None]:
# Fit the classifier to inspect: nb_classifier
# train_and_predict() keeps its classifier local, so a model has to be
# fitted here before its attributes can be read.
# NOTE(review): this uses the default alpha; pass the alpha that scored
# best in the search above if a different value is preferred.
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train, y_train)

# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Per-feature weights. MultinomialNB.coef_ was removed in scikit-learn 1.1;
# coef_[0] was feature_log_prob_[1] for a two-class problem and
# feature_log_prob_[0] otherwise, so read that row directly.
weight_row = 1 if len(class_labels) == 2 else 0
feature_weights = nb_classifier.feature_log_prob_[weight_row]

# Zip the feature names together with the weights and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(feature_weights, feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


# Other considerations
Sentiment Analysis
- complex problems such as sarcasm
- difficulty with negation
    - e.g. "I liked it, but it could have been better"
- separate communities may use the same words differently
Language biases
- prejudices in text