### Introduction to NLP and Transformers: Building a News Sentiment Classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, Flatten, SimpleRNN
import keras
from keras.models import Sequential
from keras.utils import to_categorical
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

### Keras and Text

In [19]:
#three toy sentences (two English, one German) used to demonstrate the tokenizer
sents = ['This is the first', 'this is the second', 'ich heisse Jacob, und ich wohne im New York']

In [20]:
#instantiate the tokenizer; num_words caps (by frequency) how many words are used
#when texts are converted -- the toy corpus here has only 13 distinct words
tokenizer = Tokenizer(num_words = 20)

In [21]:
#fit on texts: builds the vocabulary (word -> integer id) from the sentences
tokenizer.fit_on_texts(sents)

In [22]:
#create sequences based on fit: each sentence becomes the list of integer ids
#of its words, in order (ids come from tokenizer.word_index)
tokenizer.texts_to_sequences(sents)

[[1, 2, 3, 5], [1, 2, 3, 6], [4, 7, 8, 9, 4, 10, 11, 12, 13]]

In [23]:
#print sentences to compare them with the integer sequences
sents

['This is the first',
 'this is the second',
 'ich heisse Jacob, und ich wohne im New York']

In [24]:
#encode each sentence as a fixed-width 0/1 row with texts_to_matrix: one column per
#word id (width = num_words), 1 if the word occurs in the sentence. A repeated word
#still gives 1, so this is a multi-hot bag-of-words row rather than a strict one-hot
tokenizer.texts_to_matrix(sents)

array([[0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        0., 0., 0., 0.]])

In [25]:
#examine the word_index: the word -> integer id mapping learned by fit_on_texts
#(words are lowercased and punctuation is stripped, e.g. 'Jacob,' -> 'jacob')
tokenizer.word_index

{'this': 1,
 'is': 2,
 'the': 3,
 'ich': 4,
 'first': 5,
 'second': 6,
 'heisse': 7,
 'jacob': 8,
 'und': 9,
 'wohne': 10,
 'im': 11,
 'new': 12,
 'york': 13}

### A Basic Neural Network Approach

In [30]:
#load the headlines; header = None treats the first row as data (column names are
#assigned right after loading) and latin-1 is the encoding used to decode the file
news = pd.read_csv('data/all-data.csv',  encoding = 'latin-1', header = None)

In [31]:
#name the two columns: the sentiment label first, then the headline text
news.columns = ['sentiment', 'headline']

In [32]:
#peek at the first five rows
news.head()

Unnamed: 0,sentiment,headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [33]:
#fresh tokenizer for the headlines (rebinds the name used in the toy example above);
#num_words = 1000 caps the vocabulary by frequency and sets the matrix width used later
tokenizer = Tokenizer(num_words = 1000)

In [34]:
#build the vocabulary from the headlines; pass the numpy array as input
tokenizer.fit_on_texts(news['headline'].values)

In [35]:
#document-term matrix: one row per headline, one 0/1 column per word id (num_words wide)
dtm = tokenizer.texts_to_matrix(news['headline'].values)

In [36]:
#feed-forward classifier over the 1000-column bag-of-words matrix
model = Sequential()
#ReLU gives the hidden layer a non-linearity; without an activation the two
#Dense layers compose into a single linear map and the hidden layer adds nothing
model.add(Dense(32, activation = 'relu', input_shape = (1000,)))
#one softmax unit per sentiment class (negative / neutral / positive)
model.add(Dense(3, activation = 'softmax'))
#name the optimizer explicitly ('rmsprop' is compile()'s default) so the training
#setup is visible in the notebook rather than implied
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['acc'])

In [46]:
#define y: start from the raw sentiment labels (strings) in the news frame
y = news['sentiment']
#label -> class id lookup, built once instead of on every call
SENTIMENT_TO_ID = {'negative': 0, 'neutral': 1, 'positive': 2}

def sentimenter(x):
    """Convert a sentiment label into its integer class id.

    Parameters
    ----------
    x : str
        One of 'negative', 'neutral' or 'positive'.

    Returns
    -------
    int
        0 for 'negative', 1 for 'neutral', 2 for 'positive'.

    Raises
    ------
    KeyError
        If x is not one of the three known labels; the message names the
        offending value and lists the labels that are accepted.
    """
    try:
        return SENTIMENT_TO_ID[x]
    except KeyError:
        #same exception type as a plain dict lookup, but with a message that
        #says what went wrong instead of only echoing the bad key
        raise KeyError(
            f"unknown sentiment label {x!r}; expected one of {sorted(SENTIMENT_TO_ID)}"
        ) from None

In [47]:
#encode the labels from the raw column so re-running this cell gives the same result
#(applying sentimenter to an already-encoded y would raise a KeyError)
y = news['sentiment'].apply(sentimenter)

In [48]:
# y[:10]

In [50]:
#one-hot encode the class ids; rebuilt from the raw labels so a re-run does not
#one-hot an already one-hot matrix, and num_classes pins the width to the
#3 units of the model's softmax layer even if a class were missing from the data
y = to_categorical(news['sentiment'].apply(sentimenter), num_classes = 3)

In [51]:
#(rows, columns) of the news frame
news.shape

(4846, 2)

In [52]:
#split the data into train and test sets; random_state fixes the shuffle so the
#split is reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(dtm, y, random_state = 22)

In [53]:
#fit the model for 10 epochs, evaluating on the held-out split after each epoch
#NOTE(review): the saved output shows this call failing with a tf.function ValueError;
#possibly caused by mixing `keras` and `tensorflow.keras` imports or by a stale model
#from an earlier out-of-order run -- confirm on a fresh kernel
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs = 10)

Epoch 1/10


ValueError: Creating variables on a non-first call to a function decorated with tf.function.

In [None]:
#examine performance


### Word Embeddings

> *Word embedding is any of a set of language modeling and feature learning techniques in natural language processing (NLP) where words or phrases from the vocabulary are mapped to vectors of real numbers. Conceptually it involves a mathematical embedding from a space with many dimensions per word to a continuous vector space with a much lower dimension.* -- [Wikipedia](https://en.wikipedia.org/wiki/Word_embedding)

In [None]:
#using the embedding layer


In [None]:
#fit the model


In [None]:
#plot results


### Sequential Models

In [None]:
#using a SimpleRNN


In [None]:
#fit the model


In [None]:
#examine performance


### More Transfer Learning: spaCy Transformers with Keras

In [None]:
import spacy

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor

In [None]:
#load spaCy's large English pipeline (the model package must already be installed)
nlp = spacy.load('en_core_web_lg')

In [None]:
#example sentence to run through the pipeline
sentence = 'My dog sleeps on the bed.'

In [None]:
#turn into nlp object


In [None]:
#examine some attributes


In [None]:
#doc.vector


### Pretrained Models

![](https://1.bp.blogspot.com/-RLAbr6kPNUo/W9is5FwUXmI/AAAAAAAADeU/5y9466Zoyoc96vqLjbruLK8i_t8qEdHnQCLcBGAs/s640/image3.png)

In [None]:
#!python -m spacy download en_trf_bertbaseuncased_lg 

In [None]:
#load the BERT-based pipeline; this rebinds `nlp`, replacing the en_core_web_lg pipeline
#NOTE(review): the model name looks like spacy-transformers v0.x naming -- confirm it
#matches the installed spaCy / spacy-transformers version
nlp = spacy.load("en_trf_bertbaseuncased_lg")

In [None]:
#the same surface word "Apple" in three different contexts
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")

#compare the first token of the first sentence with the first token of each of the
#other two; the original notes recorded 0.73428553 and 0.43365782 respectively
first_apple = apple1[0]
for other_doc in (apple2, apple3):
    print(first_apple.similarity(other_doc[0]))

In [None]:
#three short documents that share the same opening words, for similarity comparisons
doc1 = nlp('My dogs sleep on the bed.')
doc2 = nlp('My dogs eat dinner in the garage.')
doc3 = nlp('My dogs are ride or die.')

In [None]:
#for each token in doc3 print its text, coarse part-of-speech, fine-grained tag
#and whether it is flagged as a stop word
for token in doc3:
    print(token.text, token.pos_, token.tag_, token.is_stop)

In [None]:
#check similarity


In [None]:
#doc1 doc3 similarity


### Training a Model

In [None]:
#simple network model


#compile


#fit


In [None]:
#examine the loss


### Keras and BERT

In [None]:
#!pip install spacy-transformers

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

In [None]:
#a custom transformer class

        

In [None]:
#make a pipeline


In [None]:
#train/test split the sklearn way


In [None]:
#fit it


In [None]:
#score it


### Trying other estimators

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
#make a pipeline


In [None]:
#fit it


In [None]:
#train score


In [None]:
#test score


### Keras and sklearn

In [None]:
from keras.callbacks import CSVLogger

In [None]:
#log per-epoch metrics to test.csv; append = True adds to an existing file
#instead of overwriting it
hist_callback = CSVLogger('test.csv', append = True)

In [None]:
#add a callback


#define a network


#create the keras regressor


In [None]:
#make a pipeline


In [None]:
#fit the pipeline


In [None]:
#score train


In [None]:
#score the test
