<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/NLP_30days.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets gensim huggingface_hub gdown

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/519.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/519.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.9 MB/s[0m eta [36m0:

# Basic vectorization of Bag of words

Learned how to create a word vector for each document based on how often each word occurs in it.

https://medium.com/mlearning-ai/nlp-day-6-dont-forget-your-bag-of-words-80286c12e26e

In [None]:
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter

# Toy corpus: one short phrase per document.
docs = [
    "Jack be nimble",
    "Jack be quick",
    "Jack jump over",
    "The candlestick"
]

tokenizer = TreebankWordTokenizer()

# Flatten the lower-cased tokens of every document into a single list.
doc_tokens = [
    token
    for doc in docs
    for token in tokenizer.tokenize(doc.lower())
]

# The lexicon (vocabulary) is the sorted set of distinct tokens seen in the documents.
lexicon = sorted(set(doc_tokens))
print(lexicon)

['be', 'candlestick', 'jack', 'jump', 'nimble', 'over', 'quick', 'the']


In [None]:
# One bag-of-words vector per document, with one component per lexicon entry.
doc_vectors = []
vocab_size = len(lexicon)

for doc in docs:
    # Count the tokens of this document,
    # e.g. "Jack be nimble" -> Counter({'jack': 1, 'be': 1, 'nimble': 1})
    tokens = tokenizer.tokenize(doc.lower())
    bow = Counter(tokens)

    # Start from an all-zero entry for every lexicon token, then overwrite the
    # tokens present in this document with their count scaled by the lexicon size.
    vector = dict.fromkeys(lexicon, 0)
    vector.update({key: value / vocab_size for key, value in bow.items()})

    doc_vectors.append(np.array(list(vector.values())))

print(doc_vectors)

[array([0.125, 0.   , 0.125, 0.   , 0.125, 0.   , 0.   , 0.   ]), array([0.125, 0.   , 0.125, 0.   , 0.   , 0.   , 0.125, 0.   ]), array([0.   , 0.   , 0.125, 0.125, 0.   , 0.125, 0.   , 0.   ]), array([0.   , 0.125, 0.   , 0.   , 0.   , 0.   , 0.   , 0.125])]


Now that the documents have vectors, you can compute the similarity between them.

In [None]:
def get_cos_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)`` as a Python float. If either
        vector has zero length (norm 0) the similarity is undefined; 0.0 is
        returned instead of dividing by zero and producing NaN.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return float(np.dot(v1, v2) / denominator)

# Compare the first document vector against each of the remaining ones.
v0 = doc_vectors[0]
others = doc_vectors[1:]
for idx in range(len(others)):
    cos_sim = get_cos_similarity(v0, others[idx])
    print(f"Cosine similarity v0-v{idx+1}: {cos_sim}")

# Expected output:
# Cosine similarity v0-v1: 0.6666666666666667
# Cosine similarity v0-v2: 0.33333333333333337
# Cosine similarity v0-v3: 0.0

Cosine similarity v0-v1: 0.6666666666666667
Cosine similarity v0-v2: 0.33333333333333337
Cosine similarity v0-v3: 0.0


# Topic modelling with help of TF-IDF

https://medium.com/mlearning-ai/nlp-day-7-your-story-your-topic-your-tf-idf-7c06c9c1196a

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import normalize
from nltk.tokenize import TreebankWordTokenizer

# Small nursery-rhyme corpus: one line per document.
corpus = [
    'Hey diddle, diddle,',
    'The cow jumped over the moon.',
    'The little dog laughed to see such sport,',
    'and the dish ran away with the spoon. '
]

tokenizer = TreebankWordTokenizer()

# Punctuation tokens to drop from the vocabulary. A set is used so the test
# is "is this token one of these marks"; testing against the string '-.,!?'
# would be a substring check and would also match tokens such as '.,' or ''.
punctuation = {'-', '.', ',', '!', '?'}

# Lexicon: sorted distinct lower-cased tokens across the corpus, minus punctuation.
lexicon = sorted({
    token
    for doc in corpus
    for token in tokenizer.tokenize(doc.lower())
    if token not in punctuation
})
print(lexicon)

['and', 'away', 'cow', 'diddle', 'dish', 'dog', 'hey', 'jumped', 'laughed', 'little', 'moon', 'over', 'ran', 'see', 'spoon', 'sport', 'such', 'the', 'to', 'with']


In [None]:
# Term-frequency table, initialised to 0 everywhere:
#   - one row per document in the corpus (row label = document position)
#   - one column per word of the vocabulary/lexicon
df_tf = pd.DataFrame(
    0,
    index=list(range(len(corpus))),
    columns=lexicon,
)
df_tf

Unnamed: 0,and,away,cow,diddle,dish,dog,hey,jumped,laughed,little,moon,over,ran,see,spoon,sport,such,the,to,with
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# df_tf was created with integer zeros; cast it to float before writing
# fractional term frequencies so pandas does not have to upcast column by
# column (setting a float into an int column via .loc is deprecated behaviour
# in recent pandas versions).
df_tf = df_tf.astype(float)

# Compute the term frequency (TF) of every lexicon word in every document.
# The count is scaled by the lexicon size; this constant factor cancels out
# once each row is L2-normalised.
for idx, doc in enumerate(corpus):
    tokens = tokenizer.tokenize(doc.lower())
    bag_of_words = Counter(tokens)
    for col in df_tf.columns:
        # Counter returns 0 for words that do not occur in this document.
        df_tf.loc[idx, col] = bag_of_words[col] / len(lexicon)


# Compute the smoothed inverse document frequency (IDF):
#   idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# where df(t) is the NUMBER OF DOCUMENTS that contain term t. It must be
# counted from "tf > 0"; summing the TF values themselves gives a much smaller
# number and makes every IDF weight nearly identical.
num_documents = len(corpus)
doc_freq = (df_tf > 0).sum(axis=0)
df_idf = np.log((1 + num_documents) / (1 + doc_freq)) + 1

In [None]:
# TF-IDF: weight every term-frequency column by that term's IDF value.
df_tf_idf = df_tf.mul(df_idf, axis='columns')


# Note: each row is L2-normalised so the result can later be compared with
# sklearn's implementation. Normalising removes the effect of document length
# while keeping the relative term weights.
tf_idf_unit_rows = normalize(df_tf_idf.to_numpy(), norm='l2')
df_tf_idf_norm = pd.DataFrame(
    data=tf_idf_unit_rows.round(2),
    columns=lexicon
)

df_tf_idf_norm

Unnamed: 0,and,away,cow,diddle,dish,dog,hey,jumped,laughed,little,moon,over,ran,see,spoon,sport,such,the,to,with
0,0.0,0.0,0.0,0.89,0.0,0.0,0.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.37,0.0,0.0,0.0,0.0,0.37,0.0,0.0,0.37,0.37,0.0,0.0,0.0,0.0,0.0,0.68,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.0,0.36,0.36,0.0,0.0,0.0,0.36,0.0,0.36,0.36,0.33,0.36,0.0
3,0.32,0.32,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.0,0.32,0.0,0.0,0.61,0.0,0.32


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the documents' normalised TF-IDF rows:
# entry (i, j) is the similarity between document i and document j.
cosine_sim = cosine_similarity(X=df_tf_idf_norm, Y=df_tf_idf_norm)
cosine_sim_df = pd.DataFrame(data=cosine_sim)
cosine_sim_df.head(n=100)

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.22151,0.415556
2,0.0,0.22151,1.0,0.201061
3,0.0,0.415556,0.201061,1.0


# Doing LSA with PCA

https://medium.com/mlearning-ai/nlp-day-9-performing-latent-semantic-analysis-with-pca-4d360621e5cd

- TF-IDF alone is not enough to extract the topics of a given document.
- PCA can help by generating a `topic vector` for each document.

In [None]:
import pandas as pd
from datasets import load_dataset

# Fetch the "sms_spam" dataset from the Hugging Face hub.
dataset = load_dataset(path="sms_spam")

Downloading builder script:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

In [None]:
from nltk.tokenize import casual_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that tokenizes with NLTK's casual_tokenize.
vectorizer = TfidfVectorizer(tokenizer=casual_tokenize)
# Convert to pandas so the SMS text column can be fed to the vectorizer.
dataset_df = dataset['train'].to_pandas()
texts = dataset_df['sms']

# Sparse document-term matrix: one row per message, one column per term.
X = vectorizer.fit_transform(texts)

tf_idf = pd.DataFrame(
    # A DataFrame needs dense data. toarray() returns a plain ndarray, whereas
    # todense() returns the legacy np.matrix type that NumPy discourages.
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out()
)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=16, random_state=42)

# Project every TF-IDF row onto the principal components to obtain
# 16-dimensional topic vectors, one per document.
topic_vectors = pca.fit_transform(tf_idf)
topic_labels = [f"topic_{n}" for n in range(pca.n_components_)]
df_pca = pd.DataFrame(data=topic_vectors, columns=topic_labels)

# Re-sort the TF-IDF vocabulary: order the terms by their column index.
vocabulary = vectorizer.vocabulary_
terms = tuple(sorted(vocabulary, key=vocabulary.get))

# Component weights as a frame of shape (number of topics, number of terms).
df_weights = pd.DataFrame(
    data=pca.components_,
    index=topic_labels,
    columns=terms,
)

# Making sense of the topics: take a hand-picked list of spam keywords and,
# purely for identification, look at how strongly each topic weights them.
spam_terms = 'win winner won chance cash bonus prize reward congrats'.split(' ')
df_spam_wins = df_weights[spam_terms].round(4) * 100
# Rank topics by their total weight over the spam keywords.
df_spam_wins.sum(axis=1).sort_values(ascending=False)

In [None]:
# Shape of the fitted PCA component matrix; the recorded cell output is (16, 8961).
pca.components_.shape

(16, 8961)

# CNN with NLP

https://medium.com/mlearning-ai/nlp-day-12-get-your-words-in-order-with-convolutional-neural-networks-part-2-68974b205a6



In [None]:
import gdown
import gzip
import shutil

# Download word2vec
def download_from_google_drive(
  url: str = "https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM",
  dest_dir: str = ".",
  dest_file: str = "GoogleNews-vectors-negative300.bin.gz",
) -> str:
  """Download ``url`` with gdown and save it as ``dest_dir/dest_file``.

  Args:
    url: Google Drive download URL.
    dest_dir: Directory to write the file into.
    dest_file: Name of the file to write.

  Returns:
    Whatever ``gdown.download`` returns for the requested output path.
  """
  destination = f"{dest_dir}/{dest_file}"
  return gdown.download(url, output=destination, quiet=False)

def unzip(file_path):
  """Decompress a gzip file next to itself and return the output path.

  The output path is ``file_path`` with its trailing ``.gz`` removed. Only the
  suffix is stripped: a ``.gz`` appearing elsewhere in the path (for example in
  a directory name) is left alone.

  Args:
    file_path: Path (str) of the ``.gz`` file to decompress.

  Returns:
    The path of the decompressed file.

  Raises:
    ValueError: If ``file_path`` does not end with ``.gz``. Without a suffix to
      strip, the output path would equal the input path and opening it for
      writing would truncate the archive.
  """
  suffix = ".gz"
  if not file_path.endswith(suffix):
    raise ValueError(f"Expected a path ending in '{suffix}', got: {file_path!r}")

  out_path = file_path[:-len(suffix)]
  with gzip.open(file_path, "rb") as f_in, open(out_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)
  return out_path

# Download the compressed word2vec archive, then decompress it in place.
file_path = download_from_google_drive()
# NOTE(review): depending on the gdown version, a failed download may return
# None instead of raising - fail here with a clear message rather than letting
# unzip() raise an opaque AttributeError.
if file_path is None:
  raise RuntimeError("Download of the word2vec archive failed; no file was returned.")
unzip(file_path)

Downloading...
From: https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /content/GoogleNews-vectors-negative300.bin.gz
100%|██████████| 1.65G/1.65G [00:15<00:00, 105MB/s]


In [None]:
import pandas as pd
from datasets import load_dataset

# Fetch the IMDB dataset from the hub, keeping only the first 10% of the train split.
dataset = load_dataset(path="imdb", split='train[:10%]')

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

# Every sample is padded or truncated to this many token vectors.
MAX_LENGTH = 400

tokenizer = TreebankWordTokenizer()

# Load the pre-trained word2vec binary, reading at most 200000 entries.
word_vectors = KeyedVectors.load_word2vec_format(
    fname='./GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=200000,
)

# Naive tokenizer + embedder: pads with zero vectors and truncates to MAX_LENGTH.
def tokenizer_with_padding(text):
    """Turn ``text`` into exactly ``MAX_LENGTH`` word vectors.

    The text is tokenized with the module-level ``tokenizer``; every token found
    in ``word_vectors`` contributes its vector and out-of-vocabulary tokens are
    skipped. The result is padded with zero vectors or truncated so that it
    always contains ``MAX_LENGTH`` entries.

    Args:
        text: Raw text of one sample.

    Returns:
        A list of ``MAX_LENGTH`` 1-D arrays.
    """
    vectors = []
    for token in tokenizer.tokenize(text):
        try:
            vectors.append(word_vectors[token])
        except KeyError:
            # Ignore OOV (out of vocab) tokens.
            continue

    if vectors:
        # zeros_like matches the shape and dtype of the real embeddings, so
        # the padding can be stacked with them safely.
        zero_vector = np.zeros_like(vectors[0])
    else:
        # No token was in the vocabulary (or the text was empty): there is no
        # vector to copy the shape from, so take it from the embedding model.
        zero_vector = np.zeros(
            word_vectors.vector_size, dtype=word_vectors.vectors.dtype
        )

    # Pad short samples up to MAX_LENGTH ...
    missing = MAX_LENGTH - len(vectors)
    if missing > 0:
        vectors.extend([zero_vector] * missing)

    # ... and truncate long ones, so the length is always MAX_LENGTH.
    return vectors[:MAX_LENGTH]



def vectorize_data(datasets):
    """Vectorize a batch of samples for ``Dataset.map(..., batched=True)``.

    Args:
        datasets: Batch mapping with a ``'text'`` entry (iterable of raw texts)
            and a ``'label'`` entry.

    Returns:
        A dict with ``"input_ids"`` (one padded/truncated list of word vectors
        per text, produced by ``tokenizer_with_padding``) and ``"labels"`` (the
        batch's ``'label'`` entry, passed through unchanged).
    """
    text = datasets['text']
    label = datasets['label']

    vectorized_data = [tokenizer_with_padding(sample) for sample in text]

    return { "input_ids": vectorized_data, "labels": label }

# Vectorize the dataset batch by batch, then shuffle it with a fixed seed.
vectorized_dataset = dataset.map(vectorize_data, batched=True)
tokenized_datasets = vectorized_dataset.shuffle(seed=42)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
from sklearn.model_selection import train_test_split

X = tokenized_datasets['input_ids']
y = tokenized_datasets['labels']

# Hold out 20% of the samples for testing. train_test_split accepts only its
# own options (test_size, train_size, random_state, shuffle, stratify); passing
# maxlen / embedding_dims to it raises a TypeError. random_state is fixed so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Input dimensions of the model: tokens per sample and embedding size.
maxlen = 400
embedding_dims = 300

In [None]:
import tensorflow as tf
from tensorflow import keras

# Each sample is a (400, 300) array: 400 token positions (MAX_LENGTH above)
# by 300 values per word vector.
inputs = keras.Input(shape=(400, 300))

# Hidden stack, applied to the input in order.
hidden_layers = [
    # 250 filters, each spanning 3 consecutive token positions.
    keras.layers.Conv1D(
        filters=250,
        kernel_size=3,
        strides=1,
        padding='valid',
        activation='relu',
    ),
    # Keep, per filter, the maximum response over all positions.
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(units=250, activation='relu'),
    keras.layers.Dropout(rate=0.2),
]

x = inputs
for layer in hidden_layers:
    x = layer(x)

# Single sigmoid unit for the binary label.
outputs = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs, name='cnn_imdb')

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)