In [57]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf

### What is Natural Language Processing (NLP) 

> Natural language processing, which evolved from computational linguistics, uses methods from various disciplines, such as computer science, artificial intelligence, linguistics, and data science, to enable computers to understand human language in both written and verbal forms. 
[IBM](https://www.ibm.com/blogs/watson/2020/11/nlp-vs-nlu-vs-nlg-the-differences-between-three-natural-language-processing-concepts/)

In other words, NLP is about giving computers the ability to understand human language in both written and verbal form; in this notebook the focus is on written text.

Types of NLP:
* Speech Recognition
* Machine Translation
* Sentiment Analysis
* Semantic Search


### Loading and Parsing through the Dataset

In [2]:
# Load the Kaggle "Disaster Tweets" CSVs from the project's dataset folder.
# The test file is read first, then the training file.
test_df = pd.read_csv(filepath_or_buffer='./NLP-NLU-Files/Dataset/Disaster-Tweets/test.csv')
train_df = pd.read_csv(filepath_or_buffer='./NLP-NLU-Files/Dataset/Disaster-Tweets/train.csv')

#### Visualizing Tweets

In [36]:
# Display the training frame (pandas abbreviates the middle rows in the rendered output).
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [37]:
# Shuffle all rows (frac=1 keeps every row) and rebuild a clean 0..n-1 index.
# A fixed random_state makes the shuffle repeatable; without it the seeded
# train_test_split further down would still produce a different split on every
# run, because its input order would change each time.
train_df_shuffled = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [38]:
# Print five consecutive tweets, starting at a random offset, next to their labels.
sample_set = train_df_shuffled[['text', 'target']]

# randint is inclusive on both ends, so the last valid start is len - 5.
random_n = random.randint(0, len(sample_set) - 5)

for text, target in sample_set.iloc[random_n:random_n + 5].itertuples(index=False):
    verdict = '(real_disaster)' if target > 0 else '(not real disaster)'
    print(f'Target: {target}', verdict)
    print(f'Text:\n{text}\n')
    print("---\n")

Target: 0 (not real disaster)
Text:
@minsuwoongs i completely understand because i just woke up like 15 minutes ago and im Burning

---

Target: 0 (not real disaster)
Text:
im feeling attacked http://t.co/91jvYCxXVi

---

Target: 1 (real_disaster)
Text:
@mylittlepwnies3 @Early__May @AnathemaZhiv @TonySandos much of which has to do with lebanon 80s attack/ iran hostage crisis/ Libya Pan am

---

Target: 0 (not real disaster)
Text:
The Twitter update pretty much wrecked the app

---

Target: 0 (not real disaster)
Text:
Season 3 of New Girl was such a emotional train wreck I just wanted to cry laugh and eat a lot of ice cream

---



### Splitting the Train Data by Train-Test-Split

In [39]:
# Pull the tweet text (features) and the target column (labels) out as NumPy arrays.
X, y = (train_df_shuffled[column].to_numpy() for column in ('text', 'target'))

In [41]:
# Hold out 10% of the tweets for validation; the fixed seed keeps the split
# repeatable for a given ordering of X and y.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    X, y, random_state=42, test_size=0.1
)

In [42]:
# Sanity check: sentences and labels must have matching lengths within each split.
tuple(map(len, (train_sentences, train_labels, val_sentences, val_labels)))

(6851, 6851, 762, 762)

In [56]:
# Peek at the first five training examples together with their labels.
for i in range(5):
    label, sentence = train_labels[i], train_sentences[i]
    description = 'real disaster' if label > 0 else 'not real disaster'
    print(f'{label}', description, f'\n{sentence}\n')

1 real disaster 

0 not real disaster 
@WoundedPigeon http://t.co/s9soAeVcVo Detonate by @ApolloBrown ft. M.O.P.

0 not real disaster 
@CaraJDeIevingnc the bomb impact ratio hit beyond kyle js

0 not real disaster 
Your brain is particularly vulnerable to trauma at two distinct ages http://t.co/KnBv2YtNWc @qz @TaraSwart @vivian_giang

0 not real disaster 
i lava you! ????



### Building Blocks

---
**Tokenization** is a direct mapping from tokens (e.g. words) to numbers, without any learned weights/values – just regular numerical encoding.  
  
* **Word-Level Tokenization** - splits the text into words and maps each word to a number; thus each word is considered a token (e.g. one-hot encoding).
* **Character-Level Tokenization** - splits the text into characters and maps each letter to a number (e.g. a–z to 1–26); thus each letter is considered a token.  
* **Sub-word Tokenization** - splits each word into smaller pieces (e.g. syllables) and tokenizes each piece; thus each piece is considered a token.

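**Embedding** uses vectors of weights that can be learned as our network trains, i.e. instead of a fixed number, each token gets a vector whose values are adjusted during training to reflect how important that token is (for character-level tokens: how important each letter is in forming a word).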

---

🔑 **Takeaways**  
  
**Process**  

* **Build a Text Vectorizer**  
    * **`max_vocab_length`** - 10000  
    * **`max_length`** - the average number of words per sentence: the word counts of all training sentences summed together and divided by the total number of training sentences, then rounded up – `sum([len(i.split()) for i in train_sentences]) / len(train_sentences)`
 

* **Build an Embedding Layer**
    * **`input_dim`** - same as `max_vocab_length`  
    * **`output_dim`** - any number divisible by `8`  
    * **`input_length`** - same as `max_length`  
    
     
> You must vectorize your text before feeding it to the embedding layer

In [140]:
# Mean number of whitespace-separated words per training sentence.
sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences)

14.882206977083637

In [68]:
# Upper bound on the number of distinct tokens the vectoriser keeps.
max_vocab_length = 10_000
# Tokens kept per tweet: the mean words-per-sentence of the training set
# (about 14.9), rounded up.
max_length = 15

#### Token Vectorizer

In [90]:
# Layer that turns raw tweet strings into fixed-length rows of integer token ids.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,                 # cap on the vocabulary size
    output_mode='int',                           # one integer id per token
    output_sequence_length=max_length,           # each row comes out max_length ids long
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,                                 # single tokens only, no n-grams
    # NOTE(review): the TF docs describe pad_to_max_tokens for the
    # multi_hot/count/tf_idf modes; it is probably a no-op with
    # output_mode='int' - confirm before removing.
    pad_to_max_tokens=True,
)

In [91]:
# Fit the vectoriser on the training sentences only; the validation sentences are not passed in.
text_vectorizer.adapt(train_sentences)

In [95]:
# Pick one training tweet at random and show it next to its vectorised form.
random_sentence = random.choice(train_sentences)
# Label fixed from "Original test": what is printed is the original text.
print(f'Original text:\n{random_sentence}')
# The sentence is passed as a one-element batch.
text_vectorizer([random_sentence])

Original test:
@joshcorman  #infosec rather you knew it or not your a firefighter  now days  you often  run into burning buildings Deal with it.


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1, 5331, 1270,   12, 1825,   15,   58,   32,   34,    3, 1686,
          51,  603,   12, 2860]], dtype=int64)>

In [96]:
# Inspect both ends of the learned vocabulary list.
words_in_vocab = text_vectorizer.get_vocabulary()
head_tokens, tail_tokens = words_in_vocab[:5], words_in_vocab[-5:]
head_tokens, tail_tokens

(['', '[UNK]', 'the', 'a', 'in'],
 ['pakistan\x89Ûªs', 'pakistans', 'pajamas', 'paints', 'painthey'])

#### Embedding Layer

In [102]:
# Embedding layer that maps each integer token id to a 128-value vector.
embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,        # same value as the vectoriser's vocabulary cap
    output_dim=128,                    # length of each token's vector
    input_length=max_length,           # same value as the vectoriser's output length
    embeddings_initializer='uniform',
)
embedding

<keras.layers.embeddings.Embedding at 0x233a57f2820>

In [103]:
# Run the previously chosen sentence through the vectoriser and the embedding layer.
# Label fixed from "Original test": what is printed is the original text.
print(f'Original text:\n{random_sentence}')
# The sentence is passed as a one-element batch.
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original test:
@joshcorman  #infosec rather you knew it or not your a firefighter  now days  you often  run into burning buildings Deal with it.


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.0110671 ,  0.01732009,  0.04375197, ..., -0.04271836,
          0.00399405,  0.04798069],
        [-0.00202902,  0.03009815,  0.0060215 , ...,  0.00019274,
         -0.03650288,  0.03767592],
        [-0.04458491,  0.04364324,  0.01128941, ...,  0.00490395,
          0.02786065,  0.04195031],
        ...,
        [-0.00765001,  0.0353638 , -0.03011203, ..., -0.0300968 ,
         -0.03033456, -0.04726434],
        [-0.01424288, -0.03628627,  0.00607703, ..., -0.00190689,
         -0.0046855 , -0.04964775],
        [ 0.01280308, -0.04025245, -0.02600848, ..., -0.04596686,
         -0.04381627, -0.01347705]]], dtype=float32)>

In [115]:
# Vector for the first token of the first (and only) sentence in the batch,
# shown together with the sentence it came from.
sample_embed[0, 0], random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.0110671 ,  0.01732009,  0.04375197,  0.00603743, -0.04594326,
         0.01795593,  0.01034095, -0.04193724,  0.00294954, -0.03191074,
        -0.01556091, -0.0371348 , -0.03070192,  0.00728117, -0.04708997,
         0.02752427,  0.02307996, -0.02495719, -0.04091208,  0.00255727,
         0.0403378 , -0.00195117,  0.04263123, -0.02346913, -0.00544274,
        -0.04831278, -0.04629358, -0.00087645,  0.03791757, -0.01223205,
        -0.04184105,  0.04388804, -0.04266124, -0.00613172,  0.0448084 ,
         0.03379716,  0.04483197,  0.00587719,  0.03099189, -0.01697315,
         0.035551  , -0.04736337,  0.02922549,  0.04674867,  0.02014044,
        -0.02329115,  0.00762113, -0.04035987, -0.04018734,  0.02938296,
         0.00176369,  0.01719406,  0.01451891, -0.03192464, -0.00386583,
        -0.00457022, -0.02242173,  0.00988829, -0.03559873,  0.03894741,
         0.01715103,  0.01431768, -0.01823659,  0.0296546 , -0.04988586,
  