<a href="https://www.kaggle.com/code/kelixirr/natural-language-processing-with-disaster-tweets?scriptVersionId=144670387" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
# Read the competition's train and test CSVs into DataFrames.
# NOTE: these are absolute Kaggle input paths; adjust them when running elsewhere.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Count rows per target value to check how balanced the classes are.
train_df.target.value_counts()

In [None]:
# Total number of rows across the train and test frames.
len(train_df) + len(test_df)

In [None]:
# Shuffle every row (frac=1 samples the whole frame) with a fixed seed so the
# resulting order is reproducible, then preview the shuffled frame.
train_data = train_df.sample(frac=1, random_state=42)
train_data.head()

In [None]:
# Visualise a random window of five consecutive rows from the shuffled data.
import random

# Bound the start index by the frame that is actually sliced (train_data), and
# clamp at 0 so a frame with fewer than five rows does not make randint fail.
random_index = random.randint(0, max(0, len(train_data) - 5))

# itertuples yields (index, text, target); the index is not needed here.
for row in train_data[["text", "target"]][random_index:random_index + 5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text: \n {text} \n")
    print("...\n")
    

In [None]:
# Split the shuffled data into training and validation sets: 10% is held out
# for validation, with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_data["text"].to_numpy(),
    train_data["target"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [None]:
# Sanity check: sentence and label counts should match within each split.
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

In [None]:
# Peek at the first ten training sentences.
train_sentences[:10]

In [None]:
# Peek at the labels that pair with the first ten training sentences.
train_labels[:10]

## Text Vectorization

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [None]:
# Average number of whitespace-separated tokens per training sentence,
# rounded to an integer.
total_tokens = sum(len(sentence.split()) for sentence in train_sentences)
round(total_tokens / len(train_sentences))

In [None]:
# Vectorizer settings, named so the intent of each number is visible.
MAX_VOCAB_LENGTH = 10000  # cap on the vocabulary size
MAX_LENGTH = 15           # token ids emitted per sentence (presumably the average computed above - confirm)

# Maps raw strings to fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB_LENGTH,
    output_mode="int",
    output_sequence_length=MAX_LENGTH,
)

In [None]:
# Build the vectorizer's vocabulary from the training sentences only;
# the validation sentences are not passed to adapt().
text_vectorizer.adapt(train_sentences)

In [None]:
# Spot-check the vectorizer on one randomly chosen training sentence.
random_sentence = random.choice(train_sentences)
print(f"original {random_sentence}")
vectorized_sentence = text_vectorizer([random_sentence])
print(f"Vectorized: {vectorized_sentence}")

In [None]:
# Inspect the learned vocabulary: its size plus the five entries at each end
# of the list returned by get_vocabulary().
words_in_vocab = text_vectorizer.get_vocabulary()
top_5, bottom_5 = words_in_vocab[:5], words_in_vocab[-5:]

vocab_size = len(words_in_vocab)
print("Vocab", vocab_size)
print("Top 5 words:", top_5)
print("Bottom 5 words:", bottom_5)

## Creating Embeddings

In [None]:
# Seed TensorFlow so the embedding's random initial weights are reproducible.
tf.random.set_seed(42)
from tensorflow.keras import layers

# Trainable lookup table that maps each integer token id to a 128-dimensional vector.
# `input_length` is deliberately not passed: the argument is deprecated in current
# Keras (it only triggers a warning there), and the sequence length is already fixed
# by the text vectorizer's output, so nothing downstream needs it.
embedding = layers.Embedding(input_dim=10000,  # matches the vectorizer's max_tokens
                             output_dim=128,
                             embeddings_initializer="uniform",
                             name="embedding_1")
embedding

In [None]:
# Run one random training sentence through the vectorizer and then the embedding layer.
random_sentence = random.choice(train_sentences)

# Vectorize once and reuse the result for both the print and the embedding lookup.
vectorized_sentence = text_vectorizer([random_sentence])

print(f"original {random_sentence}")
print(f"Vectorized: {vectorized_sentence}")
# Labelled "Embedded" because this is the embedding output, not the token ids.
print(f"Embedded: {embedding(vectorized_sentence)}")

## Creating Our Models

### 0. Naive Bayes - Baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier,
# chained in a single scikit-learn pipeline.
baseline_steps = [
    ("tfdf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
model_0 = Pipeline(baseline_steps)

# Fit on the training split; as the cell's last expression the fitted pipeline is displayed.
model_0.fit(train_sentences, train_labels)

In [None]:
# Score the fitted baseline pipeline on the validation split and display the value.
baseline_score = model_0.score(val_sentences, val_labels)
baseline_score

In [None]:
# Predict labels for the validation sentences and preview the first 20.
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

#### Creating Evaluation functions for our future use

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict with four entries:
          * ``"accurracy"`` - accuracy scaled by 100 (a percentage),
          * ``"precision"``, ``"recall"``, ``"f1"`` - the weighted-average
            values returned by ``precision_recall_fscore_support``.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100

    # The fourth value returned (support) is not used.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    # NOTE: the "accurracy" key is misspelled; it is kept unchanged so any
    # code that looks results up by key keeps working.
    return {
        "accurracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Metrics dict for the baseline on the validation split; later models are compared against it.
baseline_results = calculate_results(y_true = val_labels,
                                     y_pred = baseline_preds)
baseline_results

### Model 1 - Simple Dense

In [None]:
# importing helper functions
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

In [None]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [None]:
# Directory name passed as `dir_name` to create_tensorboard_callback when fitting models.
# This only defines the name; no directory is created in this cell.
SAVE_DIR = "model_logs"

In [None]:
# Build model_1 with the Functional API:
# raw string -> token ids -> embeddings -> average over tokens -> sigmoid output.
inputs = layers.Input(shape=(1,), dtype = "string")  # one raw string per example
x = text_vectorizer(inputs)  # text -> integer token ids
x = embedding(x)  # token ids -> dense vectors (the layer named "embedding_1")
x = layers.GlobalAveragePooling1D()(x)  # collapse the token axis into one vector per example
outputs = layers.Dense(1, activation = "sigmoid")(x)  # single sigmoid unit for the binary target
model_1 = tf.keras.Model(inputs, outputs, name = "model_1_choice")

In [None]:
# Compile for binary classification: binary cross-entropy loss, Adam with its
# default settings, and accuracy tracked as the reported metric.
model_1.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

In [None]:
# Print the layer-by-layer summary of model_1.
model_1.summary()

In [None]:
# Callback built by the imported helper, given the log directory and an experiment name.
tensorboard_callback = create_tensorboard_callback(dir_name=SAVE_DIR,
                                                   experiment_name="simple_dense_model")

# Train for 5 epochs, evaluating on the validation split during training.
model_1_history = model_1.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[tensorboard_callback],
)

In [None]:
# Evaluate the trained model on the validation split (loss plus the compiled accuracy metric).
model_1.evaluate(val_sentences, val_labels)

In [None]:
# Inspect the embedding layer's weight variables now that model_1 has been fitted.
embedding.weights

In [None]:
# Fetch the embedding matrix from model_1 by layer name and print its shape;
# expected to be (input_dim, output_dim) = (10000, 128) given how the layer was created.
embed_weights = model_1.get_layer("embedding_1").get_weights()[0]
print(embed_weights.shape)

In [None]:
# Predict on the validation sentences, round the sigmoid outputs to 0/1 labels,
# and squeeze away the size-1 dimensions so the result lines up with val_labels.
model_1_pred_probs = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:10]

In [None]:
# Compute the same metrics dict for model_1 as for the baseline, then display it.
model_1_results = calculate_results(y_true = val_labels,
                                    y_pred = model_1_preds)

model_1_results

In [None]:
# comparing results of baseline with new models 

def compare_with_baseline(baseline_results, new_model_results):
    """Print each baseline metric next to the new model's value and their difference.

    Args:
        baseline_results: Mapping of metric name to numeric value for the baseline.
        new_model_results: Mapping with the same metric names for the new model.

    Raises:
        KeyError: If ``new_model_results`` lacks a metric present in ``baseline_results``.
    """
    for metric_name in baseline_results:
        baseline_value = baseline_results[metric_name]
        new_value = new_model_results[metric_name]
        # Positive difference means the new model scored higher than the baseline.
        difference = new_value - baseline_value
        print(f"Baseline {metric_name}: {baseline_value:.2f}, New {metric_name}: {new_value:.2f}, Difference: {difference:.2f}")


In [None]:
# Print model_1's metrics side by side with the baseline's, one line per metric.
compare_with_baseline(baseline_results=baseline_results, 
                                new_model_results=model_1_results)

## Notebook In Progress 
Please note that this notebook is still in progress. Come back tomorrow for the full version.