<a href="https://colab.research.google.com/github/julianencisoizquierdo/NNP-Disaster-Tweets/blob/main/NLP_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'disaster-tweets:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4343103%2F7462099%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240204%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240204T175141Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dbc8ebc6d766f2daf67553cd3f40af2a8291a54fc589622473f6059309a9b90bb9bbb65156ebd7ee37f3deb77eded475e24ab40a9c4c3e8fead59f01f1fed4930b55f1d05f0aefdfa9970ab12b06235077718266652c2a7686ef1b965fe3b57894241c6717087c4cb0820fd66b2dec361252a1eac6ee14bf813b1176c49414edb6464577b8cd455fb86859480b513a09c47eb41f4be6ab9e479173fbaeecad18948390d95391ae0d20fcf5bdc91d2747a361fda528c81f2995d5289618beb00f84d83003607e847fc4ccb511f69a476e80d7318d6b31c384d22f233d5cb40edceebcb52f7386454c648e8f966f8bcf7d24dec612ce0cb50780e4fecf7d7395421,frequency-dictionary:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4361383%2F7490938%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240204%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240204T175141Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0a5322989c9ed5cdb5113a2e4fccf24a18e40799f288e7474daebf18021ac1ce131fd3d13e95e92ed9e8627dca035e97ee36b9bf179dbaaf0a2209aa2804f773120e3a5f70192e0f6ba3bad3efda2adf3c31100baa15d04b964f41748ab8a83b681a4aa2ff331b916d901b1351a4fc7c809134b01e92032ea6ddf7a3d80c9cfa61ecfa52c15a06013ca72aadf17a99d123f62a507fe63b26c19c7c57d9de331fd6bef5703591462cadc11dd0c2e657605e8e928b426c2594c58b73edee4017229558d2fe08ce7fcaad83b8d09919d6de3eee32e580cb046f0a07c2e23c718a77326286b0787400d75ff93295727ead564f76d322e9a37881854994fe8f0b7c00'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a literal ':' later in the entry cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter is
            # meaningful and the progress bar stays empty.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_label = content_length if content_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                # Clamp so a body longer than the advertised length cannot
                # overflow the 50-character bar.
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Rewind and read the archive through the same file object, so no
            # bytes still sitting in the write buffer are missed.
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name: binding the archive to `tarfile` would
                # replace the imported module for the rest of the notebook.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings.
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 60)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_colwidth', 200)

# Report the library versions this run uses.
print(f"TensorFlow version: {tf.__version__}")
print(f"KerasNLP version: {keras_nlp.__version__}")



# Load the train and test splits from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/disaster-tweets/train.csv')
df_test = pd.read_csv('/kaggle/input/disaster-tweets/test.csv')

# Lift the row/column display limits for the exploration below.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Shape and memory footprint (in MB) of each split.
train_mb = df_train.memory_usage().sum() / 1024**2
test_mb = df_test.memory_usage().sum() / 1024**2
print(f'Training Set Shape = {df_train.shape}')
print(f'Training Set Memory Usage = {train_mb:.2f} MB')
print(f'Test Set Shape = {df_test.shape}')
print(f'Test Set Memory Usage = {test_mb:.2f} MB')


# DATA EXPLORATION
# DataFrame.info() writes its report to stdout and returns None, so it is
# called as a statement after the heading instead of being passed to print()
# (which printed the report first and then the heading followed by "None").
print('\nInfo about the training set:')
df_train.info()
print('\nInfo about the test set:')
df_test.info()


# There are missing values in the 'keyword' and 'location' columns.
# isna().mean() is the share of missing entries among ALL rows. Dividing by
# count() would use only the non-missing rows as the denominator and
# overstate the percentage.
per_keyword = df_train['keyword'].isna().mean() * 100
print(f"The percentage of NaN values in 'keyword' column in the training set is: {per_keyword:.2f}%")

per_loc = df_train['location'].isna().mean() * 100
print(f"The percentage of NaN values in 'location' column in the training set is: {per_loc:.2f}%")

per_keyword_test = df_test['keyword'].isna().mean() * 100
print(f"The percentage of NaN values in 'keyword' column in the test set is: {per_keyword_test:.2f}%")

per_loc_test = df_test['location'].isna().mean() * 100
print(f"The percentage of NaN values in 'location' column in the test set is: {per_loc_test:.2f}%")


# Missing values occur at roughly the same rate in the train and test sets.

# Character count of every tweet, stored in a new "length" column.
df_train["length"] = df_train["text"].map(len)
df_test["length"] = df_test["text"].map(len)

train_length_summary = df_train["length"].describe()
print("Train Length Stat")
print(train_length_summary)
print()

test_length_summary = df_test["length"].describe()
print("Test Length Stat")
print(test_length_summary)


df_train.head(40)





## **Text Processing**

In [None]:
import re, string

!pip install symspellpy
from symspellpy import SymSpell, Verbosity

import spacy
import os

!pip install sentence_transformers
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

# Lowercasing

# Lowercase the text columns of both frames, train first. Missing
# keyword/location entries are passed through unchanged.
for frame in (df_train, df_test):
    for column in ('keyword', 'location'):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else str.lower(value)
        )
    frame['text'] = frame['text'].apply(lambda value: str.lower(value))



# Punctuation removal

def remove_ent(text):
    """Strip punctuation and drop @mention / #hashtag tokens from ``text``.

    Every ``string.punctuation`` character except ``@`` and ``#`` is turned
    into a space. The result is split on whitespace, and tokens whose first
    character is ``@`` or ``#`` are discarded. The remaining tokens are
    returned joined by single spaces.
    """
    entity_markers = ('@', '#')
    # One translation table replaces the per-character replace() passes.
    to_space = {ord(ch): ' ' for ch in string.punctuation if ch not in entity_markers}
    tokens = text.translate(to_space).split()
    return ' '.join(tok for tok in tokens if not tok.startswith(entity_markers))

# Run remove_ent over the text columns of both frames, train first. Missing
# keyword/location entries are passed through unchanged.
for frame in (df_train, df_test):
    for column in ('keyword', 'location'):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else remove_ent(value)
        )
    frame['text'] = frame['text'].apply(lambda value: remove_ent(value))


# Spelling Correction

sym_spell = SymSpell()

dictionary_path = '/kaggle/input/frequency-dictionary/frequency_dictionary_en_82_765.txt'

# The arguments 0 and 1 are the term and count column indices.
# load_dictionary() reports a missing file by returning False rather than
# raising. Without this check the dictionary would stay empty and every
# lookup would silently return its input unchanged.
if not sym_spell.load_dictionary(dictionary_path, 0, 1):
    raise FileNotFoundError(f'SymSpell frequency dictionary not found: {dictionary_path}')

def spelling_correction(sent):
    """Return ``sent`` with each space-separated token replaced by its closest match.

    Each token is looked up in the module-level ``sym_spell`` with
    ``Verbosity.CLOSEST`` and a maximum edit distance of 2. Because
    ``include_unknown=True``, a token without a suggestion is returned as
    itself, so the lookup result always has a first element.

    Args:
        sent: The string to correct; it is split on single spaces.

    Returns:
        The corrected tokens joined by single spaces.
    """
    corrected_tokens = []
    for tok in sent.split(' '):
        suggestions = sym_spell.lookup(
            tok, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True
        )
        # Read the suggested word from the SuggestItem itself instead of
        # parsing its "term, distance, count" string form.
        corrected_tokens.append(suggestions[0].term)

    return " ".join(corrected_tokens)

# Run spelling_correction over the text columns of both frames, train first.
# Missing keyword/location entries are passed through unchanged.
for frame in (df_train, df_test):
    for column in ('keyword', 'location'):
        frame[column] = frame[column].apply(
            lambda value: value if pd.isna(value) else spelling_correction(value)
        )
    frame['text'] = frame['text'].apply(lambda value: spelling_correction(value))

df_train.head(40)



# Keyword extraction

# Download the pipeline by its full package name so it matches the name
# passed to spacy.load(); spaCy v3 no longer accepts the 'en' shortcut.
os.system('python -m spacy download en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

# NOTE(review): the defaults `nlp=nlp` and `model=model` are evaluated when
# this `def` runs. No assignment to `model` is visible in this notebook, so
# a model object (presumably a SentenceTransformer instance; confirm) must be
# bound to `model` before this cell executes.
def extract_keywords(nlp=nlp, doc="", no_of_keywords=5, model=model):
    """Return the words of ``doc`` most similar to the text as a whole.

    The text is lowercased, then mentions/URLs, punctuation and
    space-prefixed digit runs are replaced by spaces. Tokens that ``nlp``
    tags as VERB, NOUN, ADJ or PROPN become the candidates. The cleaned text
    and the candidates are embedded with ``model.encode`` and the candidates
    are ranked by cosine similarity to the text.

    Args:
        nlp: Callable that turns a string into an iterable of tokens
            exposing ``pos_`` and ``text``.
        doc: The text to extract keywords from.
        no_of_keywords: Maximum number of keywords to return.
        model: Object whose ``encode(list_of_str)`` result is accepted by
            ``cosine_similarity``.

    Returns:
        A list of at most ``no_of_keywords`` candidate words, ordered from
        least to most similar. The list is empty when the text has no
        candidate tokens or ``no_of_keywords`` is not positive.
    """
    doc = doc.lower()
    # The non-capturing group needs its parentheses; without them the
    # pattern starts with a bare '?' and fails to compile.
    doc = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', ' ', doc)
    doc = re.sub(r'[^\w\s]', ' ', doc)
    doc = re.sub(r' \d+', ' ', doc)

    pos_tag = {'VERB', 'NOUN', 'ADJ', 'PROPN'}
    result = [token.text for token in nlp(doc) if token.pos_ in pos_tag]

    # Nothing to rank, and a slice of [-0:] would return every candidate.
    if not result or no_of_keywords <= 0:
        return []

    doc_embedding = model.encode([doc])
    results_embeddings = model.encode(result)

    distances = cosine_similarity(doc_embedding, results_embeddings)

    # argsort is ascending, so the last entries are the most similar.
    keywords = [result[index] for index in distances.argsort()[0][-no_of_keywords:]]

    return keywords

# https://medium.com/analytics-vidhya/introduction-to-nlp-with-disaster-tweets-3b672a75748c

In [None]:
# Load the sample submission file and show its structure.
sample = pd.read_csv('/kaggle/input/disaster-tweets/sample_submission.csv')


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# `test` is never defined in this notebook; the frame loaded above is `sample`.
sample.info()