## Importing Data & Libraries <a class="anchor"  id="chapter3"></a>

In [1]:
# Importing necessary libraries

# Data cleaning, EDA, and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
!pip install spellchecker
!pip install pyspellchecker
from spellchecker import SpellChecker

# For building NN with BERT, RoBERTa, and metadata
import torch 
import tensorflow as tf
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.metrics import confusion_matrix
import torch.nn as nn
import torch.nn.functional as F
!pip install torchsummary
from torchinfo import summary
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.optim as optim
from sklearn.model_selection import train_test_split
import warnings
# Ignore warnings
warnings.filterwarnings("ignore")
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler

# Reproducibility: drive every random number generator from one shared seed
import random

random_seed = 42

# Seed, in order: Python's stdlib RNG, NumPy's legacy global RNG, PyTorch
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(random_seed)

# cuDNN: turn off the auto-tuner and request deterministic algorithms
torch.backends.cudnn.benchmark = False       # no automatic tuning of convolution algorithms
torch.backends.cudnn.deterministic = True    # deterministic convolution algorithms only

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Collecting spellchecker
  Downloading spellchecker-0.4.tar.gz (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting inexactsearch
  Downloading inexactsearch-1.0.2.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting soundex>=1.0
  Downloading soundex-1.1.3.tar.gz (9.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting silpa_common>=0.3
  Downloading silpa_common-0.3.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: spellchecker, inexactsearch, silpa_common, soundex
  Building wheel for spellchecker (setup.py) ... [?25ldone
[?25h  Created wheel for spellchecker: filename=spellchecker-0.4-py3-none-any.whl size=3966514 sha256=f491e0491c2da1ad5718a30276a6777d73232af87993f189f921109086b0837d
  Stored in directory: /root/.cache/pip/whe

In [2]:
# Import datasets
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Save copies of the raw inputs to the working directory.
# NOTE: `to_csv` is called with its default `index=True`, so the row index is
# written as an extra leading column; this is left unchanged in case later
# cells re-read these files expecting that layout.
train.to_csv('train.csv')
test.to_csv('test.csv')

# Combine for cleaning and text pre-processing.
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index. Without it
# both inputs keep their own 0-based index, so the result carries duplicate
# labels, which makes label-based selection (`.loc`) ambiguous and can break
# index-aligned assignments during cleaning.
df = pd.concat([train, test], axis=0, ignore_index=True)

# Inspecting data
display(df.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0
