# Download and import all the necessary libraries

In [1]:
!pip install datasets torch contractions textblob
import pandas as pd
import re, string, unicodedata
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

# Load HuggingFace datasets
from datasets import load_dataset

# Ensure you have the required NLTK resources
import nltk

# Download the Punkt tokenizer models.
# Punkt is a pre-trained tokenizer that divides text into sentences and words.
# This model is useful for tasks such as sentence splitting and word tokenization.
nltk.download('punkt')

# Download a set of common stopwords.
# Stopwords are common words (like "and", "the", "is") that are often
# filtered out in text processing because they are too frequent to
# provide meaningful information. This resource helps with removing or ignoring
# these words during text analysis.
nltk.download('stopwords')

# Download the WordNet lexical database.
# WordNet is a large lexical database of English that groups words into sets of
# synonyms (synsets) and provides various relations between them.
# It is used for tasks like word sense disambiguation, synonyms and
# antonyms finding, and semantic analysis.
nltk.download('wordnet')

# Download the Average Perceptron Tagger for POS Tagging
nltk.download('averaged_perceptron_tagger')

# Download the Maximum Entropy Named Entity Chunker to identify and classify
# named entities in text, such as names of people, organizations, or locations.
nltk.download('maxent_ne_chunker')

# Download a corpus of words that can be used as a reference for various
# NLP tasks, such as checking if a word exists or finding synonyms.
nltk.download('words')

# Import WordCloud to visualize text data
from wordcloud import WordCloud

# Visualization plots library
import matplotlib.pyplot as plt

# Import displacy from spacy library.
# This is to visualize the processed text data post NER and POS Tagging.
import spacy
from spacy import displacy
from spacy.tokens import Span

# Import stopwords
from nltk.corpus import stopwords

# Import Tokenizer
from nltk.tokenize import word_tokenize, sent_tokenize

# Import Lemmatizer
from nltk.stem.wordnet import WordNetLemmatizer

# Visualize data in graphical formats
import matplotlib.pyplot as plt
import seaborn as sns

# To count frequencies of words
from textblob import TextBlob
from collections import Counter

# Ignore the warnings
import warnings
warnings.filterwarnings("ignore")

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-macosx_10_9_universal2.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
Downloading pyahocorasick-2.1.0-cp310-cp310-macosx_10_9_universal2.whl (63 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


[nltk_data] Downloading package punkt to /Users/ravkothu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravkothu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ravkothu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ravkothu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/ravkothu/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /Users/ravkothu/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


ModuleNotFoundError: No module named 'spacy'

# Exploration of SQuAD dataset

## Extract the SQuAD dataset from HuggingFace datasets

In [2]:
squad_dataset = load_dataset("squad")

def extract_squad_info(examples):
    # Combine context and question for the model input
    inputs = [f"Question: {examples['question']} Context: {examples['context']}"]
    return {'input_text': inputs, 'target_text': examples['answers']['text']}

# Preprocess the train and validation sets
train_dataset = squad_dataset['train'].map(extract_squad_info, remove_columns=['question', 'context', 'answers'])
val_dataset = squad_dataset['validation'].map(extract_squad_info, remove_columns=['question', 'context', 'answers'])

print(f"Shape of train dataset: {train_dataset.num_rows}")
print(f"Shape of validation dataset: {val_dataset.num_rows}")

Shape of train dataset: 87599
Shape of validation dataset: 10570


## Convert the extracted dataset to pandas DataFrame

In [3]:
train_df = pd.DataFrame(train_dataset)
val_df = pd.DataFrame(val_dataset)

dataset_common_path = "/Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/NLP_Datasets"
train_df_path = f"{dataset_common_path}/squad_extracted_train.csv"
val_df_path = f"{dataset_common_path}/squad_extracted_validation.csv"

# Save DataFrames to CSV files
train_df.to_csv(train_df_path, index=False)
val_df.to_csv(val_df_path, index=False)

print("CSV files have been created in the following paths:")
print(f"Train dataset - {train_df_path}\n")
print(f"Validation dataset - {val_df_path}")

CSV files have been created in the following paths:
Train dataset - /Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/NLP_Datasets/squad_extracted_train.csv

Validation dataset - /Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/NLP_Datasets/squad_extracted_validation.csv


## Merge both train_df and val_df info single dataframe as this is just for preprocessing

In [5]:
merged_squad_df = pd.concat([train_df, val_df])

merged_df_path = f"{dataset_common_path}/squad_extracted_merged.csv"

# Save merged DataFrame to CSV file
merged_squad_df.to_csv(merged_df_path, index=False)

print(f"Merged dataframe has been saved to - {merged_df_path}")

Merged dataframe has been saved to - /Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/NLP_Datasets/squad_extracted_merged.csv


## Explore the data in general

In [11]:
# Print the columns available in the dataframe
merged_squad_df.columns

Index(['id', 'title', 'input_text', 'target_text'], dtype='object')

In [14]:
# Print the shape of the dataframe
print(f"Shape of the dataframe: {merged_squad_df.shape}")
print(f"Number of rows in dataframe: {merged_squad_df.shape[0]}")
print(f"Number of columns/features in dataframe: {merged_squad_df.shape[1]}")

Shape of the dataframe: (98169, 4)
Number of rows in dataframe: 98169
Number of columns/features in dataframe: 4


In [12]:
# Print the first 15 rows of the dataframe
merged_squad_df.head(15)

Unnamed: 0,id,title,input_text,target_text
0,5733be284776f41900661182,University_of_Notre_Dame,[Question: To whom did the Virgin Mary alleged...,[Saint Bernadette Soubirous]
1,5733be284776f4190066117f,University_of_Notre_Dame,[Question: What is in front of the Notre Dame ...,[a copper statue of Christ]
2,5733be284776f41900661180,University_of_Notre_Dame,[Question: The Basilica of the Sacred heart at...,[the Main Building]
3,5733be284776f41900661181,University_of_Notre_Dame,[Question: What is the Grotto at Notre Dame? C...,[a Marian place of prayer and reflection]
4,5733be284776f4190066117e,University_of_Notre_Dame,[Question: What sits on top of the Main Buildi...,[a golden statue of the Virgin Mary]
5,5733bf84d058e614000b61be,University_of_Notre_Dame,[Question: When did the Scholastic Magazine of...,[September 1876]
6,5733bf84d058e614000b61bf,University_of_Notre_Dame,[Question: How often is Notre Dame's the Juggl...,[twice]
7,5733bf84d058e614000b61c0,University_of_Notre_Dame,[Question: What is the daily student paper at ...,[The Observer]
8,5733bf84d058e614000b61bd,University_of_Notre_Dame,[Question: How many student news papers are fo...,[three]
9,5733bf84d058e614000b61c1,University_of_Notre_Dame,[Question: In what year did the student paper ...,[1987]


In [13]:
# Print the last 15 rows of the dataframe
merged_squad_df.tail(15)

Unnamed: 0,id,title,input_text,target_text
10555,5737a7351c456719005744f5,Force,[Question: What is the force called rgarding a...,"[artifact, artifact of the potential field, an..."
10556,5737a84dc3c5551400e51f59,Force,[Question: What is sometimes impossible to mod...,"[forces, forces as being due to gradient of po..."
10557,5737a84dc3c5551400e51f5a,Force,[Question: Why are some forces due to that are...,"[gradient of potentials, macrophysical conside..."
10558,5737a84dc3c5551400e51f5b,Force,[Question: What do electrostatic gradiient pot...,"[friction, friction, friction, friction]"
10559,5737a84dc3c5551400e51f5c,Force,"[Question: Tension, compression, and drag are ...","[Nonconservative, Nonconservative forces other..."
10560,5737a9afc3c5551400e51f61,Force,[Question: In what treatment are nonconservati...,"[statistical mechanics, statistical mechanics,..."
10561,5737a9afc3c5551400e51f62,Force,[Question: What changes macroscopic closed sys...,"[nonconservative forces, internal energies of ..."
10562,5737a9afc3c5551400e51f63,Force,[Question: What is the exchange of heat associ...,"[nonconservative forces, nonconservative force..."
10563,5737a9afc3c5551400e51f64,Force,[Question: What is the law of thermodynamics a...,"[Second, Second law of thermodynamics, Second ..."
10564,5737a9afc3c5551400e51f65,Force,[Question: What makes energy changes in a clos...,"[nonconservative forces, nonconservative force..."


In [15]:
# Access the first row for sample view of the data
first_row = merged_squad_df.iloc[0]

# Print each value separately
print(f"id: {first_row['id']}\n")
print(f"Title: {first_row['title']}\n")
print(f"Input Text: {first_row['input_text']}\n")
print(f"Target Text: {first_row['target_text']}")

id: 5733be284776f41900661182

Title: University_of_Notre_Dame

Input Text: ['Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? Context: Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

Target Text: ['Saint Bernadette Soubirous']


In [16]:
# Fetch information about the dataframe
merged_squad_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98169 entries, 0 to 10569
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           98169 non-null  object
 1   title        98169 non-null  object
 2   input_text   98169 non-null  object
 3   target_text  98169 non-null  object
dtypes: object(4)
memory usage: 3.7+ MB


In [17]:
# Check for missing values in each column
merged_squad_df.isnull().sum()

id             0
title          0
input_text     0
target_text    0
dtype: int64

In [18]:
# Examine the datatypes of each column
merged_squad_df.dtypes

id             object
title          object
input_text     object
target_text    object
dtype: object

In [21]:
# Fetch unique values of each column of object(string) type
for column in merged_squad_df.columns:
    print(f"\nUnique Values in '{column}': {len(merged_squad_df[column].unique())}")


Unique Values in 'id': 98169

Unique Values in 'title': 490


TypeError: unhashable type: 'list'