In [None]:
import pandas as pd

# Relative path to the MAIB incident reports file (JSON Lines format).
dataset_path = '../data/maib-incident-reports-dataset.jsonl'

print(
    "Step 1: Loading the MAIB incident reports dataset from JSONL format",
    "=" * 60,
    sep="\n",
)

# lines=True makes pandas read the file as one JSON object per line.
df = pd.read_json(dataset_path, lines=True)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows of the dataset:")
# Keep the preview as the cell's final expression so the notebook renders it.
df.head(5)


Unnamed: 0,text,label
0,A chemical/products tanker parted a mooring li...,Damage / Loss Of Equipment
1,A cruise ship passenger fell out of bed result...,Accident to person(s)
2,Cruise ship's passenger tripped and sustained ...,Accident to person(s)
3,A ro-pax crew member sustained a torn rotator ...,Accident to person(s)
4,Container ship's crew member sustained an open...,Accident to person(s)


In [None]:
print(
    "\nStep 2: Examining the last 5 rows of the dataset",
    "=" * 50,
    "Last 5 rows of the dataset:",
    sep="\n",
)
# Final expression of the cell: the notebook renders the last five rows.
df.tail(5)

Unnamed: 0,text,label
5763,A ro-pax vessel crew member slipped and fell r...,Accident to person(s)
5764,Passenger vessel coming alongside ripped a cle...,Damage / Loss Of Equipment
5765,Pilot vessel made contact with a hopper dredge...,Collision
5766,A racing catamaran started taking on water dur...,Flooding / Foundering
5767,A replenishment tanker experienced a laundry r...,Fire / Explosion


In [None]:
# Number of rows in the loaded frame, i.e. one per incident report.
record_count = len(df)

print(
    "\nStep 3: Getting the total number of records in the dataset",
    "=" * 55,
    sep="\n",
)
print(f"Total number of incident reports: {record_count}")

5768


In [None]:
print("\nStep 4: Analyzing dataset structure and data types")
print("=" * 50)
print("Dataset information including column names, data types, and memory usage:")
# `df` is already a DataFrame, so .info() is called on it directly rather than
# on a second DataFrame object constructed from it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5768 entries, 0 to 5767
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5768 non-null   object
 1   label   5768 non-null   object
dtypes: object(2)
memory usage: 90.3+ KB


In [None]:
print(
    "\nStep 5: Analyzing label distribution in the dataset",
    "=" * 50,
    "Counting occurrences and calculating percentages for each incident type:",
    sep="\n",
)

# Absolute number of reports per incident type.
counts = df['label'].value_counts()
# Share of reports per incident type, scaled from a fraction to a percentage.
percentages = df['label'].value_counts(normalize=True) * 100

# Put both measures side by side, one row per label.
summary = pd.concat(
    [
        counts.rename('Count'),
        percentages.round(2).rename('Percentage'),
    ],
    axis=1,
)

print("\nLabel distribution summary:")
print(summary)

                            Count  Percentage
label                                        
Accident to person(s)        1662       28.81
Damage / Loss Of Equipment   1375       23.84
Loss Of Control               823       14.27
Grounding / Stranding         561        9.73
Contact                       406        7.04
Collision                     326        5.65
Fire / Explosion              324        5.62
Flooding / Foundering         187        3.24
Capsizing / Listing            99        1.72
Hull Failure                    3        0.05
Non-accidental Event            2        0.03


In [None]:
# Rows that repeat an earlier row across every column.
duplicate_row_count = df.duplicated().sum()

print(
    "\nStep 6: Checking for duplicate rows in the entire dataset",
    "=" * 55,
    sep="\n",
)
print(f"Number of duplicate rows: {duplicate_row_count}")

Any duplicate rows: 0


In [None]:
# Entries whose 'text' value repeats an earlier one, regardless of label.
duplicate_text_count = df['text'].duplicated().sum()

print(
    "\nStep 7: Checking for duplicate text content in the 'text' column",
    "=" * 60,
    sep="\n",
)
print(f"Number of duplicate text entries: {duplicate_text_count}")

Duplicates in 'text' column: 0


In [None]:
# Collect the closing report line by line, then emit it in a single print.
report_lines = [
    "\nStep 8: Data exploration completed!",
    "=" * 35,
    "Summary of findings:",
    f"- Total records: {len(df)}",
    f"- Dataset shape: {df.shape}",
    f"- Columns: {list(df.columns)}",
    f"- Duplicate rows: {df.duplicated().sum()}",
    f"- Duplicate text entries: {df['text'].duplicated().sum()}",
    f"- Number of unique incident types: {df['label'].nunique()}",
]
print("\n".join(report_lines))

# To convert this notebook to a Python script, run:
#   jupyter nbconvert --to script maib-incident-data-analysis.ipynb