In [2]:
import fastai

import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
import os
import torch
import pandas as pd
import seaborn as sns

from fastai import *

# Print the installed fastai version string.
print("Fastai version:", fastai.__version__)
# Allow NumPy to use up to 130 characters per line when printing arrays.
np.set_printoptions(linewidth=130)

Fastai version: 2.7.17


### Check the hardware available for speedup

In [3]:
# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# When running on CUDA, also print torch.version.cuda and the name of GPU index 0.
if device.type == "cuda":
    print("CUDA Version:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))

Device: cuda
CUDA Version: 12.4
GPU: NVIDIA GeForce RTX 4080 SUPER


### Loading of data

In [4]:
# Load the training and test splits from CSV.
# DATA_DIR is the single place to edit if the input folder moves.
DATA_DIR = Path("./input")
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

### Exploratory data analysis (EDA)

In [9]:
# Print the (rows, columns) shape of each split.
print(f'Training Set Shape = {train_df.shape}')
print(f'Test Set Shape = {test_df.shape}')

Training Set Shape = (7613, 5)
Test Set Shape = (3263, 4)


In [13]:
# List the column labels of the training set.
train_df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [14]:
# Preview the first rows of the training set (head() with its default row count).
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [15]:
# List the column labels of the test set.
test_df.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

In [16]:
# Preview the first rows of the test set (head() with its default row count).
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [17]:
# Show per-column non-null counts, dtypes and memory usage for the training set.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [18]:
# Show per-column non-null counts, dtypes and memory usage for the test set.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric training columns.
train_df.describe(include='number')

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [22]:
# Summary (count, unique, top, freq) for the object-dtype training columns.
train_df.describe(include='object')

Unnamed: 0,keyword,location,text
count,7552,5080,7613
unique,221,3341,7503
top,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...
freq,45,104,10


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric test columns.
test_df.describe(include='number')

Unnamed: 0,id
count,3263.0
mean,5427.152927
std,3146.427221
min,0.0
25%,2683.0
50%,5500.0
75%,8176.0
max,10875.0


In [23]:
# Summary (count, unique, top, freq) for the object-dtype test columns.
test_df.describe(include='object')

Unnamed: 0,keyword,location,text
count,3237,2158,3263
unique,221,1602,3243
top,deluged,New York,11-Year-Old Boy Charged With Manslaughter of T...
freq,23,38,3


In [24]:
# Count the missing values in each column of the training set.
train_df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [25]:
# Count the missing values in each column of the test set.
test_df.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64