# Data Inspection and Data Cleaning

## Tasks
- Import Dataset
- Understand/Inspect Dataset
- Data Cleaning

## Import Libraries

In [1]:
import pandas as pd  # data manipulation, cleaning and analysis
import numpy as np   # numerical analysis
import matplotlib.pyplot as plt  # data visualization
import seaborn as sns  # data visualization

from pathlib import Path


import warnings
# Ignore every warning category for the rest of the session. This keeps cell
# output tidy, but it also hides deprecation and other library warnings.
warnings.filterwarnings("ignore")

## Import Dataset

In [2]:
# Locations of the raw input CSV and of the cleaned CSV.
# NOTE: both names end in _DIR, but each one points at a file, not a directory.
DATASET_FOLDER = Path("../dataset")
RAW_DATASET_DIR = DATASET_FOLDER / "Algerian_forest_fires.csv"
CLEANED_DATASET_DIR = DATASET_FOLDER / "Algerian_forest_fires_CLEANED.csv"

In [3]:
# Read the raw CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=RAW_DATASET_DIR)
df.head(n=5)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0


## Inspect Dataset
- Check the dataset for:
    - inconsistencies
    - missing values
    - duplicates
    - shape and distribution

In [4]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0


In [5]:
# Preview the last five rows of the frame
df.tail(n=5)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
238,26,9,2012,30,65,14,0.0,85.4,16.0,44.5,4.5,16.9,6.5,fire,1
239,27,9,2012,28,87,15,4.4,41.1,6.5,8.0,0.1,6.2,0.0,not fire,1
240,28,9,2012,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,not fire,1
241,29,9,2012,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,not fire,1
242,30,9,2012,24,64,15,0.2,67.3,3.8,16.5,1.2,4.8,0.5,not fire,1


In [6]:
# Draw three random rows. The fixed seed makes the sample identical on every
# run, so the displayed output is reproducible after a kernel restart.
df.sample(n=3, random_state=42)

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
189,8,8,2012,37,56,11,0.0,87.4,11.2,20.2,5.2,11.0,5.9,fire,1
145,24,6,2012,35,68,16,0.0,85.3,10.0,17.0,4.9,9.9,5.3,fire,1
15,16,6,2012,29,89,13,0.7,36.1,1.7,7.6,0.0,2.2,0.0,not fire,0


In [7]:
# Show the frame's column labels
df.columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region'],
      dtype='object')

In [8]:
# Upper-case every column label with the vectorised string accessor, so that
# later cells can refer to columns in one consistent style (e.g. "CLASSES").
df.columns = df.columns.str.upper()

In [9]:
# Preview the first five rows to check the column labels
df.head(n=5)

Unnamed: 0,DAY,MONTH,YEAR,TEMPERATURE,RH,WS,RAIN,FFMC,DMC,DC,ISI,BUI,FWI,CLASSES,REGION
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0


In [10]:
# (number of rows, number of columns)
df.shape

(243, 15)

In [11]:
# Flag rows that exactly repeat an earlier row, then count the flags
duplicate_flags = df.duplicated()
no_of_dup = duplicate_flags.sum()
print(f"The number of duplicate found is: {no_of_dup}")

The number of duplicate found is: 0


In [12]:
# Generate report for missing values

# Compute the null mask once and derive both statistics from it.
null_mask = df.isnull()
no_of_miss = null_mask.sum()
# The mean of a boolean mask is the missing *fraction* (0-1); scale it by 100
# so the values really are percentages, as the "Missing %" label promises.
percent_of_miss = null_mask.mean() * 100

missing_report = {
    "Missing Count": no_of_miss,
    "Missing %": percent_of_miss
}

pd.DataFrame(missing_report)

Unnamed: 0,Missing Count,Missing %
DAY,0,0.0
MONTH,0,0.0
YEAR,0,0.0
TEMPERATURE,0,0.0
RH,0,0.0
WS,0,0.0
RAIN,0,0.0
FFMC,0,0.0
DMC,0,0.0
DC,0,0.0


In [13]:
# check the info: per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DAY          243 non-null    int64  
 1   MONTH        243 non-null    int64  
 2   YEAR         243 non-null    int64  
 3   TEMPERATURE  243 non-null    int64  
 4   RH           243 non-null    int64  
 5   WS           243 non-null    int64  
 6   RAIN         243 non-null    float64
 7   FFMC         243 non-null    float64
 8   DMC          243 non-null    float64
 9   DC           243 non-null    float64
 10  ISI          243 non-null    float64
 11  BUI          243 non-null    float64
 12  FWI          243 non-null    float64
 13  CLASSES      243 non-null    object 
 14  REGION       243 non-null    int64  
dtypes: float64(7), int64(7), object(1)
memory usage: 28.6+ KB


In [14]:
def unique_features(cols: list, data=None) -> None:
    """Print the distinct values of each requested column.

    Parameters
    ----------
    cols : list
        Column labels to inspect. Any iterable of labels works, for example
        a plain list or ``df.columns``.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``df`` is used, which keeps existing calls working unchanged.

    Returns
    -------
    None
        The report is only written to standard output.
    """
    # Fall back to the global frame so calls without `data` behave as before.
    frame = df if data is None else data
    separator = "=" * 30
    for col in cols:
        print(f"Feature Name: {col}")
        print(f"Unique Values {frame[col].unique()}")
        print(separator)

# Report the unique values of every column in the frame
columns = df.columns
unique_features(columns)



Feature Name: DAY
Unique Values [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
Feature Name: MONTH
Unique Values [6 7 8 9]
Feature Name: YEAR
Unique Values [2012]
Feature Name: TEMPERATURE
Unique Values [29 26 25 27 31 33 30 28 32 34 35 36 37 22 24 38 39 40 42]
Feature Name: RH
Unique Values [57 61 82 89 77 67 54 73 88 79 65 81 84 78 80 55 62 66 64 53 47 50 68 75
 76 63 69 70 59 48 45 60 51 52 58 86 74 71 49 44 41 42 90 87 72 46 37 36
 56 43 83 29 34 33 35 39 31 21 40 24 38 26]
Feature Name: WS
Unique Values [18 13 22 16 14 15 12 19 21 20 17 26 11 10  9  8  6 29]
Feature Name: RAIN
Unique Values [ 0.   1.3 13.1  2.5  0.2  1.2  0.5  3.1  0.7  0.6  0.3  0.1  0.4  1.
  1.4  0.8 16.8  7.2 10.1  3.8  0.9  1.8  4.6  8.3  5.8  4.   2.   4.7
  8.7  4.5  1.1  1.7  2.2  6.   1.9  2.9  4.1  6.5  4.4]
Feature Name: FFMC
Unique Values [65.7 64.4 47.1 28.6 64.8 82.6 88.2 86.6 52.9 73.2 84.5 84.  50.  59.
 49.4 36.1 37.3 56.9 79.9 59.8 81.  79.1 81.4 8

In [15]:
# Remove leading and trailing whitespace from the class labels
stripped_classes = df["CLASSES"].str.strip()
df["CLASSES"] = stripped_classes

In [16]:
# Distinct labels present in the CLASSES column
df["CLASSES"].unique()

array(['not fire', 'fire'], dtype=object)

In [17]:
# Number of records per class label
df["CLASSES"].value_counts()

CLASSES
fire        137
not fire    106
Name: count, dtype: int64

In [18]:
# display all region 0 (Bejaia region)
is_region_zero = df["REGION"].eq(0)
df.loc[is_region_zero]

Unnamed: 0,DAY,MONTH,YEAR,TEMPERATURE,RH,WS,RAIN,FFMC,DMC,DC,ISI,BUI,FWI,CLASSES,REGION
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,26,9,2012,31,54,11,0.0,82.0,6.0,16.3,2.5,6.2,1.7,not fire,0
118,27,9,2012,31,66,11,0.0,85.7,8.3,24.9,4.0,9.0,4.1,fire,0
119,28,9,2012,32,47,14,0.7,77.5,7.1,8.8,1.8,6.8,0.9,not fire,0
120,29,9,2012,26,80,16,1.8,47.4,2.9,7.7,0.3,3.0,0.1,not fire,0


**Observation**
- There are 122 records for the Bejaia region (`REGION == 0`).

In [19]:
# Write the cleaned frame to CSV, leaving the index out of the file
df.to_csv(path_or_buf=CLEANED_DATASET_DIR, index=False)