<a href="https://colab.research.google.com/github/imarariyal/DSPL-ICW/blob/main/DSPL_ICW_DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import random
import warnings
# Silence every Python warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so pandas diagnostics (dtype or
# copy-related warnings, deprecations) are hidden too - consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
#load the raw indicator export into a dataframe and preview the first rows
raw_csv_path = '/content/indicators_lka.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
0,#country+name,#country+code,#date+year,#indicator+name,#indicator+code,#indicator+value+num
1,Sri Lanka,LKA,2022,Fertilizer consumption (% of fertilizer produc...,AG.CON.FERT.PT.ZS,1461.39775839426
2,Sri Lanka,LKA,2021,Fertilizer consumption (% of fertilizer produc...,AG.CON.FERT.PT.ZS,1461.39775839426
3,Sri Lanka,LKA,2020,Fertilizer consumption (% of fertilizer produc...,AG.CON.FERT.PT.ZS,2838.09421995635
4,Sri Lanka,LKA,2019,Fertilizer consumption (% of fertilizer produc...,AG.CON.FERT.PT.ZS,1803.55490605544


In [3]:
#dimensions of the freshly loaded frame as (rows, columns)
df.shape

(76315, 6)

In [4]:
#column names, non-null counts and dtypes of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76315 entries, 0 to 76314
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country Name    76315 non-null  object
 1   Country ISO3    76315 non-null  object
 2   Year            76315 non-null  object
 3   Indicator Name  76315 non-null  object
 4   Indicator Code  76315 non-null  object
 5   Value           76315 non-null  object
dtypes: object(6)
memory usage: 3.5+ MB


In [5]:
#per-column summary; for object (text) columns pandas reports count, unique, top and freq
df.describe()

Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
count,76315,76315,76315,76315,76315,76315
unique,2,2,65,3194,3195,46300
top,Sri Lanka,LKA,2016,Net migration,NV.AGR.TOTL.ZS,0
freq,76314,76314,3029,192,192,2737


In [6]:
#drop the tag row ('#country+name', '#date+year', ...) that sits directly under the header
#the row is matched by its content instead of by index label 0, so re-running this cell
#(including after the index has been reset) can neither fail nor delete a real data row
is_tag_row = df['Year'].astype(str).str.startswith('#')
df = df[~is_tag_row].copy()

In [7]:
#removing columns that dont add valuable insights
#the result is re-bound (no inplace=True) and errors='ignore' is passed so the cell
#can be re-run after the columns are already gone without raising KeyError
df = df.drop(columns=['Country ISO3', 'Indicator Code'], errors='ignore')

In [8]:
#shorter column names for readability
readable_names = {'Country Name': 'Country', 'Indicator Name': 'Indicator'}
df.rename(columns=readable_names, inplace=True)

In [9]:
#preview the frame after the column drops and renames
df.head()

Unnamed: 0,Country,Year,Indicator,Value
1,Sri Lanka,2022,Fertilizer consumption (% of fertilizer produc...,1461.39775839426
2,Sri Lanka,2021,Fertilizer consumption (% of fertilizer produc...,1461.39775839426
3,Sri Lanka,2020,Fertilizer consumption (% of fertilizer produc...,2838.09421995635
4,Sri Lanka,2019,Fertilizer consumption (% of fertilizer produc...,1803.55490605544
5,Sri Lanka,2018,Fertilizer consumption (% of fertilizer produc...,1312.52008353088


In [10]:
#count the missing entries in each column
df.isna().sum()

Unnamed: 0,0
Country,0
Year,0
Indicator,0
Value,0


In [11]:
#list every row that shares its indicator, year and value with at least one other row
dup_mask = df.duplicated(subset=['Indicator', 'Year', 'Value'], keep=False)
duplicates = df[dup_mask]
print(duplicates.sort_values(by=['Indicator', 'Year']))

         Country  Year                                          Indicator  \
22976  Sri Lanka  2000            Access to electricity (% of population)   
24229  Sri Lanka  2000            Access to electricity (% of population)   
67086  Sri Lanka  2000            Access to electricity (% of population)   
22975  Sri Lanka  2001            Access to electricity (% of population)   
24228  Sri Lanka  2001            Access to electricity (% of population)   
...          ...   ...                                                ...   
61386  Sri Lanka  2020  Women's share of population ages 15+ living wi...   
32733  Sri Lanka  2021  Women's share of population ages 15+ living wi...   
61385  Sri Lanka  2021  Women's share of population ages 15+ living wi...   
32732  Sri Lanka  2022  Women's share of population ages 15+ living wi...   
61384  Sri Lanka  2022  Women's share of population ages 15+ living wi...   

                  Value  
22976              70.3  
24229              70.3

In [12]:
#trim surrounding whitespace from the text columns so equal names compare equal
for text_col in ('Indicator', 'Country'):
    df[text_col] = df[text_col].str.strip()

#columns that together identify one record
dedup_keys = ['Country','Indicator', 'Year', 'Value']

#keep the first occurrence of each record and renumber the rows from 0
df = df.drop_duplicates(subset=dedup_keys).reset_index(drop=True)

#verify nothing is left over
print("Number of duplicates left:", df.duplicated(subset=dedup_keys).sum())

Number of duplicates left: 0


In [13]:
#data types of each column before any numeric conversion
df.dtypes

Unnamed: 0,0
Country,object
Year,object
Indicator,object
Value,object


In [14]:
#parse Year as a number; anything that cannot be parsed becomes NaN
year_numeric = pd.to_numeric(df['Year'], errors='coerce')
df['Year'] = year_numeric

In [15]:
#Keep data from 2000
#rows whose Year is NaN fail the comparison and are therefore removed here as well
#.copy() makes the result an independent frame, so column assignments made on it
#afterwards do not act on a slice of the unfiltered data
first_year = 2000
df = df[df['Year'] >= first_year].copy()

In [16]:
#normalise the Value column: force text, remove thousands separators and
#dollar signs, trim whitespace, then parse as numbers (unparseable -> NaN)
value_text = df['Value'].astype(str)
for symbol in (',', '$'):
    value_text = value_text.str.replace(symbol, '', regex=False)
value_text = value_text.str.strip()

df['Value'] = pd.to_numeric(value_text, errors='coerce')

In [17]:
#drop rows where Year or Value is missing (for example, could not be parsed as a number)
#the result is re-bound instead of using inplace=True, which would mutate a frame
#that was produced by a boolean filter
df = df.dropna(subset=['Year', 'Value'])

In [18]:
#confirm the column dtypes after the numeric conversions
df.dtypes

Unnamed: 0,0
Country,object
Year,int64
Indicator,object
Value,float64


In [19]:
#Define important indicators
important_indicators = [
    "GDP (current US$)",
    "Inflation, consumer prices (annual %)",
    "Unemployment rate (% of total labor force)",
    "Exports of goods and services (current US$)",
    "Imports of goods and services (current US$)",
    "Foreign direct investment, net inflows (BoP, current US$)",
    "Government expenditure (% of GDP)",
    "Trade in services (% of GDP)",
    "Net migration",
    "Life expectancy at birth, female (years)",
    "Life expectancy at birth, male (years)",
    "Mortality rate, under-5 (per 1,000 live births)",
    "Population growth (annual %)"
]

#isin() needs an exact name match, so a requested indicator that is spelled
#differently in the data (or absent from it) would vanish without any notice.
#print() is used rather than warnings.warn because warnings are silenced globally.
available_indicators = set(df['Indicator'].unique())
missing_indicators = [
    name for name in important_indicators if name not in available_indicators
]
if missing_indicators:
    print("Indicators not found in the data:", missing_indicators)

#keep only the requested indicators, as an independent frame
df = df[df['Indicator'].isin(important_indicators)].copy()

['Mortality rate, under-5 (per 1,000 live births)' 'Net migration'
 'Trade in services (% of GDP)'
 'Foreign direct investment, net inflows (BoP, current US$)'
 'Inflation, consumer prices (annual %)'
 'Exports of goods and services (current US$)'
 'Imports of goods and services (current US$)' 'GDP (current US$)'
 'Life expectancy at birth, female (years)'
 'Life expectancy at birth, male (years)' 'Population growth (annual %)']
        Country  Year                                        Indicator  Value
4079  Sri Lanka  2022  Mortality rate, under-5 (per 1,000 live births)    6.5
4080  Sri Lanka  2021  Mortality rate, under-5 (per 1,000 live births)    6.7
4081  Sri Lanka  2020  Mortality rate, under-5 (per 1,000 live births)    7.0
4082  Sri Lanka  2019  Mortality rate, under-5 (per 1,000 live births)    7.3
4083  Sri Lanka  2018  Mortality rate, under-5 (per 1,000 live births)    7.6


In [20]:
#distinct indicator names that remain after filtering
df['Indicator'].unique()

array(['Mortality rate, under-5 (per 1,000 live births)', 'Net migration',
       'Trade in services (% of GDP)',
       'Foreign direct investment, net inflows (BoP, current US$)',
       'Inflation, consumer prices (annual %)',
       'Exports of goods and services (current US$)',
       'Imports of goods and services (current US$)', 'GDP (current US$)',
       'Life expectancy at birth, female (years)',
       'Life expectancy at birth, male (years)',
       'Population growth (annual %)'], dtype=object)

In [21]:
#distinct years that remain after filtering (in order of first appearance, not sorted)
df['Year'].unique()

array([2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012,
       2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001,
       2000, 2023])

In [22]:
#distinct numeric values that remain after filtering
#NOTE(review): this prints the whole array - a summary such as describe() may be easier to read
df['Value'].unique()

array([ 6.50000000e+00,  6.70000000e+00,  7.00000000e+00,  7.30000000e+00,
        7.60000000e+00,  7.90000000e+00,  8.30000000e+00,  8.80000000e+00,
        9.30000000e+00,  9.90000000e+00,  1.04000000e+01,  1.08000000e+01,
        1.12000000e+01,  2.08000000e+01,  1.20000000e+01,  1.26000000e+01,
        1.33000000e+01,  1.41000000e+01,  2.87000000e+01,  1.51000000e+01,
        1.56000000e+01,  1.61000000e+01,  1.67000000e+01, -2.80110000e+04,
       -2.74350000e+04, -2.95860000e+04, -2.98750000e+04, -2.84260000e+04,
       -2.86070000e+04, -2.23240000e+04, -1.83880000e+04, -2.97970000e+04,
       -3.36720000e+04, -3.77540000e+04, -4.19550000e+04, -9.08390000e+04,
       -9.80600000e+04, -8.11300000e+04, -9.71440000e+04, -1.09695000e+05,
       -1.07045000e+05, -1.18594000e+05, -8.47950000e+04, -1.07573000e+05,
       -1.05718000e+05,  1.92730000e+04,  1.23894000e+05,  8.80557090e+00,
        5.41510512e+00,  3.79595550e+00,  6.22902106e+00,  1.35923419e+01,
        1.37384767e+01,  

In [23]:
#overview of the cleaned frame: its dimensions first, then the leading rows
for summary in (df.shape, df.head()):
    print(summary)

(251, 4)
        Country  Year                                        Indicator  Value
4079  Sri Lanka  2022  Mortality rate, under-5 (per 1,000 live births)    6.5
4080  Sri Lanka  2021  Mortality rate, under-5 (per 1,000 live births)    6.7
4081  Sri Lanka  2020  Mortality rate, under-5 (per 1,000 live births)    7.0
4082  Sri Lanka  2019  Mortality rate, under-5 (per 1,000 live births)    7.3
4083  Sri Lanka  2018  Mortality rate, under-5 (per 1,000 live births)    7.6


In [24]:
#write the cleaned frame to disk, leaving out the row index
cleaned_csv_path = 'cleaned_indicators_lka.csv'
df.to_csv(cleaned_csv_path, index=False)