### **Import necessary Libraries**

In [1]:
import pandas as pd
from sklearn.impute import KNNImputer

### **Import Data**

In [2]:
original = pd.read_csv('/content/sample_data/country_based_data.csv')

### **Observe the Data**

In [3]:
original.shape

(12622, 51)

In [None]:
original.isnull().sum()

A lot of Null values found in all the columns

### **Make a Copy of the original Data**

In [5]:
data_copy = original.copy()

In [6]:
data_copy.drop('Unnamed: 0',axis='columns',inplace=True)

### **Check the Datatypes of columns**

In [None]:
data_copy.dtypes

### **Check number of instances of each country**

In [None]:
for country_name in list(set(data_copy['country'])):
    a = data_copy[data_copy['country'] == country_name].shape
    print(a)

Turkey and Czech Republic have incomplete data (Later we find out that all columns are empty, therefore we delete the data of these countries)

### **Replace columns that are completely Null with 0 for each country seperately**

In [9]:
data_copy = data_copy[~data_copy['country'].isin(['Turkey','Czech Republic'])]

for country_name in set(data_copy['country']):
    data_concerned = data_copy[data_copy['country'] == country_name]
    all_null = data_concerned.columns[data_concerned.isnull().sum() == 63]
    data_copy.loc[data_copy['country'] == country_name, all_null] = data_copy.loc[data_copy['country'] == country_name, all_null].fillna(0)


We fill 0 for columns that are completely 0 for every country to prevent any issue during KNN Imputer, knn Imputing is done seperately for each country  to prevent imputer taking data of seperate countries into consideration.

In [10]:
data_copy

Unnamed: 0,country,date,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,...,multidimensional_poverty_headcount_ratio%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides
0,Afghanistan,1960-01-01,,,,,,,,,...,,0.0,50.340,31.921,32.535,8622466.0,7898093.0,,,
1,Afghanistan,1961-01-01,57.801696,,652230.0,327.0,,,,,...,,0.0,50.443,31.349,33.068,8790140.0,8026804.0,,,
2,Afghanistan,1962-01-01,57.893688,,652230.0,327.0,,,,,...,,0.0,50.570,30.845,33.547,8969047.0,8163985.0,,,
3,Afghanistan,1963-01-01,57.970348,,652230.0,327.0,,,,,...,,0.0,50.703,30.359,34.016,9157465.0,8308019.0,,,
4,Afghanistan,1964-01-01,58.066940,,652230.0,327.0,,,,,...,,0.0,50.831,29.867,34.494,9355514.0,8458694.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12617,Zimbabwe,2018-01-01,41.876696,45.332093,386850.0,657.0,4.469742,-1.227581,0.125622,45.400288,...,0.0,,32.074,7.972,61.414,15052184.0,10204026.0,-1.140975,0.123371,4.876369
12618,Zimbabwe,2019-01-01,41.876696,45.213002,386850.0,657.0,6.927164,-1.273280,0.135450,46.682095,...,0.0,50.3,31.518,8.043,61.292,15354608.0,10408889.0,-1.164705,0.118156,5.145035
12619,Zimbabwe,2020-01-01,41.876696,45.093912,386850.0,657.0,5.118949,-1.289440,0.142061,52.747667,...,0.0,,31.009,8.132,61.124,15669666.0,10617452.0,-1.113716,0.120647,4.977770
12620,Zimbabwe,2021-01-01,,,,,,-1.257897,0.154067,48.979927,...,0.0,,30.537,9.057,59.253,15993524.0,10827136.0,-1.136934,0.121119,6.139985


In [None]:
data_copy.isnull().sum()

### **Imputing using KNN Imputer for Missing Values**

In [22]:
imputing_columns = data_copy.drop(['country', 'date'], axis='columns').columns

data_cleaned = pd.DataFrame(columns=list(imputing_columns) + ['Country', 'year'])  # Corrected column assignment

for country_name in set(data_copy['country']):
    data_imputed = KNNImputer(n_neighbors=3).fit_transform(data_copy[data_copy['country'] == country_name][imputing_columns])
    imputed_df = pd.DataFrame(data_imputed, columns=imputing_columns)
    imputed_df['Country'] = country_name
    imputed_df['year'] = pd.to_datetime(data_copy.loc[data_copy['country'] == country_name, 'date']).dt.year.values
    data_cleaned = pd.concat([data_cleaned,imputed_df], ignore_index=True)

In [25]:
data_cleaned

Unnamed: 0,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,renewvable_energy_consumption%,electric_power_consumption,...,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides,Country,year
0,2.928752,3.540453,71020.0,78.0,0.000000,0.043600,0.285655,100.000000,0.063333,1058.971352,...,41.814000,15.546,48.811,133426.0,35358.0,-0.481803,0.228029,0.726992,United Arab Emirates,1960
1,2.928752,3.540453,71020.0,78.0,0.000000,0.043600,0.285655,100.000000,0.063333,1058.971352,...,41.392000,14.963,49.695,140984.0,36116.0,-0.481803,0.228029,0.726992,United Arab Emirates,1961
2,2.928752,3.540453,71020.0,78.0,0.000000,0.043600,0.285655,100.000000,0.063333,1058.971352,...,41.062000,14.290,50.686,148877.0,36850.0,-0.481803,0.228029,0.726992,United Arab Emirates,1962
3,2.928752,3.540453,71020.0,78.0,0.000000,0.043600,0.285655,100.000000,0.063333,1058.971352,...,40.601000,13.610,51.584,157006.0,37535.0,-0.481803,0.228029,0.726992,United Arab Emirates,1963
4,2.928752,3.540453,71020.0,78.0,0.000000,0.043600,0.285655,100.000000,0.063333,1058.971352,...,40.003000,12.667,52.848,165305.0,38154.0,-0.481803,0.228029,0.726992,United Arab Emirates,1964
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12595,20.625754,43.266442,472710.0,1604.0,12.062652,-1.144107,0.125622,62.200000,78.570000,266.456697,...,36.716000,8.379,61.180,25076747.0,10939982.0,-1.133223,0.123371,3.990775,Cameroon,2018
12596,20.625754,43.147977,472710.0,1604.0,12.674736,-1.194021,0.132429,63.176895,78.790000,266.456697,...,36.147000,8.160,61.584,25782341.0,11094657.0,-1.209367,0.117767,3.990775,Cameroon,2019
12597,20.625754,43.029511,472710.0,1604.0,9.383638,-1.114791,0.135698,64.311874,78.940000,266.456697,...,35.507000,8.419,60.833,26491087.0,11242817.0,-1.205425,0.120301,4.537375,Cameroon,2020
12598,20.625754,43.147977,472710.0,1604.0,8.691039,-1.092714,0.146758,65.446709,78.766667,266.456697,...,34.938000,8.582,60.333,27198628.0,11383170.0,-1.160156,0.120698,3.990775,Cameroon,2021


In [26]:
data_cleaned.isnull().sum()

agricultural_land%                           0
forest_land%                                 0
land_area                                    0
avg_precipitation                            0
trade_in_services%                           0
control_of_corruption_estimate               0
control_of_corruption_std                    0
access_to_electricity%                       0
renewvable_energy_consumption%               0
electric_power_consumption                   0
CO2_emisions                                 0
other_greenhouse_emisions                    0
population_density                           0
inflation_annual%                            0
real_interest_rate                           0
risk_premium_on_lending                      0
research_and_development_expenditure%        0
central_goverment_debt%                      0
tax_revenue%                                 0
expense%                                     0
goverment_effectiveness_estimate             0
goverment_eff

### **Exporting the Cleaned Data**

In [27]:
data_cleaned.to_csv('country_data_cleaned.csv')