## This dataset describes some important indicators for the countries of the world. We will try to understand the data and discover some interesting things, so let's dive into the analysis.

In [53]:
#  importing libraries

import os                         # reading environment overrides (e.g. the data path)

import matplotlib.pyplot as plt   # for visualisation
import numpy as np                # for numerical computation
import pandas as pd               # dealing with the data
import seaborn as sns             # for visualisation

# for showing plot in jupyter notebook
%matplotlib inline

In [54]:
# loading and reading data

# The CSV location can be overridden with the COUNTRIES_CSV environment variable so the
# notebook also runs on machines that do not have this exact folder layout; when the
# variable is unset the original local path is used, so existing runs are unaffected.
data_path = os.environ.get(
    "COUNTRIES_CSV",
    r"C:\Users\Asus\OneDrive\Documents\Data science\Datasets\countries-of-the-world\countries of the world.csv",
)

# NOTE(review): many numeric columns come back as text (object dtype) from this call;
# they are converted to floats in a later cell.
data = pd.read_csv(data_path)

In [55]:
# Preview the first five rows of the freshly loaded frame.

data.head()

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,0,2306,16307,700.0,360,32,1213,22,8765,1,466,2034,38.0,24.0,38.0
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,232.0,188.0,579.0
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,4,-39,31,6000.0,700,781,322,25,9653,1,1714,461,101.0,6.0,298.0
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,0,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,


In [56]:
# Show the column labels of the loaded frame.
data.columns

Index(['Country', 'Region', 'Population', 'Area (sq. mi.)',
       'Pop. Density (per sq. mi.)', 'Coastline (coast/area ratio)',
       'Net migration', 'Infant mortality (per 1000 births)',
       'GDP ($ per capita)', 'Literacy (%)', 'Phones (per 1000)', 'Arable (%)',
       'Crops (%)', 'Other (%)', 'Climate', 'Birthrate', 'Deathrate',
       'Agriculture', 'Industry', 'Service'],
      dtype='object')

In [57]:
# Show the dtype pandas assigned to each column.
data.dtypes

Country                                object
Region                                 object
Population                              int64
Area (sq. mi.)                          int64
Pop. Density (per sq. mi.)             object
Coastline (coast/area ratio)           object
Net migration                          object
Infant mortality (per 1000 births)     object
GDP ($ per capita)                    float64
Literacy (%)                           object
Phones (per 1000)                      object
Arable (%)                             object
Crops (%)                              object
Other (%)                              object
Climate                                object
Birthrate                              object
Deathrate                              object
Agriculture                            object
Industry                               object
Service                                object
dtype: object

## Check for missing values

In [58]:

# Percentage of missing entries per column, printed as "<column> <percent>".
missing_share = data.isnull().sum() / len(data) * 100
for column_name, percent in missing_share.items():
    print(column_name, percent)
    

Country 0.0
Region 0.0
Population 0.0
Area (sq. mi.) 0.0
Pop. Density (per sq. mi.) 0.0
Coastline (coast/area ratio) 0.0
Net migration 1.3215859030837005
Infant mortality (per 1000 births) 1.3215859030837005
GDP ($ per capita) 0.4405286343612335
Literacy (%) 7.929515418502203
Phones (per 1000) 1.762114537444934
Arable (%) 0.881057268722467
Crops (%) 0.881057268722467
Other (%) 0.881057268722467
Climate 9.691629955947137
Birthrate 1.3215859030837005
Deathrate 1.762114537444934
Agriculture 6.607929515418502
Industry 7.048458149779736
Service 6.607929515418502


## As we can see, the maximum percentage of missing values is about 9.7% (the Climate column).

In [59]:
# Replace every missing entry, in every column, with 0.
# NOTE(review): 0 is also written into columns such as Literacy, Climate and GDP,
# where it reads like a real measurement - consider a per-column strategy.
data = data.fillna(value=0)

In [60]:
# Print the percentage of missing entries per column again, now on the filled frame.
row_count = len(data)
for col in data.columns:
    null_count = data[col].isnull().sum()
    print(col, null_count / row_count * 100)

Country 0.0
Region 0.0
Population 0.0
Area (sq. mi.) 0.0
Pop. Density (per sq. mi.) 0.0
Coastline (coast/area ratio) 0.0
Net migration 0.0
Infant mortality (per 1000 births) 0.0
GDP ($ per capita) 0.0
Literacy (%) 0.0
Phones (per 1000) 0.0
Arable (%) 0.0
Crops (%) 0.0
Other (%) 0.0
Climate 0.0
Birthrate 0.0
Deathrate 0.0
Agriculture 0.0
Industry 0.0
Service 0.0


## Some column names are awkward to type, so let's rename them to shorter, code-friendly names

In [69]:
# Mapping from the original verbose column labels to short, identifier-like names.
# Columns that are not listed here keep their original label ('Net migration' is
# still referenced by that name in the conversion cell).
new_column_name = {
    'Area (sq. mi.)': 'Area',
    'Pop. Density (per sq. mi.)': 'Pop_density',
    'Coastline (coast/area ratio)': 'Coastline',
    'Infant mortality (per 1000 births)': 'Infant_mortality',
    'GDP ($ per capita)': 'GDP_per_capita',
    'Literacy (%)': 'Literacy_percent',
    'Phones (per 1000)': 'Phones_per_k',
    'Arable (%)': 'Arable',
    'Crops (%)': 'Crops',
    'Other (%)': 'Other',
}
data = data.rename(columns=new_column_name)

In [102]:
# Preview the first five rows with the shortened column names.
# NOTE(review): the saved output below already shows Pop_density as converted floats
# and the execution counts are not sequential - re-run the notebook top to bottom
# (Restart & Run All) to confirm the displayed state matches the code order.
data.head()

Unnamed: 0,Country,Region,Population,Area,Pop_density,Coastline,Net migration,Infant_mortality,GDP_per_capita,Literacy_percent,Phones_per_k,Arable,Crops,Other,Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0,2306,16307,700.0,360,32,1213,22,8765,1,466,2034,38,24,38
1,Albania,EASTERN EUROPE,3581655,28748,124.6,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,232,188,579
2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,4,-39,31,6000.0,700,781,322,25,9653,1,1714,461,101,6,298
3,American Samoa,OCEANIA,57794,199,290.4,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,0,0,0
4,Andorra,WESTERN EUROPE,71201,468,152.1,0,66,405,19000.0,1000,4972,222,0,9778,3,871,625,0,0,0


## Now we can see that many numeric columns are stored as text with a comma as the decimal separator, so they are not in the correct format. Let's convert them to proper floating-point numbers.

In [113]:
def replace_commas(columns, frame=None):
    """Convert comma-decimal text columns to floats, in place.

    Every value in each named column is cast to ``str``, its commas are
    replaced with dots, and the result is cast to ``float``.

    Parameters
    ----------
    columns : iterable
        Column labels to convert. Anything that yields labels when iterated
        works (a list of names, an Index, or a DataFrame, whose iteration
        yields its column labels).
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``data`` frame is
        used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``head()`` of the modified frame, for a quick visual check.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a float after the replacement.
    """
    # Resolve the target lazily so the default keeps following the global `data`.
    target = data if frame is None else frame
    for col in columns:
        # Vectorised string replacement instead of a per-value Python loop;
        # regex=False treats ',' as a literal character.
        as_text = target[col].astype(str).str.replace(',', '.', regex=False)
        target[col] = as_text.astype(float)
    return target.head()

In [114]:
# Labels of the columns whose values are text with a comma as the decimal separator.
comma_decimal_labels = [
    'Pop_density',
    'Coastline',
    'Net migration',
    'Infant_mortality',
    'Literacy_percent',
    'Phones_per_k',
    'Arable',
    'Crops',
    'Other',
    'Birthrate',
    'Deathrate',
    'Agriculture',
    'Industry',
    'Service',
]
# `columns` is a DataFrame slice; iterating it yields the column labels,
# which is what replace_commas loops over.
columns = data[comma_decimal_labels]
replace_commas(columns)

Unnamed: 0,Country,Region,Population,Area,Pop_density,Coastline,Net migration,Infant_mortality,GDP_per_capita,Literacy_percent,Phones_per_k,Arable,Crops,Other,Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,1,46.6,20.34,0.38,0.24,0.38
1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,3,15.11,5.22,0.232,0.188,0.579
2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.39,31.0,6000.0,70.0,78.1,3.22,0.25,96.53,1,17.14,4.61,0.101,0.6,0.298
3,American Samoa,OCEANIA,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,10.0,15.0,75.0,2,22.46,3.27,0.0,0.0,0.0
4,Andorra,WESTERN EUROPE,71201,468,152.1,0.0,6.6,4.05,19000.0,100.0,497.2,2.22,0.0,97.78,3,8.71,6.25,0.0,0.0,0.0
