## Basic EDA and converting an Excel file to CSV format

In [276]:
# We start by importing the right libraries.
# In this particular case, the pandas library alone is enough.
import pandas as pd

# Load the 'Basic Indicators' worksheet of the workbook into a DataFrame.
excel_data = pd.read_excel(
    'xlsx.xlsx',
    sheet_name='Basic Indicators',
)

# Show the first five rows as a quick sanity check of what was loaded.
excel_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,TABLE 1. BASIC INDICATORS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,,,,,,,,,,,...,,,,,,,,,,
1,,Countries and areas,Under–5 mortality rank,Under–5 mortality rate,,Under–5 mortality rate by sex\n2016,,Infant mortality rate (under 1),,Neonatal mortality rate,...,Total adult literacy rate (%),,Primary school net enrolment ratio ...,,,,,,,
2,,,,1990,2016.0,male,female,1990,2016.0,2016,...,2011−2016*,,2011−2016*,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,0.0
4,,Afghanistan,25,177,70.0,74,66,120,53.0,40,...,31.7411,,–,,,,,,,


In [277]:
# The first five rows revealed 4 leading rows that can be dropped.
# Before acting on that, inspect the last five rows of the data as well.
excel_data.tail(n=5)

Unnamed: 0.1,Unnamed: 0,TABLE 1. BASIC INDICATORS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
240,,"Under-5, infant and neonatal mortality rates –...",,,,,,,,,...,,,,,,,,,,
241,,Total population and births – United Nations P...,,,,,,,,,...,,,,,,,,,,
242,,Under-five deaths – United Nations Inter-agenc...,,,,,,,,,...,,,,,,,,,,
243,,Life expectancy at birth – United Nations Popu...,,,,,,,,,...,,,,,,,,,,
244,,Total adult literacy rate and primary school e...,,,,,,,,,...,,,,,,,,,,


In [278]:
# The bottom rows are inconsistent too: checking the original Excel file
# showed that the last 39 rows need to be removed.

# Record the total row count so the trailing slice can be taken by position.
numRows = len(excel_data)

# Remove the trailing 39 rows.
trailing_labels = excel_data.index[numRows - 39:numRows]
excel_data.drop(index=trailing_labels, inplace=True)

# Remove the 4 leading rows of what is left.
leading_labels = excel_data.index[:4]
excel_data.drop(index=leading_labels, inplace=True)

# Display the first five rows to confirm the unnecessary top rows are gone.
excel_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,TABLE 1. BASIC INDICATORS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
4,,Afghanistan,25,177,70,74,66,120,53,40,...,31.7411,,–,,,,,,,
5,,Albania,114,40,14,15,12,35,12,6,...,97.247,,95.5173,,,,,,,
6,,Algeria,78,49,25,27,24,41,22,16,...,75.136,x,97.0622,,,,,,,
7,,Andorra,179,9,3,3,3,7,2,1,...,100.0,,–,,,,,,,
8,,Angola,17,221,83,88,76,131,55,29,...,66.0301,,84.0123,,,,,,,


In [279]:
# Now inspect the last five rows to confirm the unnecessary bottom rows are gone.
excel_data.tail(n=5)

Unnamed: 0.1,Unnamed: 0,TABLE 1. BASIC INDICATORS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
201,,Venezuela (Bolivarian Republic of),100,30,16,18,15,25,14,10,...,97.1271,,89.8568,,,,,,,
202,,Viet Nam,82,51,22,25,18,37,17,12,...,93.5204,x,97.9651,,,,,,,
203,,Yemen,38,126,55,59,51,88,43,27,...,–,,84.8373,,,,,,,
204,,Zambia,33,182,63,68,58,110,44,23,...,83.0077,x,87.4041,,,,,,,
205,,Zimbabwe,37,75,56,62,51,50,40,23,...,88.6934,,85.8624,,,,,,,


In [280]:
# Now that the rows are cleaned up, let's focus on the unnecessary columns.
# The leading 'Unnamed: 0' column needs to be removed. It is dropped directly
# by its real label: renaming it to a temporary name first adds no safety,
# costs an extra DataFrame copy, and would make a missing column surface as a
# KeyError about the temporary name instead of the real one.
excel_data.drop(columns=['Unnamed: 0'], inplace=True)

# Print the top rows to ensure we have the right cleaned-up data (so far).
excel_data.head()

Unnamed: 0,TABLE 1. BASIC INDICATORS,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
4,Afghanistan,25,177,70,74,66,120,53,40,34656.0,...,31.7411,,–,,,,,,,
5,Albania,114,40,14,15,12,35,12,6,2926.35,...,97.247,,95.5173,,,,,,,
6,Algeria,78,49,25,27,24,41,22,16,40606.1,...,75.136,x,97.0622,,,,,,,
7,Andorra,179,9,3,3,3,7,2,1,77.281,...,100.0,,–,,,,,,,
8,Angola,17,221,83,88,76,131,55,29,28813.5,...,66.0301,,84.0123,,,,,,,


In [281]:
# Give the first column a meaningful label: 'Country Name'.
excel_data = excel_data.rename(
    {'TABLE 1. BASIC INDICATORS': 'Country Name'},
    axis='columns',
)

# Display the first five rows again to check the cleaned-up data (so far).
excel_data.head(n=5)

Unnamed: 0,Country Name,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
4,Afghanistan,25,177,70,74,66,120,53,40,34656.0,...,31.7411,,–,,,,,,,
5,Albania,114,40,14,15,12,35,12,6,2926.35,...,97.247,,95.5173,,,,,,,
6,Algeria,78,49,25,27,24,41,22,16,40606.1,...,75.136,x,97.0622,,,,,,,
7,Andorra,179,9,3,3,3,7,2,1,77.281,...,100.0,,–,,,,,,,
8,Angola,17,221,83,88,76,131,55,29,28813.5,...,66.0301,,84.0123,,,,,,,


In [282]:
# Next we rename the remaining 'Unnamed: N' columns to zero-based numeric
# labels ('Unnamed: 2' -> '0', 'Unnamed: 3' -> '1', ...).
# The whole mapping is built once and applied with a single rename call,
# instead of one rename (and one full DataFrame copy) per column.
# Labels in the mapping that do not exist in the frame are ignored by rename.
unnamed_to_number = {
    'Unnamed: ' + str(x): str(x - 2)
    for x in range(2, len(excel_data.columns) + 1)
}
excel_data = excel_data.rename(columns=unnamed_to_number)

# Print the top rows to ensure we have the right cleaned-up data (so far).
excel_data.head()

Unnamed: 0,Country Name,0,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,20,21
4,Afghanistan,25,177,70,74,66,120,53,40,34656.0,...,31.7411,,–,,,,,,,
5,Albania,114,40,14,15,12,35,12,6,2926.35,...,97.247,,95.5173,,,,,,,
6,Algeria,78,49,25,27,24,41,22,16,40606.1,...,75.136,x,97.0622,,,,,,,
7,Andorra,179,9,3,3,3,7,2,1,77.281,...,100.0,,–,,,,,,,
8,Angola,17,221,83,88,76,131,55,29,28813.5,...,66.0301,,84.0123,,,,,,,


In [283]:
# There are unnecessary columns on the right side of the DataFrame.
# Start by removing column '13'.
excel_data.drop(columns=['13'], inplace=True)

# Display the first five rows to check the cleaned-up data (so far).
excel_data.head(n=5)

Unnamed: 0,Country Name,0,1,2,3,4,5,6,7,8,...,11,12,14,15,16,17,18,19,20,21
4,Afghanistan,25,177,70,74,66,120,53,40,34656.0,...,63.673,31.7411,–,,,,,,,
5,Albania,114,40,14,15,12,35,12,6,2926.35,...,78.345,97.247,95.5173,,,,,,,
6,Algeria,78,49,25,27,24,41,22,16,40606.1,...,76.078,75.136,97.0622,,,,,,,
7,Andorra,179,9,3,3,3,7,2,1,77.281,...,–,100.0,–,,,,,,,
8,Angola,17,221,83,88,76,131,55,29,28813.5,...,61.547,66.0301,84.0123,,,,,,,


In [284]:
# Then we drop columns '15' through '21', which are unnecessary.
# Note that range(15, 22) stops before 22, so '21' is the last label removed.
# The labels are built up front so they are removed with a single drop call
# rather than one in-place drop per column.
excel_data.drop(columns=[str(x) for x in range(15, 22)], inplace=True)

# Print the top rows to ensure we have the right cleaned-up data (so far).
excel_data.head()

Unnamed: 0,Country Name,0,1,2,3,4,5,6,7,8,9,10,11,12,14
4,Afghanistan,25,177,70,74,66,120,53,40,34656.0,1142.96,80,63.673,31.7411,–
5,Albania,114,40,14,15,12,35,12,6,2926.35,34.75,0,78.345,97.247,95.5173
6,Algeria,78,49,25,27,24,41,22,16,40606.1,949.277,24,76.078,75.136,97.0622
7,Andorra,179,9,3,3,3,7,2,1,77.281,–,0,–,100.0,–
8,Angola,17,221,83,88,76,131,55,29,28813.5,1180.97,96,61.547,66.0301,84.0123


In [285]:
# Final relabel: column '14' becomes '13' so the numeric labels are contiguous.
excel_data = excel_data.rename({'14': '13'}, axis='columns')

# Display the first five rows to check the cleaned-up data (so far).
excel_data.head(n=5)

Unnamed: 0,Country Name,0,1,2,3,4,5,6,7,8,9,10,11,12,13
4,Afghanistan,25,177,70,74,66,120,53,40,34656.0,1142.96,80,63.673,31.7411,–
5,Albania,114,40,14,15,12,35,12,6,2926.35,34.75,0,78.345,97.247,95.5173
6,Algeria,78,49,25,27,24,41,22,16,40606.1,949.277,24,76.078,75.136,97.0622
7,Andorra,179,9,3,3,3,7,2,1,77.281,–,0,–,100.0,–
8,Angola,17,221,83,88,76,131,55,29,28813.5,1180.97,96,61.547,66.0301,84.0123


In [286]:
# The data is ready to be written to CSV; do one last check on the bottom rows.
excel_data.tail(n=5)

Unnamed: 0,Country Name,0,1,2,3,4,5,6,7,8,9,10,11,12,13
201,Venezuela (Bolivarian Republic of),100,30,16,18,15,25,14,10,31568.2,602.103,10,74.545,97.1271,89.8568
202,Viet Nam,82,51,22,25,18,37,17,12,94569.1,1581.76,34,76.253,93.5204,97.9651
203,Yemen,38,126,55,59,51,88,43,27,27584.2,867.303,48,64.953,–,84.8373
204,Zambia,33,182,63,68,58,110,44,23,16591.4,620.261,39,61.874,83.0077,87.4041
205,Zimbabwe,37,75,56,62,51,50,40,23,16150.4,535.37,30,61.163,88.6934,85.8624


In [287]:
# Write the cleaned-up data to 'excel.csv', leaving out the DataFrame index.
excel_data.to_csv(path_or_buf='excel.csv', index=False)