In [509]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
import seaborn as sns
import category_encoders as ce
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
warnings.filterwarnings('ignore')
%matplotlib inline


In [510]:
df = pd.read_csv('nigeria-population-1991-2016.csv')
df

Unnamed: 0,state,indicator,item,Unit,Date,Value
0,Nigeria,Population of Nigeria,Total (%),%,1991,100.0
1,Nigeria,Population of Nigeria,Total (%),%,2006,100.0
2,Nigeria,Population of Nigeria,Total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,Total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,Total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74 YEARS,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79 YEARS,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84 YEARS,Persons,2006,12813.0
566,Nigeria,Separated,85+ YEARS,Persons,2006,11483.0


In [511]:
df.describe()

Unnamed: 0,Date,Value
count,568.0,568.0
mean,2005.653169,8847953.0
std,2.788448,28877420.0
min,1991.0,4.7
25%,2006.0,35330.25
50%,2006.0,322067.0
75%,2006.0,3088511.0
max,2016.0,193392500.0


In [512]:
df.shape

(568, 6)

In [513]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   state      568 non-null    object 
 1   indicator  568 non-null    object 
 2   item       568 non-null    object 
 3   Unit       568 non-null    object 
 4   Date       568 non-null    int64  
 5   Value      568 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 26.8+ KB


In [514]:
df.isnull().sum()

state        0
indicator    0
item         0
Unit         0
Date         0
Value        0
dtype: int64

In [515]:
df.state.unique()

array(['Nigeria'], dtype=object)

Population of Nigeria: The total population of Nigeria.

Rural: The number of people living in rural areas in Nigeria.

Total: The total population of Nigeria (same as "Population of Nigeria").

Urban: The number of people living in urban areas in Nigeria.

Female: The number of females in Nigeria.

Sex Ratio (%): The percentage of males in Nigeria, calculated as (male population / total population) * 100.

Male: The number of males in Nigeria.

Projected Population of Nigeria: The projected total population of Nigeria for a specific time period, based on certain assumptions.

Never Married: The number of people in Nigeria who have never been married.

Married: The number of people in Nigeria who are currently married.

Divorced: The number of people in Nigeria who are divorced.

Separated: The number of people in Nigeria who are separated but not divorced.

Widowed: The number of people in Nigeria who are widowed.

Both Sexes: The total population of Nigeria (same as "Population of Nigeria").

Head of Household: The number of people in Nigeria who are the head of their household.

Spouse: The number of people in Nigeria who are a spouse in a household.

Child: The number of people in Nigeria who are a child in a household.

Parent: The number of people in Nigeria who are a parent in a household.

Brother / Sister: The number of people in Nigeria who are a brother or sister in a household.

Other Blood Relation: The number of people in Nigeria who are a blood relative other than a sibling in a household.

Non Blood Relationship: The number of people in Nigeria who are in a non-blood relationship in a household.

Institutional Household: The number of people in Nigeria who live in an institutional household, such as a prison or nursing home.

In [516]:
df.indicator.unique()

array(['Population of Nigeria', 'Rural', 'Total', 'Urban', 'Female',
       'Sex Ratio (%)', 'Male', 'Projected Population of Nigeria',
       'Never Married', 'Married', 'Divorced', 'Separated', 'Widowed',
       'Both Sexes', 'Head of Household', 'Spouse', 'Child', 'Parent',
       'Brother / Sister', 'Other Blood Relation',
       'Non Blood Relationship', 'Institutional Household'], dtype=object)

In [517]:
df_pivot = df.pivot_table(index="Date", columns="indicator", values="Value", aggfunc="sum")
df_pivot


indicator,Both Sexes,Brother / Sister,Child,Divorced,Female,Head of Household,Institutional Household,Male,Married,Never Married,...,Parent,Population of Nigeria,Projected Population of Nigeria,Rural,Separated,Sex Ratio (%),Spouse,Total,Urban,Widowed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1991,,,,,44462612.0,,,44529608.0,,,...,,177984621.0,,195.3,,100.15,,186.4,184.2,
2006,280863580.0,9510590.0,149692172.0,2814456.0,138172604.0,50193022.0,10405110.0,142690976.0,195697268.0,181750692.0,...,2674316.0,280863780.0,,,3366752.0,,40386076.0,664558321.0,,7697204.0
2007,,,,,,,,,,,...,,144998281.0,289851215.0,,,,,,,
2008,,,,,,,,,,,...,,149713264.0,299126454.0,,,,,,,
2009,,,,,,,,,,,...,,154581566.0,308698500.0,,,,,,,
2010,,,,,,,,,,,...,,159608173.0,,,,,,,,
2011,,,,,,,,,,,...,,164798232.0,,,,,,,,
2012,,,,,,,,,,,...,,170157060.0,,,,,,,,
2013,,,,,,,,,,,...,,175690143.0,,,,,,,,
2014,,,,,,,,,,,...,,181403148.0,,,,,,,,


In [518]:
df['indicator'] = df['indicator'].replace('Sex Ratio (%)', 'Sex Ratio')
df['indicator'] = df['indicator'].replace('Brother / Sister', 'Brother and Sister')

In [519]:
df.indicator.unique()

array(['Population of Nigeria', 'Rural', 'Total', 'Urban', 'Female',
       'Sex Ratio', 'Male', 'Projected Population of Nigeria',
       'Never Married', 'Married', 'Divorced', 'Separated', 'Widowed',
       'Both Sexes', 'Head of Household', 'Spouse', 'Child', 'Parent',
       'Brother and Sister', 'Other Blood Relation',
       'Non Blood Relationship', 'Institutional Household'], dtype=object)

In [520]:
df

Unnamed: 0,state,indicator,item,Unit,Date,Value
0,Nigeria,Population of Nigeria,Total (%),%,1991,100.0
1,Nigeria,Population of Nigeria,Total (%),%,2006,100.0
2,Nigeria,Population of Nigeria,Total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,Total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,Total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74 YEARS,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79 YEARS,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84 YEARS,Persons,2006,12813.0
566,Nigeria,Separated,85+ YEARS,Persons,2006,11483.0


In [521]:
df.item.unique()

array(['Total (%)', 'Total', 'Female (%)', 'Female', 'Male (%)', 'Male',
       '15 - 19 YEARS', '20 - 24 YEARS', '25 - 29 YEARS', '30 - 34 YEARS',
       '35 - 39 YEARS', '40 - 44 YEARS', '45 - 49 YEARS', '50 - 54 YEARS',
       '55 - 59 YEARS', '60 - 64 YEARS', '65 - 69 YEARS', '70 - 74 YEARS',
       '75 - 79 YEARS', '80 - 84 YEARS', '85+ YEARS', '10 - 14 YEARS',
       '5 - 9 YEARS', '0 - 4 YEARS'], dtype=object)

In [522]:
df['item'] = df['item'].replace('Total (%)', 'Total')
df['item'] = df['item'].replace('Female (%)', 'Female')
df['item'] = df['item'].replace('Male (%)', 'Male')

In [523]:
df['item'] = [age.replace('YEARS', '') for age in df.item]

In [524]:
df

Unnamed: 0,state,indicator,item,Unit,Date,Value
0,Nigeria,Population of Nigeria,Total,%,1991,100.0
1,Nigeria,Population of Nigeria,Total,%,2006,100.0
2,Nigeria,Population of Nigeria,Total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,Total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,Total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [525]:
df['item'] = [age.strip().lower() for age in df.item]

In [526]:
df

Unnamed: 0,state,indicator,item,Unit,Date,Value
0,Nigeria,Population of Nigeria,total,%,1991,100.0
1,Nigeria,Population of Nigeria,total,%,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [527]:
df.item.unique()

array(['total', 'female', 'male', '15 - 19', '20 - 24', '25 - 29',
       '30 - 34', '35 - 39', '40 - 44', '45 - 49', '50 - 54', '55 - 59',
       '60 - 64', '65 - 69', '70 - 74', '75 - 79', '80 - 84', '85+',
       '10 - 14', '5 - 9', '0 - 4'], dtype=object)

In [528]:
df.item.unique()

array(['total', 'female', 'male', '15 - 19', '20 - 24', '25 - 29',
       '30 - 34', '35 - 39', '40 - 44', '45 - 49', '50 - 54', '55 - 59',
       '60 - 64', '65 - 69', '70 - 74', '75 - 79', '80 - 84', '85+',
       '10 - 14', '5 - 9', '0 - 4'], dtype=object)

In [529]:
df.Unit.unique()

array(['%', 'Persons'], dtype=object)

In [530]:
df['Unit'] = df['Unit'].replace('%', 'Percentage')
df

Unnamed: 0,state,indicator,item,Unit,Date,Value
0,Nigeria,Population of Nigeria,total,Percentage,1991,100.0
1,Nigeria,Population of Nigeria,total,Percentage,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [531]:
df['Value'].value_counts()

50.0          4
16135950.0    3
2066247.0     3
14899419.0    3
13435079.0    3
             ..
3203747.0     1
2154183.0     1
910812.0      1
321886.0      1
841538.0      1
Name: Value, Length: 494, dtype: int64

In [532]:
df

Unnamed: 0,state,indicator,item,Unit,Date,Value
0,Nigeria,Population of Nigeria,total,Percentage,1991,100.0
1,Nigeria,Population of Nigeria,total,Percentage,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [533]:
df = df.rename(columns={'state':'Country','Date':'Year'})

In [534]:
df

Unnamed: 0,Country,indicator,item,Unit,Year,Value
0,Nigeria,Population of Nigeria,total,Percentage,1991,100.0
1,Nigeria,Population of Nigeria,total,Percentage,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [535]:
df.duplicated().sum()

16

In [536]:
df = df.drop_duplicates()


In [537]:
df

Unnamed: 0,Country,indicator,item,Unit,Year,Value
0,Nigeria,Population of Nigeria,total,Percentage,1991,100.0
1,Nigeria,Population of Nigeria,total,Percentage,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [538]:
df

Unnamed: 0,Country,indicator,item,Unit,Year,Value
0,Nigeria,Population of Nigeria,total,Percentage,1991,100.0
1,Nigeria,Population of Nigeria,total,Percentage,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
563,Nigeria,Separated,70 - 74,Persons,2006,20988.0
564,Nigeria,Separated,75 - 79,Persons,2006,8215.0
565,Nigeria,Separated,80 - 84,Persons,2006,12813.0
566,Nigeria,Separated,85+,Persons,2006,11483.0


In [539]:
df.columns

Index(['Country', 'indicator', 'item', 'Unit', 'Year', 'Value'], dtype='object')

In [540]:
df = df.reset_index()
df = df.drop(columns=['index'])

In [541]:
df.duplicated().sum()

0

In [556]:
df

Unnamed: 0,Country,indicator,item,Unit,Year,Value
0,Nigeria,Population of Nigeria,total,Percentage,1991,100.0
1,Nigeria,Population of Nigeria,total,Percentage,2006,100.0
2,Nigeria,Population of Nigeria,total,Persons,1991,88992201.0
3,Nigeria,Population of Nigeria,total,Persons,2006,140431790.0
4,Nigeria,Population of Nigeria,total,Persons,2007,144998281.0
...,...,...,...,...,...,...
547,Nigeria,Separated,70 - 74,Persons,2006,20988.0
548,Nigeria,Separated,75 - 79,Persons,2006,8215.0
549,Nigeria,Separated,80 - 84,Persons,2006,12813.0
550,Nigeria,Separated,85+,Persons,2006,11483.0
