# PANDAS PROJECT - October 24, 2018

## First step, Import the necessary modules

In [None]:
import pandas as pd
import numpy as np
import os
print("Have to install xlrd (with a version equal or higher than 0.9.0)")
# Import xlrd BEFORE announcing success: if the import fails, the confirmation
# message below must not have been printed already.
# NOTE(review): recent pandas releases read .xlsx files through openpyxl and
# xlrd >= 2.0 only supports .xls -- confirm which engine this environment needs.
import xlrd
print(" >> xlrd module succesfully imported")
print("")
print("To see xlrd version we have to import pkg_resources")
import pkg_resources
print(" >> pkg_resources module succesfully imported")
print("")
# Report the version of the installed xlrd distribution.
print("xlrd version >> "+str(pkg_resources.get_distribution("xlrd").version))

## The main purpose of the database is to store the evolution of the human population over a range of years
## In this case, it begins in 3700 BC and ends in AD 2000
The database was downloaded from the following link:<br>
[Nasa_population_register](http://sedac.ciesin.columbia.edu/data/set/urbanspatial-hist-urban-pop-3700bc-ad2000)

#### Function to list all files inside a folder tree for a specific file format

In [None]:
def list_files(startpath,form):
    """Recursively list the files under ``startpath`` whose extension is ``form``.

    Parameters
    ----------
    startpath : str
        Root folder of the tree walked with ``os.walk``.
    form : str
        File extension including the leading dot (e.g. ``'.xlsx'``).
        The comparison ignores case on both sides.

    Returns
    -------
    list of str
        One path per matching file (folder + "/" + file name), written with
        forward slashes as the only separator.
    """
    wanted=form.lower()
    found=[]
    for root, dirs, files in os.walk(startpath):
        for f in files:
            if os.path.splitext(f)[1].lower()==wanted:
                # Replace the platform separator (a single backslash on
                # Windows) so every returned path uses "/" consistently.
                found.append((root+"/"+f).replace(os.sep,'/'))
    return found

## Listing files

In [None]:
# Paths of every .xlsx file found anywhere below the current working directory.
gd=list_files(os.getcwd(),'.xlsx')

In [None]:
# Show every discovered path next to its position in ``gd``.
# counter_ind keeps the value -1 when no file was found.
counter_ind=-1
for counter_ind,item in enumerate(gd):
    print("Index "+str(counter_ind)+" >> "+item)

## Reading the Database (Excel File) for the sheet named as "Historical Urban Population"

In [None]:
# Load the sheet "Historical Urban Population" of the first workbook listed in gd.
# NOTE(review): gd[0] raises IndexError when no .xlsx file was found, and which
# file comes first depends on the os.walk traversal order -- confirm that the
# working directory tree contains only the expected workbook.
excel_file=pd.read_excel(gd[0],sheet_name='Historical Urban Population')

In [None]:
# Quick look at the raw data: the first 10 rows of the loaded sheet.
print("Displaying the 10 first registers of the Database")
display(excel_file.head(10))

## Checking basic statistical info and data types

In [None]:
# Summary statistics for every column (include='all' also covers non-numeric
# ones), transposed so that each column of the data base becomes one row.
info=excel_file.describe(include='all').T
# Extra column holding the dtype of each column of the data base.
info['Data Type']=excel_file.dtypes
display(info)

## Evolution of human population from 3700 BC to AD 2000

In [None]:
# Rows of the data base whose country is Spain, renumbered from 0
pop_spain=excel_file.loc[excel_file['Country'].eq('Spain')].reset_index(drop=True)

In [None]:
# Which is the oldest country?
# Select the rows whose year equals the minimum year of the data base and take
# the country of the first of them.
# .iloc[0] picks the first selected row by position; a plain [0] is a lookup by
# index label and fails unless that row happens to carry the label 0.
old_country=excel_file.loc[excel_file['year']==excel_file['year'].min(),'Country'].iloc[0]
print("")
print(chr(27)+"[1;31m"+"The oldest country registred in the data base >> "+chr(27)+"[1;34m"+old_country+chr(27)+"[0m"+"")
print("")

In [None]:
# Population evolution of the oldest country found above (old_country),
# with the row index renumbered from 0
pop_oldest=excel_file[excel_file['Country']==old_country].reset_index(drop=True)

#### Let's check the basics of both data frames

In [None]:
# Summary statistics of the Spanish rows, one row per column of pop_spain.
info2=pop_spain.describe(include='all').T
print("")
# chr(27) is the ESC character: ESC+"[1;31m" and ESC+"[0m" are ANSI escape
# sequences (bold red on / reset) used to colour the title.
print(chr(27)+"[1;31m"+">>  Spain Info  <<"+chr(27)+"[0m"+"")
print("")
display(info2)

In [None]:
# Summary statistics of the oldest country's rows, one row per column of pop_oldest.
info3=pop_oldest.describe(include='all').T
print("")
# Title in ANSI bold red (ESC+"[1;31m" ... ESC+"[0m"); capitalize() upper-cases
# the first letter of the country name and lower-cases the rest.
print(chr(27)+"[1;31m"+">>  "+old_country.capitalize()+"  <<"+chr(27)+"[0m"+"")
print("")
display(info3)

#### The following function returns the year range and the population range for a given city of a country

In [None]:
def dif_pop(df,name_city):
    """Return the first and last (year, population) records of one city.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``'City'``, ``'year'`` and ``'pop'``.
    name_city : str
        Value of the ``'City'`` column to select.

    Returns
    -------
    tuple
        ``(first_year, last_year, first_pop, last_pop)`` taken after sorting
        the city's rows by ascending year. For a city with a single row the
        first and last values coincide.

    Raises
    ------
    IndexError
        If ``df`` holds no row for ``name_city``.
    """
    city_rows=df.loc[df['City']==name_city,['year','pop']].sort_values('year',ascending=True)
    year=city_rows['year'].values
    pop=city_rows['pop'].values
    # Index -1 is the most recent record. Using index 1 would only compare the
    # two earliest records and ignore the rest of the city's history.
    return year[0],year[-1],pop[0],pop[-1]

#### Spain Cities population evolution evaluation

In [None]:
# One record per Spanish city: its name followed by the four values
# (from/to year, from/to population) returned by dif_pop.
year_pop_spain=[[city_name,*dif_pop(pop_spain,city_name)]
                for city_name in pop_spain['City'].unique().tolist()]

spain_columns=['City','from Year','to Year','from Population','to Population']
df_spain=pd.DataFrame(year_pop_spain,columns=spain_columns)
# Population change of each city between its two reported records.
df_spain['Population Diff']=df_spain['to Population'].sub(df_spain['from Population'])

In [None]:
label=['Decrease','Stable','Increase']

## Defining bins width
## The outer edges are open-ended (-inf / +inf): the bin edges therefore always
## increase monotonically and every city falls into a bin, whatever the
## smallest and largest observed Population Diff are.

margin=[-np.inf,-10000,10000,np.inf]

## Decrease will be one with a Population Diff less than -10000 people
## Increase will be one with a Population Diff more than 10000 people
## Stable will be one with a Population Diff between -10000 and 10000 people

In [None]:
# Classify each city's Population Diff into the labelled intervals of margin.
bins = pd.cut(df_spain['Population Diff'],margin, labels=label)
# final_spain is bound to the same object as df_spain (no copy is made), so the
# new 'Pop Eval' column is visible through both names.
final_spain=df_spain
final_spain['Pop Eval']=bins
display(final_spain.head(5))

In [None]:
# Number of cities per 'Pop Eval' category, ordered by ascending count.
# NOTE: the resulting row order therefore depends on the data, not on the
# category names.
spanish_deviation=final_spain['Pop Eval'].value_counts().sort_values()
print("-----------------------------------------------------------------------------")
print("")
print("  >> Summary of Population variation for spanish cities")
display(spanish_deviation)

### Iraq population evolution evaluation (Oldest country available)

In [None]:
# One record per city of the oldest country: its name followed by the four
# values (from/to year, from/to population) returned by dif_pop.
year_pop_oldest=[[city_name,*dif_pop(pop_oldest,city_name)]
                 for city_name in pop_oldest['City'].unique().tolist()]

oldest_columns=['City','from Year','to Year','from Population','to Population']
df_oldest=pd.DataFrame(year_pop_oldest,columns=oldest_columns)
# Population change of each city between its two reported records.
df_oldest['Population Diff']=df_oldest['to Population'].sub(df_oldest['from Population'])

In [None]:
label=['Decrease','Stable','Increase']

## Defining bins width
## The outer edges are open-ended (-inf / +inf): the bin edges therefore always
## increase monotonically and every city falls into a bin, whatever the
## smallest and largest observed Population Diff are.

margin=[-np.inf,-10000,10000,np.inf]

## Decrease will be one with a Population Diff less than -10000 people
## Increase will be one with a Population Diff more than 10000 people
## Stable will be one with a Population Diff between -10000 and 10000 people


In [None]:
# Classify each city's Population Diff into the labelled intervals of margin.
bins = pd.cut(df_oldest['Population Diff'],margin, labels=label)
# final_oldest is bound to the same object as df_oldest (no copy is made), so
# the new 'Pop Eval' column is visible through both names.
final_oldest=df_oldest
final_oldest['Pop Eval']=bins
display(final_oldest.head(5))

In [None]:
# Number of cities per 'Pop Eval' category for the oldest country, ordered by
# ascending count (so the row order depends on the data).
oldest_deviation=final_oldest['Pop Eval'].value_counts().sort_values()
print("-----------------------------------------------------------------------------")
print("")
# This summary describes the oldest country, not Spain: name it in the title.
print("  >> Summary of Population variation for "+old_country+" cities")
display(oldest_deviation)

### Concatenating Population summary for both countries

In [None]:
# Both *_deviation Series are ordered by their counts, which depends on the
# data and may differ between the two countries. Re-align them to one explicit
# category order so every count lands on the row that carries its own label.
categories=['Decrease','Increase','Stable']
Summ_pop=pd.DataFrame(np.c_[np.r_[spanish_deviation.reindex(categories,fill_value=0),final_spain['Population Diff'].sum()],
                            np.r_[oldest_deviation.reindex(categories,fill_value=0),final_oldest['Population Diff'].sum()]])
Summ_pop.columns=['Spain','Iraq']
# Last row: sum of the Population Diff column of each country.
Summ_pop.index=categories+['Total Pop variation']
display(Summ_pop)

# Final Summary

The table above shows that, even though Spain has more expanding cities than Iraq from 3700 BC to AD 2000,<br>
Iraq has a higher expansion in total population than Spain.<br>
In conclusion, for a deeper study of population evolution we would need an accurate database with more continuous<br>
records. For each year there are only a couple of records for each city, which makes it harder than usual<br>
to reach an overall evaluation of the world population with high accuracy.<br>


### Global evaluation (Not totally finished, have to check some values)

In [None]:
%%time

# One record per (country, city): the names followed by the four values
# (from/to year, from/to population) returned by dif_pop.
year_pop=[]
for country in excel_file['Country'].unique().tolist():
    pop_country=excel_file[excel_file['Country']==country]
    for city_name in pop_country['City'].unique().tolist():
        y0,y1,p0,p1=dif_pop(pop_country,city_name)
        year_pop.append([country,city_name,y0,y1,p0,p1])

# Build the table once from all collected records instead of concatenating one
# small data frame per country (which re-copies the accumulated table on every
# iteration and needs a special case for the first country).
df_final=pd.DataFrame(year_pop,columns=['Country','City','from Year','to Year','from Population','to Population'])

df_final['Population Diff']=df_final['to Population']-df_final['from Population']
label=['Decrease','Stable','Increase']

## Defining bins width
## The outer edges are open-ended (-inf / +inf): the bin edges therefore always
## increase monotonically and every city falls into a bin.

margin=[-np.inf,-10000,10000,np.inf]

## Decrease will be one with a Population Diff less than -10000 people
## Increase will be one with a Population Diff more than 10000 people
## Stable will be one with a Population Diff between -10000 and 10000 people

bins = pd.cut(df_final['Population Diff'],margin, labels=label)
# final_final is bound to the same object as df_final (no copy is made).
final_final=df_final
final_final['Pop Eval']=bins

print("")
print("  >>  Displaying 20 first registers of our final data frame for all countries")
# Show as many rows as the message above announces.
display(final_final.head(20))

In [None]:
# Summary statistics of the all-countries table built in the previous cell
# (describe() without arguments, i.e. pandas' default column selection).
print("  >> Displaying the basics statistics for the table above")
display(final_final.describe())

In [None]:
# Number of cities per 'Pop Eval' category, ordered by ascending count.
final_deviation=final_final['Pop Eval'].value_counts().sort_values()
# The count-based order above depends on the data; re-align the counts to an
# explicit category order so each one lands on the row carrying its own label.
categories=['Decrease','Increase','Stable']
Summ_pop_final=pd.DataFrame(np.r_[final_deviation.reindex(categories,fill_value=0),final_final['from Population'].sum(),
                                  final_final['to Population'].sum(),final_final['Population Diff'].sum()])
Summ_pop_final.columns=['Global']
# Remaining rows: sums of the 'from Population', 'to Population' and
# 'Population Diff' columns, in that order.
Summ_pop_final.index=categories+['Initial Pop','Final Pop','Total Pop variance']

print("  >> Final Summary")
display(Summ_pop_final)

______