In [4]:
import os
import pandas as pd
import math
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import geopandas as gpd
import gdal
import geopandas as gpd
import descartes 
from shapely.geometry import Point, Polygon
from shapely import wkt


import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import fiona

#os.chdir('../../')
# Step one directory up from wherever the kernel currently is; presumably this makes the
# relative "data/..." paths used in later cells resolve (verify against the project layout).
# NOTE(review): the path is relative, so running this cell a second time climbs another level.
os.chdir('../')
# Echo the resulting working directory as the cell output.
os.getcwd()

'C:\\Users\\lgorman\\OneDrive\\Desktop\\PhD\\Analysis'

In [None]:
#Reading in all files and ensuring they are in a GeoDataFrame format
def create_geo_data_frame_from_point_dataset (dataframe, crs, name_of_lat_column, name_of_lon_column):
    """Turn a table with latitude/longitude columns into a point GeoDataFrame.

    Parameters
    ----------
    dataframe : table containing the two coordinate columns.
    crs : coordinate reference system handed straight to ``gpd.GeoDataFrame``.
    name_of_lat_column : label of the latitude column.
    name_of_lon_column : label of the longitude column.

    Returns
    -------
    A ``gpd.GeoDataFrame`` holding only the rows where both coordinates are
    non-null, with one ``Point`` per remaining row as its geometry.
    """
    has_both_coordinates = (
        dataframe[name_of_lat_column].notnull()
        & dataframe[name_of_lon_column].notnull()
    )
    located_rows = dataframe.loc[has_both_coordinates, :]

    # Each Point is built from an (x, y) pair, i.e. (longitude, latitude).
    lon_lat_pairs = zip(located_rows[name_of_lon_column], located_rows[name_of_lat_column])
    point_geometries = [Point(pair) for pair in lon_lat_pairs]

    return gpd.GeoDataFrame(located_rows, geometry=point_geometries, crs=crs)

# defining a coordinate reference system
# NOTE(review): the {'init': 'epsg:...'} dict form is flagged as deprecated by newer
# pyproj/geopandas releases; left unchanged here because other cells share this object.
crs={'init' :'epsg:4326'}


# Point layer: read the RHoMIS indicator table and convert it to a GeoDataFrame,
# dropping rows that lack either coordinate (see create_geo_data_frame_from_point_dataset).
points_rhomis_geodataframe=pd.read_csv(os.path.join("data","rhomis_data","RHoMIS_Indicators.csv"), encoding="latin1")
name_of_lat_column="GPS_LAT"
name_of_lon_column="GPS_LON"
points_rhomis_geodataframe=create_geo_data_frame_from_point_dataset(dataframe=points_rhomis_geodataframe,
                                                                   crs=crs,
                                                                   name_of_lat_column=name_of_lat_column,
                                                                   name_of_lon_column=name_of_lon_column)

# Grid layer: the CSV stores each geometry as WKT text, so parse it back into shapely
# objects and build the GeoDataFrame once, setting the geometry column and CRS together
# (previously the frame was wrapped in gpd.GeoDataFrame twice in a row).
grids_geodataframe=pd.read_csv(os.path.join("data","processed","hundred_km_grid_with_raster_info.csv"))
grids_geodataframe['geometry'] = grids_geodataframe['geometry'].apply(wkt.loads)
grids_geodataframe = gpd.GeoDataFrame(grids_geodataframe, geometry='geometry', crs=crs)

# Country layer.
world_shapefile=gpd.read_file(os.path.join('data', 'shapefiles', 'World'))

In [None]:
# Remove rows whose geometry repeats an earlier row, in both the grid layer and the
# country layer (drop_duplicates keeps the first occurrence by default).
grids_geodataframe=grids_geodataframe.drop_duplicates(subset=["geometry"])
world_shapefile=world_shapefile.drop_duplicates(subset=["geometry"])

In [None]:
def nesting_points_within_grids_within_countries(point_dataframe,point_ID,grid_dataframe,grid_ID,world_dataframe,country_ID):
    """Nest points inside grid pieces inside countries and return one combined table.

    Steps, in order:
      1. ``gpd.overlay(world_dataframe, grid_dataframe, "identity")`` combines the
         country layer with the grid layer.
      2. The result is left-merged back onto ``world_dataframe`` on ``country_ID``;
         columns present on both sides receive the ``_grid`` / ``_country`` suffixes.
         The following step relies on a ``geometry_grid`` column existing afterwards.
      3. A spatial join (``op="contains"``) keeps the pieces, using ``geometry_grid``
         as active geometry, that contain a point of ``point_dataframe``.
      4. Point columns not already present after the join are merged back in on
         ``point_ID`` (inner join), and any column named ``geometry`` is relabelled
         ``geometry_point``.

    Parameters
    ----------
    point_dataframe : point layer; must contain the ``point_ID`` column.
    point_ID : label of the point identifier column.
    grid_dataframe : grid layer.
    grid_ID : accepted for interface compatibility; not referenced in the body.
    world_dataframe : country layer; must contain the ``country_ID`` column.
    country_ID : label of the country identifier column.

    NOTE(review): ``op=`` is the older spelling of the spatial-join predicate keyword;
    confirm it against the geopandas version in use.
    """
    country_grid_pieces = gpd.overlay(world_dataframe, grid_dataframe, "identity")
    country_grid_pieces = pd.merge(
        country_grid_pieces,
        world_dataframe,
        how="left",
        on=country_ID,
        suffixes=["_grid", "_country"],
    )

    pieces_by_grid_geometry = country_grid_pieces.set_geometry("geometry_grid")
    pieces_with_points = gpd.sjoin(pieces_by_grid_geometry, point_dataframe, op="contains")

    # Only bring over the point columns the spatial join did not already carry,
    # plus the key needed to line the rows up.
    extra_point_columns = point_dataframe.columns.difference(pieces_with_points.columns).tolist()
    extra_point_columns.append(point_ID)

    nested = pd.merge(
        pieces_with_points,
        point_dataframe[extra_point_columns],
        how="inner",
        on=point_ID,
        suffixes=["_grid", "_point"],
    )
    nested.columns = [
        "geometry_point" if label == "geometry" else label
        for label in nested.columns
    ]
    return nested



In [None]:
# Nest the RHoMIS points within the grid cells within the countries, keyed by the
# household id ("ID_HH"), the grid id ("FID") and the country name ("CNTRY_NAME").
points_within_grids_within_countries=nesting_points_within_grids_within_countries(point_dataframe=points_rhomis_geodataframe,
                                            point_ID="ID_HH",
                                            grid_dataframe=grids_geodataframe,
                                            grid_ID="FID",
                                            world_dataframe=world_shapefile,
                                            country_ID="CNTRY_NAME")


# Drop the country and point geometry columns before export, so only the grid geometry
# is written. The point geometry and the country boundaries have to be re-attached
# after the file is read back in.
points_within_grids_within_countries_single_geometry=points_within_grids_within_countries.drop(columns=["geometry_country","geometry_point"])
points_within_grids_within_countries_single_geometry.to_file(os.path.join('data',"processed","points_within_grids_within_countries_100km.geojson"), driver='GeoJSON')

In [None]:
# Re-read the country layer from disk.
world_shapefile=gpd.read_file(os.path.join('data', 'shapefiles', 'World'))

# Read the exported nested table back in and relabel its "geometry" column as
# "geometry_grid" (the export kept only the grid geometry).
points_within_grids_within_countries=gpd.read_file(os.path.join('data',"processed","points_within_grids_within_countries_100km.geojson"), driver='GeoJSON')
points_within_grids_within_countries.columns=["geometry_grid" if column_names=="geometry" else column_names for column_names in points_within_grids_within_countries.columns]
# Rebuild the point geometry from the stored coordinates; Point takes (x, y) = (lon, lat).
point_geometry=[Point(xy) for xy in zip(points_within_grids_within_countries["GPS_LON"], points_within_grids_within_countries["GPS_LAT"])]
points_within_grids_within_countries["geometry_point"]=point_geometry




In [None]:
# Merge the country geometries into the nested table: inner join on "CNTRY_NAME",
# bringing over only the country name and its geometry.
# NOTE(review): suffixes=["",""] only matters if both sides share a non-key column name;
# this assumes the left frame has no "geometry" column at this point - confirm.
cols_to_merge = ["CNTRY_NAME","geometry"]
points_within_grids_within_countries=pd.merge(points_within_grids_within_countries, world_shapefile[cols_to_merge], how="inner",on="CNTRY_NAME", suffixes=["",""])
# Relabel the merged-in "geometry" column as "geometry_country".
points_within_grids_within_countries.columns=["geometry_country" if column_names=="geometry" else column_names for column_names in points_within_grids_within_countries.columns]



In [None]:
# World map with the nested layers stacked on top, drawn in order:
# countries (blue), grid cells (green), then the points (black).
fig, ax = plt.subplots()

world_shapefile.plot(ax=ax, edgecolor="black")

layer_styles = (
    ("geometry_country", dict(color="blue", alpha=0.5)),
    ("geometry_grid", dict(color="green", alpha=0.5)),
    ("geometry_point", dict(color="black")),
)
for geometry_column, style in layer_styles:
    # Switch the active geometry so the same table can be drawn as each layer.
    points_within_grids_within_countries.set_geometry(geometry_column).plot(ax=ax, **style)

plt.show()

In [None]:
# finding countries in the continent
continent="Africa"
world_shapefile=gpd.read_file(os.path.join('data', 'shapefiles', 'World'))
# NOTE(review): gpd.datasets is flagged for removal in newer geopandas releases - confirm
# it is available in the environment this notebook runs in.
world_geopanda = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) #continents dataframe

# Take an explicit copy of the selection so the assignment below writes to an independent
# frame instead of a slice of world_geopanda (avoids pandas' SettingWithCopy ambiguity).
continent_shapefile=world_geopanda.loc[world_geopanda["continent"]==continent,["continent","geometry"]].copy()
# Replace each geometry by its envelope (bounding rectangle) before the spatial join.
continent_shapefile["geometry"]=continent_shapefile.envelope
continent_shapefile.columns=["geometry_continent" if col_name=="geometry" else col_name for col_name in continent_shapefile.columns]
#subset_world_shapefile=gpd.overlay(world_shapefile.set_geometry("geometry"), continent_shapefile.set_geometry("geometry_continent"), how='intersection')
# Keep the countries from the world layer that spatially join to one of those envelopes.
subset_world_shapefile=gpd.sjoin(world_shapefile.set_geometry("geometry"), continent_shapefile.set_geometry("geometry_continent"), how='inner')

countries=subset_world_shapefile["CNTRY_NAME"].unique()


# Getting rid of countries outside the continent that the envelope join also picked up
# (southern European and Middle Eastern names).
countries_to_exclude=["Portugal", "Spain", "Gibraltar", "Gaza Strip", "Israel", "Jordan", "West Bank", "Saudi Arabia", "Yemen"]
# `countries` is a numpy array, so index it with a plain boolean array; `~` negates the
# membership mask.
final_countries=countries[~pd.Series(countries).isin(countries_to_exclude).values]


#subsetting final dataframe
subset_world_shapefile=subset_world_shapefile.loc[subset_world_shapefile["CNTRY_NAME"].isin(final_countries),:]
subset_df=points_within_grids_within_countries["CNTRY_NAME"].isin(final_countries)
points_within_grids_within_countries=points_within_grids_within_countries.loc[subset_df,:]

In [None]:
# Map of the continent subset with the nested layers stacked on top, drawn in order:
# countries (blue, outlined), grid cells (green), then the points (black).
fig, ax = plt.subplots()

subset_world_shapefile.plot(ax=ax, edgecolor="black")

layer_styles = (
    ("geometry_country", dict(edgecolor="black", color="blue", alpha=0.2)),
    ("geometry_grid", dict(color="green", alpha=0.5)),
    ("geometry_point", dict(color="black")),
)
for geometry_column, style in layer_styles:
    # Switch the active geometry so the same table can be drawn as each layer.
    points_within_grids_within_countries.set_geometry(geometry_column).plot(ax=ax, **style)

plt.show()

In [None]:

# Number of rows per grid id (one count for each distinct FID value).
# The frame is built from a dict so the column is called "FID" on every pandas version:
# since pandas 2.0 value_counts() names its result "count", so wrapping the Series
# directly and then selecting ["FID"] raises a KeyError there.
x=pd.DataFrame({"FID": points_within_grids_within_countries["FID"].value_counts()})
# Distribution of those per-grid counts.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept so the figure
# is unchanged on the seaborn version this notebook was written against.
sns.distplot(x["FID"])


#xmedian=pd.DataFrame(points_within_grids_within_countries.median_AEZ.value_counts())
#xmean=pd.DataFrame(points_within_grids_within_countries.mean_AEZ.value_counts())

#sns.distplot(xmedian["median_AEZ"], bins=40)
#points_within_grids_within_countries_single_geometry=points_within_grids_within_countries.drop(columns=["geometry_country","geometry_point"])
#points_within_grids_within_countries_single_geometry.set_geometry("geometry_grid").to_file(os.path.join('data',"processed","points_within_grids_within_countries_100km.geojson"), driver='GeoJSON') # at this point we only preserve the grid geometry. Will need to add point geometry when reading the file back in again (same with country boundaries)  


In [None]:
def remove_outliers_with_interquartile_range(column):
    """Drop values lying outside the 1.5 * IQR fences of a numeric pandas Series.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are
    the 0.25 and 0.75 quantiles of ``column``. Values exactly on a fence are kept,
    so a column with zero spread (IQR == 0) is returned unchanged rather than
    emptied. NaN entries never satisfy the comparison and are therefore dropped.

    Parameters
    ----------
    column : pandas Series of numeric values.

    Returns
    -------
    The subset of ``column`` (original index labels preserved) within the fences.
    """
    quartile_1=column.quantile(0.25)
    quartile_3=column.quantile(0.75)
    interquartile_range=quartile_3-quartile_1

    lower_limit=quartile_1-1.5*interquartile_range
    upper_limit=quartile_3+1.5*interquartile_range

    # Inclusive bounds: only values strictly beyond a fence count as outliers.
    within_fences=(column>=lower_limit) & (column<=upper_limit)
    return column[within_fences]


# Number of rows in the income column (missing values included in the count).
len(points_within_grids_within_countries["total_income_USD_PPP_pHH_Yr"])

In [None]:
# Summary statistics of the nested table, as produced by pandas' describe().
points_within_grids_within_countries.describe()

In [None]:
# Inspect the dtype of the "mean_education" column.
points_within_grids_within_countries["mean_education"].dtype

In [None]:
# Distribution of "median_population_density" after dropping missing values and then
# removing values outside the 1.5 * IQR fences.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(remove_outliers_with_interquartile_range(points_within_grids_within_countries["median_population_density"].dropna()))

In [None]:
# NOTE(review): `ymax` is not assigned in any cell visible above, so on a fresh kernel
# this cell would raise NameError unless it is defined elsewhere - confirm or remove.
ymax

In [None]:
# Country names for the continent.
# NOTE(review): "Equitorial Guinea" looks like a misspelling of "Equatorial Guinea";
# check it against the spelling used in the data these names are matched with.
countries_in_africa = [
    "Algeria",
    "Angola",
    "Benin",
    "Botswana",
    "Burundi",
    "Burkina Faso",
    "Zaire",
    "Chad",
    "Comoros",
    "Cameroon",
    "Central African Republic",
    "Congo",
    "Djibouti",
    "Egypt",
    "Equitorial Guinea",
    "Eritrea",
    "Ethiopia",
    "Gambia, The",
    "Gabon",
    "Guinea",
    "Kenya",
]