In [None]:
# ONS (Census 2011) statistics for Lewisham.
# Fetches Nomis data releases and prepares them for use in Lewisham ward profiles.
#
# This requires the OA->LSOA lookup table produced by "lbl_boundaries_oa.ipynb"
# as well as the OA->WD22 lookup table produced by "lbl_boundaries_2022_wards.ipynb"

In [1]:
import pandas as pd

from google.colab import drive
import google.colab.files as files

import IPython

Tools
==
Helpers to reformat & aggregate the Nomis source data.

In [2]:
# Extract the subset we need.
def filter_nomis_data(d, date_values, geography_values):
  """Return the rows of `d` that match both a wanted date and a wanted geography.

  d: Nomis table with `date` and `geography` columns.
  date_values: accepted values for the `date` column.
  geography_values: accepted values for the `geography` column.
  """
  wanted_date = d['date'].isin(date_values)
  wanted_geography = d['geography'].isin(geography_values)
  return d.loc[wanted_date & wanted_geography]

# Reformat Nomis data into a lookup table format:
# clear labelling of the index (geography) column, 
# remove superfluous columns.
def format_nomis_data(d, geo_colname):
  """Turn a Nomis table into a lookup-style table.

  The 'geography code' column is relabelled to `geo_colname`, after which the
  'date' and 'geography' columns are dropped. A new frame is returned.
  """
  relabelled = d.rename(columns={'geography code': geo_colname})
  return relabelled.drop(columns=['date', 'geography'])

# Convenience function to call the above.
def nomis_to_oa(d, years, oa_list):
  """Filter a Nomis table to the given years and OAs, then format it with an 'OA11CD' key column."""
  selected = filter_nomis_data(d, date_values=years, geography_values=oa_list)
  return format_nomis_data(selected, 'OA11CD')

In [3]:
# Aggregate OA-level data to LSOAs, Wards, etc. (Calculates the sum.)
def aggregate_oa_groups(oa_data, oa_group_table, oa_col, group_col):
  """Sum OA-level rows into larger groups (LSOAs, wards, ...).

  oa_data: table keyed by `oa_col`, one row per OA.
  oa_group_table: lookup holding `oa_col` and `group_col`.
  oa_col: name of the OA key column shared by both tables.
  group_col: name of the grouping column in `oa_group_table`.

  Returns a frame indexed by `group_col` with the per-group column sums.
  """
  # Right join: every row of oa_data is kept and gains its group label.
  joined = pd.merge(oa_group_table, oa_data, on=oa_col, how='right')
  per_group = joined.drop(columns=[oa_col]).groupby(group_col)
  return per_group.sum()

Data
==
GDrive mount
--

In [4]:
# Mount Google Drive under /content/gdrive (uses the google.colab `drive` helper).
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Root folder of the project on the mounted Google Drive.
project_dir = '/content/gdrive/MyDrive/WardProfiles'
# Folders holding the OA->LSOA and OA->WD22 lookup tables read further down.
oa_lookups_dir = f"{project_dir}/lookups/oa"
ward_lookups_dir = f"{project_dir}/lookups/2022_wards"

# Destination for the downloaded Nomis files and all exported CSVs.
output_dir = f"{project_dir}/languages"

In [None]:
!mkdir -p '{output_dir}'

Downloads
--

In [None]:
# Main language (detailed), OAs in London 2011
# Source: 
# https://www.nomisweb.co.uk/census/2011/QS204EW
!wget 'https://www.nomisweb.co.uk/api/v01/dataset/nm_525_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=2013265927TYPE299' \
  -O "{output_dir}/census11_main_language_detailed_oa11_london.csv"

--2021-11-15 14:12:21--  https://www.nomisweb.co.uk/api/v01/dataset/nm_525_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=2013265927TYPE299
Resolving www.nomisweb.co.uk (www.nomisweb.co.uk)... 129.234.253.212
Connecting to www.nomisweb.co.uk (www.nomisweb.co.uk)|129.234.253.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6368597 (6.1M) [text/csv]
Saving to: ‘/content/gdrive/MyDrive/WardProfiles/languages/census11_main_language_detailed_oa11_london.csv’


2021-11-15 14:13:00 (440 KB/s) - ‘/content/gdrive/MyDrive/WardProfiles/languages/census11_main_language_detailed_oa11_london.csv’ saved [6368597/6368597]



In [None]:
# Proficiency in English, OAs in London 2011
# Source: 
# https://www.nomisweb.co.uk/census/2011/QS205EW
!wget 'https://www.nomisweb.co.uk/api/v01/dataset/nm_526_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=2013265927TYPE299' \
  -O "{output_dir}/census11_english_proficiency_oa11_london.csv"

--2021-11-15 14:13:00--  https://www.nomisweb.co.uk/api/v01/dataset/nm_526_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=2013265927TYPE299
Resolving www.nomisweb.co.uk (www.nomisweb.co.uk)... 129.234.253.212
Connecting to www.nomisweb.co.uk (www.nomisweb.co.uk)|129.234.253.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1430376 (1.4M) [text/csv]
Saving to: ‘/content/gdrive/MyDrive/WardProfiles/languages/census11_english_proficiency_oa11_london.csv’


2021-11-15 14:13:16 (424 KB/s) - ‘/content/gdrive/MyDrive/WardProfiles/languages/census11_english_proficiency_oa11_london.csv’ saved [1430376/1430376]



In [None]:
# For certain variables: get reference data for other geographies.
#
# Nomis geography codes are derived from their GEOGRAPHY and GEOGRAPHY_TYPECODE 
# fields, both numeric. For our purposes they are as follows:
# England: 2092957699TYPE499
# London: 2013265927TYPE480
# Lewisham: 1946157254TYPE464

# And optionally:
# United Kingdom: 2092957697TYPE499 -- this also returns nation-level records
# Great Britain: 2092957698TYPE499
# England and Wales: 2092957703TYPE499

# NOTE that not all variables are available at all levels of aggregation.

In [None]:
# Proficiency in English, reference geographies (England, London, Lewisham) 2011
# Source: 
# https://www.nomisweb.co.uk/census/2011/QS205EW
!wget 'https://www.nomisweb.co.uk/api/v01/dataset/nm_526_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=2092957699TYPE499,2013265927TYPE480,1946157254TYPE464' \
  -O "{output_dir}/census11_english_proficiency_references.csv"

--2021-11-16 13:45:56--  https://www.nomisweb.co.uk/api/v01/dataset/nm_526_1.bulk.csv?time=latest&measures=20100&rural_urban=total&geography=2092957699TYPE499,2013265927TYPE480,1946157254TYPE464
Resolving www.nomisweb.co.uk (www.nomisweb.co.uk)... 129.234.253.212
Connecting to www.nomisweb.co.uk (www.nomisweb.co.uk)|129.234.253.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 968 [text/csv]
Saving to: ‘/content/gdrive/MyDrive/WardProfiles/languages/census11_english_proficiency_references.csv’


2021-11-16 13:45:58 (160 MB/s) - ‘/content/gdrive/MyDrive/WardProfiles/languages/census11_english_proficiency_references.csv’ saved [968/968]



Lookups
--
Used to match up OAs with their LSOAs and their 2022 electoral wards.

In [6]:
# All OAs and their LSOAs in Lewisham
# Keep only the OA and LSOA code columns of the lookup file.
oa_lsoa_join = pd.read_csv(
    f"{oa_lookups_dir}/lbl_oa11_lsoa11_msoa11_lad20_rgn20_202012.csv"
)[['OA11CD', 'LSOA11CD']]
oa_lsoa_join.head()

Unnamed: 0,OA11CD,LSOA11CD
0,E00016277,E01003220
1,E00016278,E01003220
2,E00016285,E01003220
3,E00016257,E01003221
4,E00016263,E01003221


In [7]:
# All OAs and their 2022 wards in Lewisham
# Keep only the OA and proposed-2022-ward code columns of the lookup file.
oa_wd22_join = pd.read_csv(
    f"{ward_lookups_dir}/lbl_oa11_wd22_proposed.csv"
)[['OA11CD', 'WD22CD_proposed']]
oa_wd22_join.head()

Unnamed: 0,OA11CD,WD22CD_proposed
0,E00016403,E05013721
1,E00016442,E05013721
2,E00016407,E05013721
3,E00016404,E05013721
4,E00016402,E05013721


Process & export
==
Lewisham
--

In [8]:
# For filtering of source data
lbl_oa_list = oa_lsoa_join.OA11CD.unique() # All OAs in Lewisham
years = [2011] # Just a precaution, in case any of the downloads include multiple periods

# For each downloaded data set: tidy the column names, export the Lewisham
# OA-level table, then export its LSOA- and WD22-level aggregates.
for datname in ['main_language_detailed', 'english_proficiency']:
  IPython.display.display(f"=== {datname} ===")
  # Load the Nomis data
  d = pd.read_csv(f"{output_dir}/census11_{datname}_oa11_london.csv")

  # Simplify the column names.
  # regex=True is passed explicitly: from pandas 2.0 the default of
  # str.replace is literal matching, which would leave the verbose Nomis
  # headers untouched, so no 'Total' column would be produced below.
  d.columns = d.columns.str.replace(
      # Remove the redundant title prefix and qualifier suffix
      r'^.*?: (.*?); measures: Value',
      r'\1',
      regex=True)
  d.columns = d.columns.str.replace(
      # Remove a redundant qualifier (raw string: '\(' is not a valid str escape)
      r' \(English or Welsh (if )?in Wales\)', '',
      regex=True)
  d = d.rename(columns={
      'All usual residents aged 3 and over': 'Total',
      'All categories: Proficiency in English': 'Total'
  })

  # Transform to OA index data
  lbl_oa = nomis_to_oa(d, years, lbl_oa_list)
  lbl_oa = lbl_oa.drop(columns='Rural Urban') # Unused
  lbl_oa.to_csv(f"{output_dir}/lbl_{datname}_oa11.csv", index=False)

  # Aggregate to LSOA level
  lbl_lsoa = aggregate_oa_groups(lbl_oa, oa_lsoa_join, oa_col='OA11CD', group_col='LSOA11CD')
  lbl_lsoa.to_csv(f"{output_dir}/lbl_{datname}_lsoa11.csv", index=True)
  IPython.display.display(lbl_lsoa.head())

  # Aggregate to WD22 level
  lbl_wd22 = aggregate_oa_groups(lbl_oa, oa_wd22_join, oa_col='OA11CD', group_col='WD22CD_proposed')
  lbl_wd22.to_csv(f"{output_dir}/lbl_{datname}_wd22.csv", index=True)
  IPython.display.display(lbl_wd22.head())

'=== main_language_detailed ==='

Unnamed: 0_level_0,Total,English,Welsh/Cymraeg (in England only),Other UK language: Total,Other UK language: Gaelic (Irish),Other UK language: Gaelic (Scottish),Other UK language: Manx Gaelic,Other UK language: Gaelic (Not otherwise specified),Other UK language: Cornish,Other UK language: Scots,Other UK language: Gypsy/Traveller languages,French,Portuguese,Spanish,Other European Language (EU): Total,Other European Language (EU): Italian,Other European Language (EU): German,Other European Language (EU): Polish,Other European Language (EU): Slovak,Other European Language (EU): Czech,Other European Language (EU): Romanian,Other European Language (EU): Lithuanian,Other European Language (EU): Latvian,Other European Language (EU): Hungarian,Other European Language (EU): Bulgarian,Other European Language (EU): Greek,Other European Language (EU): Dutch,Other European Language (EU): Swedish,Other European Language (EU): Danish,Other European Language (EU): Finnish,Other European Language (EU): Estonian,Other European Language (EU): Slovenian,Other European Language (EU): Maltese,Other European Language (EU): Any other European Language (EU),Other European Language (non EU): Total,Other European Language (non EU): Albanian,Other European Language (non EU): Serbian/Croatian/Bosnian,Other European Language (non EU): Ukrainian,Other European Language (non EU): Any other Eastern European Language (non EU),Other European Language (non EU): Northern European Language (non EU),...,South Asian Language: Nepalese,South Asian Language: South Asian Language (all other),East Asian Language: Total,East Asian Language: Mandarin Chinese,East Asian Language: Cantonese Chinese,East Asian Language: All other Chinese,East Asian Language: Japanese,East Asian Language: Korean,East Asian Language: Vietnamese,East Asian Language: Thai,East Asian Language: Malay,East Asian Language: Tagalog/Filipino,East Asian Language: East Asian Language (all other),Oceanic/Australian language (any),North/South 
American language (any),Caribbean Creole: Total,Caribbean Creole: Caribbean Creole (English-based),Caribbean Creole: Caribbean Creole (all other),African Language: Total,African Language: Amharic,African Language: Tigrinya,African Language: Somali,African Language: Krio,African Language: Akan,African Language: Yoruba,African Language: Igbo,African Language: Swahili/Kiswahili,African Language: Luganda,African Language: Lingala,African Language: Shona,African Language: Afrikaans,African Language: Any other Nigerian language,African Language: West African language (all other),African Language: African language (all other),Other Languages: Total,Other Languages: All other languages,Sign Language: Total,Sign Language: British sign language,Sign Language: Sign Language (all other),Sign Language: Any Sign Communication System
LSOA11CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
E01003189,1593,1379,0,0,0,0,0,0,0,0,0,18,6,8,35,3,0,5,0,1,2,13,0,2,0,3,6,0,0,0,0,0,0,0,5,3,0,1,1,0,...,0,1,15,2,0,9,0,0,4,0,0,0,0,0,0,0,0,0,27,0,0,7,0,3,3,5,0,0,0,1,0,2,5,1,0,0,0,0,0,0
E01003190,1734,1495,0,0,0,0,0,0,0,0,0,39,12,7,40,11,1,5,0,0,3,5,2,5,2,1,2,1,1,0,1,0,0,0,12,9,3,0,0,0,...,0,0,9,2,1,1,1,0,2,1,0,1,0,0,0,0,0,0,18,1,3,1,3,0,1,2,2,1,0,0,0,0,0,4,1,1,1,0,0,1
E01003191,1390,1235,0,0,0,0,0,0,0,0,0,17,8,5,38,1,0,19,0,0,0,13,0,0,0,1,2,1,0,1,0,0,0,0,9,9,0,0,0,0,...,0,0,12,0,2,7,0,0,3,0,0,0,0,0,0,0,0,0,26,4,0,7,0,4,2,3,1,2,0,0,0,2,1,0,5,5,0,0,0,0
E01003192,1550,1312,0,1,1,0,0,0,0,0,0,27,12,3,43,9,1,13,1,1,0,5,0,6,2,2,1,1,0,0,1,0,0,0,12,12,0,0,0,0,...,2,0,22,0,4,12,0,0,1,0,0,5,0,0,0,0,0,0,34,0,1,4,0,10,1,1,0,4,1,3,0,4,0,5,1,1,6,6,0,0
E01003193,1506,1362,0,0,0,0,0,0,0,0,0,16,2,11,41,5,0,27,2,1,0,0,1,0,2,1,1,1,0,0,0,0,0,0,9,9,0,0,0,0,...,0,2,21,0,4,3,0,1,3,0,0,9,1,0,0,0,0,0,5,0,0,0,0,0,2,2,0,0,0,0,1,0,0,0,0,0,3,2,0,1


Unnamed: 0_level_0,Total,English,Welsh/Cymraeg (in England only),Other UK language: Total,Other UK language: Gaelic (Irish),Other UK language: Gaelic (Scottish),Other UK language: Manx Gaelic,Other UK language: Gaelic (Not otherwise specified),Other UK language: Cornish,Other UK language: Scots,Other UK language: Gypsy/Traveller languages,French,Portuguese,Spanish,Other European Language (EU): Total,Other European Language (EU): Italian,Other European Language (EU): German,Other European Language (EU): Polish,Other European Language (EU): Slovak,Other European Language (EU): Czech,Other European Language (EU): Romanian,Other European Language (EU): Lithuanian,Other European Language (EU): Latvian,Other European Language (EU): Hungarian,Other European Language (EU): Bulgarian,Other European Language (EU): Greek,Other European Language (EU): Dutch,Other European Language (EU): Swedish,Other European Language (EU): Danish,Other European Language (EU): Finnish,Other European Language (EU): Estonian,Other European Language (EU): Slovenian,Other European Language (EU): Maltese,Other European Language (EU): Any other European Language (EU),Other European Language (non EU): Total,Other European Language (non EU): Albanian,Other European Language (non EU): Serbian/Croatian/Bosnian,Other European Language (non EU): Ukrainian,Other European Language (non EU): Any other Eastern European Language (non EU),Other European Language (non EU): Northern European Language (non EU),...,South Asian Language: Nepalese,South Asian Language: South Asian Language (all other),East Asian Language: Total,East Asian Language: Mandarin Chinese,East Asian Language: Cantonese Chinese,East Asian Language: All other Chinese,East Asian Language: Japanese,East Asian Language: Korean,East Asian Language: Vietnamese,East Asian Language: Thai,East Asian Language: Malay,East Asian Language: Tagalog/Filipino,East Asian Language: East Asian Language (all other),Oceanic/Australian language (any),North/South 
American language (any),Caribbean Creole: Total,Caribbean Creole: Caribbean Creole (English-based),Caribbean Creole: Caribbean Creole (all other),African Language: Total,African Language: Amharic,African Language: Tigrinya,African Language: Somali,African Language: Krio,African Language: Akan,African Language: Yoruba,African Language: Igbo,African Language: Swahili/Kiswahili,African Language: Luganda,African Language: Lingala,African Language: Shona,African Language: Afrikaans,African Language: Any other Nigerian language,African Language: West African language (all other),African Language: African language (all other),Other Languages: Total,Other Languages: All other languages,Sign Language: Total,Sign Language: British sign language,Sign Language: Sign Language (all other),Sign Language: Any Sign Communication System
WD22CD_proposed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
E05013714,10061,8760,0,1,1,0,0,0,0,0,0,134,51,37,250,34,6,72,1,3,11,49,2,16,16,14,16,3,1,2,2,0,2,0,51,45,4,1,1,0,...,13,1,98,4,14,39,1,0,22,7,2,8,1,0,0,0,0,0,149,6,4,29,4,22,22,18,3,8,1,4,0,8,8,12,9,9,9,8,0,1
E05013715,14787,12666,2,1,1,0,0,0,0,0,0,236,94,149,526,66,87,131,4,14,22,19,9,14,47,23,30,18,13,15,5,0,3,6,126,93,9,11,1,12,...,33,11,290,40,43,113,32,8,31,11,1,9,2,0,0,0,0,0,170,7,3,22,3,10,26,9,28,9,7,4,19,11,7,5,14,14,4,4,0,0
E05013716,15857,13153,4,7,6,0,0,0,0,1,0,323,157,240,795,131,131,207,14,13,21,42,6,52,31,35,30,35,14,20,4,3,1,5,66,24,12,8,0,22,...,6,8,367,36,44,123,68,22,49,9,11,3,2,0,0,4,2,2,201,6,10,37,0,24,44,21,6,1,0,6,8,13,15,10,16,16,10,10,0,0
E05013717,15437,13130,2,3,0,0,0,1,0,0,2,136,92,65,479,92,40,164,4,12,30,64,6,14,9,29,4,1,4,2,2,0,1,1,87,69,4,9,0,5,...,7,7,149,10,22,52,4,8,30,11,5,6,1,0,0,3,3,0,198,4,4,47,1,20,53,24,2,3,5,10,3,6,6,10,13,13,12,8,3,1
E05013718,14150,12216,4,7,6,0,0,1,0,0,0,163,71,145,688,120,69,289,9,15,43,33,1,23,16,22,18,16,5,3,4,0,1,1,41,26,5,3,0,7,...,2,1,167,37,12,33,29,0,24,19,2,10,1,1,0,4,4,0,123,2,4,14,3,19,19,13,7,3,3,10,13,7,1,5,10,10,12,9,0,3


'=== english_proficiency ==='

Unnamed: 0_level_0,Total,Main language is English,Main language is not English: Can speak English very well,Main language is not English: Can speak English well,Main language is not English: Cannot speak English well,Main language is not English: Cannot speak English
LSOA11CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E01003189,1593,1379,93,74,44,3
E01003190,1734,1495,106,92,35,6
E01003191,1390,1235,57,74,22,2
E01003192,1550,1312,99,93,42,4
E01003193,1506,1362,53,59,30,2


Unnamed: 0_level_0,Total,Main language is English,Main language is not English: Can speak English very well,Main language is not English: Can speak English well,Main language is not English: Cannot speak English well,Main language is not English: Cannot speak English
WD22CD_proposed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
E05013714,10061,8760,556,505,218,22
E05013715,14787,12666,1014,743,297,67
E05013716,15857,13153,1481,895,281,47
E05013717,15437,13130,955,865,436,51
E05013718,14150,12216,942,695,272,25


Reference geographies
--

In [None]:
# For filtering of source data
# Names of the geographies to keep; names missing from the download simply match nothing.
reference_names = ['United Kingdom', 'Great Britain', 'England and Wales',
                   'England', 'London', 'Lewisham']
years = [2011] # Just a precaution, in case any of the downloads include multiple periods

# For each reference download: tidy the column names, keep the wanted
# geographies and export them with 'Name' and 'Code' columns.
for datname in [#'main_language_detailed', 
                'english_proficiency']:
  IPython.display.display(f"=== {datname} ===")
  # Load the Nomis data
  d = pd.read_csv(f"{output_dir}/census11_{datname}_references.csv")

  # Simplify the column names.
  # regex=True is passed explicitly: from pandas 2.0 the default of
  # str.replace is literal matching, which would leave the verbose Nomis
  # headers untouched, so the rename to 'Total' below would not apply.
  d.columns = d.columns.str.replace(
      # Remove the redundant title prefix and qualifier suffix
      r'^.*?: (.*?); measures: Value',
      r'\1',
      regex=True)
  d.columns = d.columns.str.replace(
      # Remove a redundant qualifier (raw string: '\(' is not a valid str escape)
      r' \(English or Welsh (if )?in Wales\)', '',
      regex=True)
  d = d.rename(columns={
      'All usual residents aged 3 and over': 'Total',
      'All categories: Proficiency in English': 'Total'
  })

  # Filter & transform to index data
  ref = filter_nomis_data(d, years, reference_names).drop(columns=['date', 'Rural Urban'])
  ref = ref.rename(columns={'geography': 'Name', 'geography code': 'Code'})
  ref.to_csv(f"{output_dir}/references_{datname}.csv", index=False)
  IPython.display.display(ref.head())

'=== english_proficiency ==='

Unnamed: 0,Name,Code,Total,Main language is English,Main language is not English: Can speak English very well,Main language is not English: Can speak English well,Main language is not English: Cannot speak English well,Main language is not English: Cannot speak English
0,England,E92000001,51005610,46936780,1689406,1535579,709862,133983
1,London,E12000007,7809942,6083420,763502,643410,271693,47917
2,Lewisham,E09000023,262256,219035,19744,15828,6611,1038


Prep: tools for language groupings
==
We want to identify a) the top n languages and b) the top n language regions. This requires us to first review the existing data so that we can label and aggregate it accordingly.

In [25]:
# The full list of main languages and their number of speakers
# Column sums over all exported OAs: a Series indexed by language label.
ml = (
    pd.read_csv(f"{output_dir}/lbl_main_language_detailed_oa11.csv")
      .drop(columns=['OA11CD', 'Total'])
      .sum()
)

Top language regions
--

In [26]:
# # The full list of language labels, for review
# ml.index.values

In [27]:
import re

# Takes a language label from the ONS 2011 census data set
# Returns a global region
def get_language_region(label):
  """Map an ONS 2011 census language label to a global region label.

  Labels listed in the special-case table get a fixed region. Any label
  starting with 'Other Languages:' becomes 'Other Languages'. For labels of
  the form '[Other ]<region>: <language>' the '<region>' part is returned;
  labels without a ': ' separator come back unchanged.
  """
  # Labels that carry no region prefix of their own.
  unprefixed_regions = (
      (('English', 'Welsh/Cymraeg (in England only)'), 'UK language'),
      (('French', 'Portuguese', 'Spanish'), 'European Language (EU)'),
      (('Russian', 'Turkish', 'Arabic'), 'West/Central Asian Language'),
  )
  for members, region in unprefixed_regions:
    if label in members:
      return region
  if label.startswith('Other Languages:'):
    # Kept whole: the regex below would strip the leading "Other ".
    return 'Other Languages'
  # Otherwise: the text before the first ": ", minus an optional leading "Other ".
  return re.sub(r'^(Other )?(.*?): .*', r'\2', label)

In [28]:
# Apply the regions mapping to the full list of language labels in `ml`
# and count how many labels fall into each region (a check of the mapping).
pd.Series(ml.index).map(get_language_region).value_counts()

European Language (EU)                 23
African Language                       16
South Asian Language                   14
East Asian Language                    11
UK language                            10
West/Central Asian Language             9
European Language (non EU)              6
Sign Language                           4
European Language (non-national)        3
Caribbean Creole                        3
Other Languages                         2
North/South American language (any)     1
Oceanic/Australian language (any)       1
dtype: int64

Top languages
--

In [29]:
# The data includes partial subtotals (labels ending in ': Total').
# Collect them here so they can be removed from the language ranking.
subtotal_columns = ml.index[ml.index.str.endswith(': Total')].values.tolist()
subtotal_columns

['Other UK language: Total',
 'Other European Language (EU): Total',
 'Other European Language (non EU): Total',
 'Other European Language (non-national): Total',
 'West/Central Asian Language: Total',
 'South Asian Language: Total',
 'East Asian Language: Total',
 'Caribbean Creole: Total',
 'African Language: Total',
 'Other Languages: Total',
 'Sign Language: Total']

In [30]:
# Remove the subtotal rows collected in `subtotal_columns`, then rank the
# remaining labels by their summed count, largest first.
language_ranking = ml.drop(index=subtotal_columns).sort_values(ascending=False)
language_ranking.head(30)

English                                                      219035
Other European Language (EU): Polish                           4088
French                                                         3929
South Asian Language: Tamil                                    3338
Spanish                                                        2477
Turkish                                                        2227
Portuguese                                                     1966
East Asian Language: All other Chinese                         1791
Other European Language (EU): Italian                          1543
Other European Language (EU): Lithuanian                       1179
Other European Language (non EU): Albanian                     1040
East Asian Language: Vietnamese                                 990
Arabic                                                          964
Other European Language (EU): German                            952
Russian                                         

In [31]:
# Let's choose an arbitrary cutoff point: any language spoken by at least 500 people
language_ranking[language_ranking>=500].index.values

array(['English', 'Other European Language (EU): Polish', 'French',
       'South Asian Language: Tamil', 'Spanish', 'Turkish', 'Portuguese',
       'East Asian Language: All other Chinese',
       'Other European Language (EU): Italian',
       'Other European Language (EU): Lithuanian',
       'Other European Language (non EU): Albanian',
       'East Asian Language: Vietnamese', 'Arabic',
       'Other European Language (EU): German', 'Russian',
       'African Language: Somali', 'African Language: Yoruba',
       'Other European Language (EU): Bulgarian',
       'West/Central Asian Language: Persian/Farsi',
       'East Asian Language: Cantonese Chinese',
       'Other European Language (EU): Romanian',
       'South Asian Language: Bengali (with Sylheti and Chatgaya)',
       'African Language: Akan', 'South Asian Language: Urdu'],
      dtype=object)

In [32]:
# From there we curate a selection of languages, plus some groupings.

import re

# Takes a language label from the ONS 2011 census data set
# Returns a language label according to our custom grouping of common languages
def get_common_languages_label(label):
  """Map an ONS 2011 census language label to our custom common-languages label.

  Labels in the curated list below are returned with any '<region>: ' prefix
  removed (e.g. 'South Asian Language: Tamil' -> 'Tamil'). Every other label
  is returned as 'Other Languages'.
  """
  # Curated from the >=500 speakers selection, leaving out entries that are
  # subgroupings (i.e. 'East Asian Language: All other Chinese').
  common_languages = (
      'English', 'Other European Language (EU): Polish', 'French',
      'South Asian Language: Tamil', 'Spanish', 'Turkish', 'Portuguese',
      'Other European Language (EU): Italian',
      'Other European Language (EU): Lithuanian',
      'Other European Language (non EU): Albanian',
      'East Asian Language: Vietnamese', 'Arabic',
      'Other European Language (EU): German', 'Russian',
      'African Language: Somali', 'African Language: Yoruba',
      'Other European Language (EU): Bulgarian',
      'West/Central Asian Language: Persian/Farsi',
      'East Asian Language: Cantonese Chinese',
      'Other European Language (EU): Romanian',
      'South Asian Language: Bengali (with Sylheti and Chatgaya)',
      'African Language: Akan', 'South Asian Language: Urdu',
  )
  if label not in common_languages:
    # Single catch-all bucket (a per-region 'Other ...' label is not used).
    return 'Other Languages'
  # Strip the "<region>: " prefix, if there is one.
  return re.sub(r'(^.*: )?(.*)', r'\2', label)

In [33]:
# Apply the common-languages label to the full list of language labels in `ml`
# and count how many source labels end up under each output label.
pd.Series(ml.index).map(get_common_languages_label).value_counts()

Other Languages                        80
Italian                                 1
Portuguese                              1
Urdu                                    1
English                                 1
Tamil                                   1
Russian                                 1
Persian/Farsi                           1
Somali                                  1
Lithuanian                              1
Yoruba                                  1
German                                  1
Turkish                                 1
Vietnamese                              1
French                                  1
Cantonese Chinese                       1
Romanian                                1
Arabic                                  1
Spanish                                 1
Polish                                  1
Akan                                    1
Albanian                                1
Bengali (with Sylheti and Chatgaya)     1
Bulgarian                         

Derivatives
==

Main language (%)
--

In [34]:
def get_language_groups(d, label_group_func, geog_colname):
  """Aggregate per-language columns into groups chosen by a custom function.

  Args:
    d: a language data set, with one column per ONS language label plus the
      geography column.
    label_group_func: takes an ONS language label (a column name) and returns
      the label of the group that column is added to.
    geog_colname: name of the geographical index column in d, e.g. 'OA11CD'.

  Returns:
    A new DataFrame with the geography column followed by one column per group
    label, in order of first appearance. Each group column is the row-wise sum
    of the columns mapped to it. d itself is not modified.
  """
  # Source language list -- we exclude any subtotals included in the data
  # (columns ending in ': Total'), since adding them to their own members
  # would count those values twice.
  is_subtotal = d.columns.str.endswith(': Total')
  languages = [v for v in d.columns[~is_subtotal] if v != geog_colname]

  # Aggregate: each group column starts at 0 the first time its label is
  # seen, then every member column is added onto it.
  d_grouped = d[[geog_colname]].copy()
  for lang in languages:
    groupname = label_group_func(lang)
    if groupname not in d_grouped.columns:
      d_grouped[groupname] = 0
    d_grouped[groupname] += d[lang]

  return d_grouped

In [36]:
for geog, geog_colname in [('oa11', 'OA11CD'), ('wd22', 'WD22CD_proposed')]:
  IPython.display.display(f"=== {geog} ===")

  # Load the detailed main-language table for this geography.
  ml = pd.read_csv(f"{output_dir}/lbl_main_language_detailed_{geog}.csv")

  # Relative measures: every column other than the geography code and
  # 'Total' is expressed as a percentage of the row's 'Total'.
  shares = ml.drop(columns=[geog_colname, 'Total']).mul(100.).div(ml.Total, axis=0)
  d = pd.concat([ml[[geog_colname]], shares], axis=1)

  d.to_csv(f"{output_dir}/lbl_main_language_detailed_share_{geog}.csv", index=False)
  IPython.display.display(d.head())

  # Grouping 1: by language region.
  d_regions = get_language_groups(d, get_language_region, geog_colname)
  d_regions.to_csv(f"{output_dir}/lbl_main_language_regions_share_{geog}.csv", index=False)
  IPython.display.display(d_regions.head())

  # Grouping 2: common languages individually, everything else pooled.
  d_commonlangs = get_language_groups(d, get_common_languages_label, geog_colname)
  # Popping the column and assigning it back moves it to the end.
  d_commonlangs['Other Languages'] = d_commonlangs.pop('Other Languages')
  d_commonlangs.to_csv(f"{output_dir}/lbl_main_language_commonlangs_share_{geog}.csv", index=False)
  IPython.display.display(d_commonlangs.head())

'=== oa11 ==='

Unnamed: 0,OA11CD,English,Welsh/Cymraeg (in England only),Other UK language: Total,Other UK language: Gaelic (Irish),Other UK language: Gaelic (Scottish),Other UK language: Manx Gaelic,Other UK language: Gaelic (Not otherwise specified),Other UK language: Cornish,Other UK language: Scots,Other UK language: Gypsy/Traveller languages,French,Portuguese,Spanish,Other European Language (EU): Total,Other European Language (EU): Italian,Other European Language (EU): German,Other European Language (EU): Polish,Other European Language (EU): Slovak,Other European Language (EU): Czech,Other European Language (EU): Romanian,Other European Language (EU): Lithuanian,Other European Language (EU): Latvian,Other European Language (EU): Hungarian,Other European Language (EU): Bulgarian,Other European Language (EU): Greek,Other European Language (EU): Dutch,Other European Language (EU): Swedish,Other European Language (EU): Danish,Other European Language (EU): Finnish,Other European Language (EU): Estonian,Other European Language (EU): Slovenian,Other European Language (EU): Maltese,Other European Language (EU): Any other European Language (EU),Other European Language (non EU): Total,Other European Language (non EU): Albanian,Other European Language (non EU): Serbian/Croatian/Bosnian,Other European Language (non EU): Ukrainian,Other European Language (non EU): Any other Eastern European Language (non EU),Other European Language (non EU): Northern European Language (non EU),...,South Asian Language: Nepalese,South Asian Language: South Asian Language (all other),East Asian Language: Total,East Asian Language: Mandarin Chinese,East Asian Language: Cantonese Chinese,East Asian Language: All other Chinese,East Asian Language: Japanese,East Asian Language: Korean,East Asian Language: Vietnamese,East Asian Language: Thai,East Asian Language: Malay,East Asian Language: Tagalog/Filipino,East Asian Language: East Asian Language (all other),Oceanic/Australian language (any),North/South 
American language (any),Caribbean Creole: Total,Caribbean Creole: Caribbean Creole (English-based),Caribbean Creole: Caribbean Creole (all other),African Language: Total,African Language: Amharic,African Language: Tigrinya,African Language: Somali,African Language: Krio,African Language: Akan,African Language: Yoruba,African Language: Igbo,African Language: Swahili/Kiswahili,African Language: Luganda,African Language: Lingala,African Language: Shona,African Language: Afrikaans,African Language: Any other Nigerian language,African Language: West African language (all other),African Language: African language (all other),Other Languages: Total,Other Languages: All other languages,Sign Language: Total,Sign Language: British sign language,Sign Language: Sign Language (all other),Sign Language: Any Sign Communication System
0,E00016403,70.570571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.003003,2.702703,1.501502,6.006006,0.3003,0.0,5.405405,0.0,0.0,0.0,0.0,0.0,0.3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.600601,0.0,0.0,0.600601,0.0,0.0,...,0.0,0.0,3.303303,0.0,0.0,1.801802,0.0,0.0,1.501502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.207207,0.0,0.0,1.801802,0.0,2.102102,0.900901,0.0,0.0,0.0,0.0,0.900901,0.0,0.0,1.201201,0.3003,0.3003,0.3003,0.0,0.0,0.0,0.0
1,E00016437,77.514793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.564103,0.0,1.775148,2.169625,0.197239,0.0,0.394477,0.197239,0.197239,0.0,0.986193,0.0,0.0,0.0,0.0,0.197239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.366864,2.366864,0.0,0.0,0.0,0.0,...,0.0,0.0,7.495069,0.0,0.394477,3.353057,0.0,0.0,3.747535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.761341,0.0,0.0,0.197239,0.0,0.591716,0.0,0.788955,0.0,0.0,0.394477,0.197239,0.0,0.0,0.0,0.591716,0.0,0.0,0.0,0.0,0.0,0.0
2,E00016439,74.566474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.069364,3.468208,2.023121,4.624277,0.867052,0.289017,0.0,0.0,1.445087,0.0,1.156069,0.0,0.0,0.289017,0.0,0.0,0.289017,0.0,0.0,0.289017,0.0,0.0,0.0,0.867052,0.867052,0.0,0.0,0.0,0.0,...,0.0,0.0,1.734104,0.0,0.0,0.578035,0.0,0.0,1.156069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.890173,0.0,0.0,1.156069,0.0,0.289017,0.289017,0.289017,0.578035,0.0,0.0,0.289017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,E00016442,78.527607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.680982,1.533742,1.840491,3.680982,1.226994,0.613497,1.226994,0.0,0.0,0.0,0.306748,0.0,0.0,0.306748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.294479,0.0,0.306748,0.613497,0.0,0.0,2.760736,0.0,0.613497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.374233,0.0,0.0,0.306748,0.306748,0.0,1.840491,0.613497,0.0,0.0,0.0,0.0,0.0,0.306748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,E00016399,80.630631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.675676,0.45045,1.801802,2.477477,0.900901,0.0,0.45045,0.0,0.0,0.225225,0.45045,0.0,0.0,0.0,0.45045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.351351,1.351351,0.0,0.0,0.0,0.0,...,0.0,0.225225,4.279279,0.0,0.0,1.576577,0.0,0.0,2.702703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.153153,0.0,0.0,1.351351,0.0,0.900901,0.225225,0.0,0.0,0.225225,0.0,0.0,0.0,0.0,0.45045,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,OA11CD,UK language,European Language (EU),European Language (non EU),European Language (non-national),West/Central Asian Language,South Asian Language,East Asian Language,Oceanic/Australian language (any),North/South American language (any),Caribbean Creole,African Language,Other Languages,Sign Language
0,E00016403,70.570571,13.213213,0.600601,0.0,1.501502,3.303303,3.303303,0.0,0.0,0.0,7.207207,0.3003,0.0
1,E00016437,77.514793,6.508876,2.366864,0.0,0.986193,2.366864,7.495069,0.0,0.0,0.0,2.761341,0.0,0.0
2,E00016439,74.566474,16.184971,0.867052,0.0,2.601156,1.156069,1.734104,0.0,0.0,0.0,2.890173,0.0,0.0
3,E00016442,78.527607,10.736196,0.0,0.0,0.613497,2.453988,4.294479,0.0,0.0,0.0,3.374233,0.0,0.0
4,E00016399,80.630631,5.405405,1.351351,0.0,3.603604,1.576577,4.279279,0.0,0.0,0.0,3.153153,0.0,0.0


Unnamed: 0,OA11CD,English,French,Portuguese,Spanish,Italian,German,Polish,Romanian,Lithuanian,Bulgarian,Albanian,Russian,Turkish,Arabic,Persian/Farsi,Urdu,Bengali (with Sylheti and Chatgaya),Tamil,Cantonese Chinese,Vietnamese,Somali,Akan,Yoruba,Other Languages
0,E00016403,70.570571,3.003003,2.702703,1.501502,0.3003,0.0,5.405405,0.0,0.0,0.0,0.0,0.0,0.0,1.501502,0.0,0.0,0.600601,1.501502,0.0,1.501502,1.801802,2.102102,0.900901,6.606607
1,E00016437,77.514793,2.564103,0.0,1.775148,0.197239,0.0,0.394477,0.0,0.986193,0.0,2.366864,0.0,0.394477,0.591716,0.0,0.788955,0.0,0.197239,0.394477,3.747535,0.197239,0.591716,0.0,7.29783
2,E00016439,74.566474,6.069364,3.468208,2.023121,0.867052,0.289017,0.0,0.0,1.156069,0.289017,0.867052,0.289017,0.578035,0.0,1.734104,0.0,0.289017,0.867052,0.0,1.156069,1.156069,0.289017,0.289017,3.757225
3,E00016442,78.527607,3.680982,1.533742,1.840491,1.226994,0.613497,1.226994,0.0,0.306748,0.306748,0.0,0.0,0.0,0.306748,0.306748,0.0,0.613497,1.533742,0.306748,2.760736,0.306748,0.0,1.840491,2.760736
4,E00016399,80.630631,0.675676,0.45045,1.801802,0.900901,0.0,0.45045,0.225225,0.45045,0.0,1.351351,0.45045,0.675676,0.45045,1.576577,0.900901,0.0,0.0,0.0,2.702703,1.351351,0.900901,0.225225,3.828829


'=== wd22 ==='

Unnamed: 0,WD22CD_proposed,English,Welsh/Cymraeg (in England only),Other UK language: Total,Other UK language: Gaelic (Irish),Other UK language: Gaelic (Scottish),Other UK language: Manx Gaelic,Other UK language: Gaelic (Not otherwise specified),Other UK language: Cornish,Other UK language: Scots,Other UK language: Gypsy/Traveller languages,French,Portuguese,Spanish,Other European Language (EU): Total,Other European Language (EU): Italian,Other European Language (EU): German,Other European Language (EU): Polish,Other European Language (EU): Slovak,Other European Language (EU): Czech,Other European Language (EU): Romanian,Other European Language (EU): Lithuanian,Other European Language (EU): Latvian,Other European Language (EU): Hungarian,Other European Language (EU): Bulgarian,Other European Language (EU): Greek,Other European Language (EU): Dutch,Other European Language (EU): Swedish,Other European Language (EU): Danish,Other European Language (EU): Finnish,Other European Language (EU): Estonian,Other European Language (EU): Slovenian,Other European Language (EU): Maltese,Other European Language (EU): Any other European Language (EU),Other European Language (non EU): Total,Other European Language (non EU): Albanian,Other European Language (non EU): Serbian/Croatian/Bosnian,Other European Language (non EU): Ukrainian,Other European Language (non EU): Any other Eastern European Language (non EU),Other European Language (non EU): Northern European Language (non EU),...,South Asian Language: Nepalese,South Asian Language: South Asian Language (all other),East Asian Language: Total,East Asian Language: Mandarin Chinese,East Asian Language: Cantonese Chinese,East Asian Language: All other Chinese,East Asian Language: Japanese,East Asian Language: Korean,East Asian Language: Vietnamese,East Asian Language: Thai,East Asian Language: Malay,East Asian Language: Tagalog/Filipino,East Asian Language: East Asian Language (all other),Oceanic/Australian language 
(any),North/South American language (any),Caribbean Creole: Total,Caribbean Creole: Caribbean Creole (English-based),Caribbean Creole: Caribbean Creole (all other),African Language: Total,African Language: Amharic,African Language: Tigrinya,African Language: Somali,African Language: Krio,African Language: Akan,African Language: Yoruba,African Language: Igbo,African Language: Swahili/Kiswahili,African Language: Luganda,African Language: Lingala,African Language: Shona,African Language: Afrikaans,African Language: Any other Nigerian language,African Language: West African language (all other),African Language: African language (all other),Other Languages: Total,Other Languages: All other languages,Sign Language: Total,Sign Language: British sign language,Sign Language: Sign Language (all other),Sign Language: Any Sign Communication System
0,E05013714,87.06888,0.0,0.009939,0.009939,0.0,0.0,0.0,0.0,0.0,0.0,1.331876,0.506908,0.367757,2.484842,0.337939,0.059636,0.715635,0.009939,0.029818,0.109333,0.487029,0.019879,0.15903,0.15903,0.139151,0.15903,0.029818,0.009939,0.019879,0.019879,0.0,0.019879,0.0,0.506908,0.447272,0.039757,0.009939,0.009939,0.0,...,0.129212,0.009939,0.974058,0.039757,0.139151,0.387635,0.009939,0.0,0.218666,0.069576,0.019879,0.079515,0.009939,0.0,0.0,0.0,0.0,0.0,1.480966,0.059636,0.039757,0.288242,0.039757,0.218666,0.218666,0.178909,0.029818,0.079515,0.009939,0.039757,0.0,0.079515,0.079515,0.119272,0.089454,0.089454,0.089454,0.079515,0.0,0.009939
1,E05013715,85.65632,0.013525,0.006763,0.006763,0.0,0.0,0.0,0.0,0.0,0.0,1.595996,0.635694,1.007642,3.557179,0.446338,0.588355,0.885913,0.027051,0.094678,0.148779,0.128491,0.060864,0.094678,0.317847,0.155542,0.202881,0.121729,0.087915,0.10144,0.033813,0.0,0.020288,0.040576,0.8521,0.628931,0.060864,0.07439,0.006763,0.081152,...,0.223169,0.07439,1.961182,0.270508,0.290796,0.764185,0.216406,0.054102,0.209644,0.07439,0.006763,0.060864,0.013525,0.0,0.0,0.0,0.0,0.0,1.149658,0.047339,0.020288,0.148779,0.020288,0.067627,0.17583,0.060864,0.189356,0.060864,0.047339,0.027051,0.128491,0.07439,0.047339,0.033813,0.094678,0.094678,0.027051,0.027051,0.0,0.0
2,E05013716,82.947594,0.025225,0.044145,0.037838,0.0,0.0,0.0,0.0,0.006306,0.0,2.036955,0.990099,1.513527,5.013559,0.826134,0.826134,1.305417,0.088289,0.081983,0.132434,0.264867,0.037838,0.327931,0.195497,0.220723,0.189191,0.220723,0.088289,0.126127,0.025225,0.018919,0.006306,0.031532,0.41622,0.151353,0.075676,0.050451,0.0,0.13874,...,0.037838,0.050451,2.314435,0.227029,0.27748,0.775683,0.428833,0.13874,0.309012,0.056757,0.06937,0.018919,0.012613,0.0,0.0,0.025225,0.012613,0.012613,1.267579,0.037838,0.063064,0.233335,0.0,0.151353,0.27748,0.132434,0.037838,0.006306,0.0,0.037838,0.050451,0.081983,0.094595,0.063064,0.100902,0.100902,0.063064,0.063064,0.0,0.0
3,E05013717,85.055386,0.012956,0.019434,0.0,0.0,0.0,0.006478,0.0,0.0,0.012956,0.881,0.595971,0.421066,3.102935,0.595971,0.259118,1.062383,0.025912,0.077735,0.194338,0.414588,0.038868,0.090691,0.058301,0.18786,0.025912,0.006478,0.025912,0.012956,0.012956,0.0,0.006478,0.006478,0.563581,0.446978,0.025912,0.058301,0.0,0.03239,...,0.045346,0.045346,0.965213,0.064779,0.142515,0.336853,0.025912,0.051824,0.194338,0.071257,0.03239,0.038868,0.006478,0.0,0.0,0.019434,0.019434,0.0,1.282633,0.025912,0.025912,0.304463,0.006478,0.129559,0.343331,0.155471,0.012956,0.019434,0.03239,0.064779,0.019434,0.038868,0.038868,0.064779,0.084213,0.084213,0.077735,0.051824,0.019434,0.006478
4,E05013718,86.332155,0.028269,0.04947,0.042403,0.0,0.0,0.007067,0.0,0.0,0.0,1.151943,0.501767,1.024735,4.862191,0.848057,0.487633,2.042403,0.063604,0.106007,0.303887,0.233216,0.007067,0.162544,0.113074,0.155477,0.127208,0.113074,0.035336,0.021201,0.028269,0.0,0.007067,0.007067,0.289753,0.183746,0.035336,0.021201,0.0,0.04947,...,0.014134,0.007067,1.180212,0.261484,0.084806,0.233216,0.204947,0.0,0.169611,0.134276,0.014134,0.070671,0.007067,0.007067,0.0,0.028269,0.028269,0.0,0.869258,0.014134,0.028269,0.09894,0.021201,0.134276,0.134276,0.091873,0.04947,0.021201,0.021201,0.070671,0.091873,0.04947,0.007067,0.035336,0.070671,0.070671,0.084806,0.063604,0.0,0.021201


Unnamed: 0,WD22CD_proposed,UK language,European Language (EU),European Language (non EU),European Language (non-national),West/Central Asian Language,South Asian Language,East Asian Language,Oceanic/Australian language (any),North/South American language (any),Caribbean Creole,African Language,Other Languages,Sign Language
0,E05013714,87.078819,4.691383,0.506908,0.0,2.514661,2.574297,0.974058,0.0,0.0,0.0,1.480966,0.089454,0.089454
1,E05013715,85.676608,6.79651,0.8521,0.0,1.697437,1.744776,1.961182,0.0,0.0,0.0,1.149658,0.094678,0.027051
2,E05013716,83.016964,9.55414,0.41622,0.0,1.809926,1.431544,2.314435,0.0,0.0,0.025225,1.267579,0.100902,0.063064
3,E05013717,85.087776,5.000972,0.563581,0.0,2.630045,4.288398,0.965213,0.0,0.0,0.019434,1.282633,0.084213,0.077735
4,E05013718,86.409894,7.540636,0.289753,0.0,1.787986,1.731449,1.180212,0.007067,0.0,0.028269,0.869258,0.070671,0.084806


Unnamed: 0,WD22CD_proposed,English,French,Portuguese,Spanish,Italian,German,Polish,Romanian,Lithuanian,Bulgarian,Albanian,Russian,Turkish,Arabic,Persian/Farsi,Urdu,Bengali (with Sylheti and Chatgaya),Tamil,Cantonese Chinese,Vietnamese,Somali,Akan,Yoruba,Other Languages
0,E05013714,87.06888,1.331876,0.506908,0.367757,0.337939,0.059636,0.715635,0.109333,0.487029,0.15903,0.447272,0.298181,1.033694,0.616241,0.188848,0.089454,0.178909,1.987874,0.139151,0.218666,0.288242,0.218666,0.218666,2.932114
1,E05013715,85.65632,1.595996,0.635694,1.007642,0.446338,0.588355,0.885913,0.148779,0.128491,0.317847,0.628931,0.331372,0.770947,0.209644,0.128491,0.10144,0.182593,0.818286,0.290796,0.209644,0.148779,0.067627,0.17583,4.524244
2,E05013716,82.947594,2.036955,0.990099,1.513527,0.826134,0.826134,1.305417,0.132434,0.264867,0.195497,0.151353,0.365769,0.674781,0.302705,0.126127,0.176578,0.252255,0.365769,0.27748,0.309012,0.233335,0.151353,0.27748,5.297345
3,E05013717,85.055386,0.881,0.595971,0.421066,0.595971,0.259118,1.062383,0.194338,0.414588,0.058301,0.446978,0.647794,1.146596,0.213772,0.304463,0.297985,0.310941,3.064067,0.142515,0.194338,0.304463,0.129559,0.343331,2.915074
4,E05013718,86.332155,1.151943,0.501767,1.024735,0.848057,0.487633,2.042403,0.303887,0.233216,0.113074,0.183746,0.162544,0.925795,0.34629,0.155477,0.332155,0.141343,0.628975,0.084806,0.169611,0.09894,0.134276,0.134276,3.462898


Lewisham English proficiency (%)
--

In [37]:
def english_proficiency_share(d, fixed_columns, total_colname):
  """Compute relative measures: each value column as a percentage of the total.

  Args:
    d: table holding the fixed columns, the total column and the value columns.
    fixed_columns: list of column names carried over unchanged.
    total_colname: name of the column used as the denominator; it is not
      part of the result.

  Returns:
    A new DataFrame with the fixed columns followed by every remaining column
    of d converted to `value * 100 / total`.
  """
  passthrough = d[fixed_columns]
  values = d.drop(columns=fixed_columns + [total_colname])
  # Scale first, then divide row-wise by the total (axis=0 aligns on the index).
  shares = values.mul(100.).div(d[total_colname], axis=0)
  return pd.concat([passthrough, shares], axis=1)

def coarse_english_proficiency_share(ds, fixed_columns):
  """Allocate the detailed English proficiency shares into three broader groups.

  Args:
    ds: table of proficiency shares with a 'Main language is English' column
      and columns whose names contain 'Can speak English' or
      'Cannot speak English'.
    fixed_columns: list of column names carried over unchanged.

  Returns:
    A new DataFrame with the fixed columns plus three columns:
    'English is main language', 'Can speak English well or very well' and
    'Cannot speak English well or at all'.
  """
  ds_coarse = ds[fixed_columns].copy()
  ds_coarse['English is main language'] = ds['Main language is English']

  # These span multiple columns, so we aggregate across them.
  # The column names are taken from ds itself (not from a notebook-level
  # variable), so the function works regardless of what the caller named its
  # frame. The match is a literal substring test: 'Cannot speak English...'
  # does not contain 'Can speak English', so the two selections are disjoint.
  can_cols = ds.columns[ds.columns.str.contains('Can speak English', regex=False)]
  cannot_cols = ds.columns[ds.columns.str.contains('Cannot speak English', regex=False)]
  ds_coarse['Can speak English well or very well'] = ds[can_cols].sum(axis=1)
  ds_coarse['Cannot speak English well or at all'] = ds[cannot_cols].sum(axis=1)
  return ds_coarse

In [38]:
for geog, geog_colname in [('oa11', 'OA11CD'), ('wd22', 'WD22CD_proposed')]:
  IPython.display.display(f"=== {geog} ===")

  # The geography code is the only column carried through unchanged.
  id_columns = [geog_colname]
  ep = pd.read_csv(f"{output_dir}/lbl_english_proficiency_{geog}.csv")

  # Relative measures: each proficiency column as a percentage of 'Total'.
  d = english_proficiency_share(ep, id_columns, 'Total')
  d.to_csv(f"{output_dir}/lbl_english_proficiency_share_{geog}.csv", index=False)
  IPython.display.display(d.head())

  # Coarse segmentation: collapse the detailed shares into three groups.
  d_coarse = coarse_english_proficiency_share(d, id_columns)
  d_coarse.to_csv(f"{output_dir}/lbl_english_proficiency_share_coarse_{geog}.csv", index=False)
  IPython.display.display(d_coarse.head())

'=== oa11 ==='

Unnamed: 0,OA11CD,Main language is English,Main language is not English: Can speak English very well,Main language is not English: Can speak English well,Main language is not English: Cannot speak English well,Main language is not English: Cannot speak English
0,E00016403,70.570571,11.411411,12.912913,4.804805,0.3003
1,E00016437,77.514793,5.719921,11.637081,4.142012,0.986193
2,E00016439,74.566474,11.560694,9.537572,4.046243,0.289017
3,E00016442,78.527607,10.429448,7.055215,3.680982,0.306748
4,E00016399,80.630631,6.981982,6.756757,4.954955,0.675676


Unnamed: 0,OA11CD,English is main language,Can speak English well or very well,Cannot speak English well or at all
0,E00016403,70.570571,24.324324,5.105105
1,E00016437,77.514793,17.357002,5.128205
2,E00016439,74.566474,21.098266,4.33526
3,E00016442,78.527607,17.484663,3.98773
4,E00016399,80.630631,13.738739,5.630631


'=== wd22 ==='

Unnamed: 0,WD22CD_proposed,Main language is English,Main language is not English: Can speak English very well,Main language is not English: Can speak English well,Main language is not English: Cannot speak English well,Main language is not English: Cannot speak English
0,E05013714,87.06888,5.52629,5.019382,2.166783,0.218666
1,E05013715,85.65632,6.857375,5.024684,2.008521,0.453101
2,E05013716,82.947594,9.339724,5.644195,1.772088,0.296399
3,E05013717,85.055386,6.186435,5.60342,2.824383,0.330375
4,E05013718,86.332155,6.657244,4.911661,1.922261,0.176678


Unnamed: 0,WD22CD_proposed,English is main language,Can speak English well or very well,Cannot speak English well or at all
0,E05013714,87.06888,10.545671,2.385449
1,E05013715,85.65632,11.882059,2.461622
2,E05013716,82.947594,14.983919,2.068487
3,E05013717,85.055386,11.789856,3.154758
4,E05013718,86.332155,11.568905,2.09894


Reference geo English proficiency (%)
--

In [None]:
# Name and code identify each reference geography and are carried through unchanged.
id_columns = ['Name', 'Code']
ep = pd.read_csv(f"{output_dir}/references_english_proficiency.csv")

# Relative measures: each proficiency column as a percentage of 'Total'.
d = english_proficiency_share(ep, id_columns, 'Total')
d.to_csv(f"{output_dir}/references_english_proficiency_share.csv", index=False)
IPython.display.display(d.head())

# Coarse segmentation: collapse the detailed shares into three groups.
d_coarse = coarse_english_proficiency_share(d, id_columns)
d_coarse.to_csv(f"{output_dir}/references_english_proficiency_share_coarse.csv", index=False)
IPython.display.display(d_coarse.head())

Unnamed: 0,Name,Code,Main language is English,Main language is not English: Can speak English very well,Main language is not English: Can speak English well,Main language is not English: Cannot speak English well,Main language is not English: Cannot speak English
0,England,E92000001,92.022779,3.312196,3.010608,1.391733,0.262683
1,London,E12000007,77.89328,9.776027,8.238345,3.478809,0.613538
2,Lewisham,E09000023,83.519538,7.528522,6.035324,2.520819,0.395796


Unnamed: 0,Name,Code,English is main language,Can speak English well or very well,Cannot speak English well or at all
0,England,E92000001,92.022779,6.322804,1.654416
1,London,E12000007,77.89328,18.014372,4.092348
2,Lewisham,E09000023,83.519538,13.563846,2.916616


Results
--

In [39]:
# List the contents of the output folder (sizes and timestamps) to review the exported files.
!ls -lh '{output_dir}'

total 9.0M
-rw------- 1 root root 1.4M Nov 15 14:13 census11_english_proficiency_oa11_london.csv
-rw------- 1 root root  968 Nov 16 13:45 census11_english_proficiency_references.csv
-rw------- 1 root root 6.1M Nov 15 14:13 census11_main_language_detailed_oa11_london.csv
-rw------- 1 root root 5.6K Dec  8 14:22 lbl_english_proficiency_lsoa11.csv
-rw------- 1 root root  25K Dec  8 14:22 lbl_english_proficiency_oa11.csv
-rw------- 1 root root  55K Dec  8 14:51 lbl_english_proficiency_share_coarse_oa11.csv
-rw------- 1 root root 1.4K Dec  8 14:51 lbl_english_proficiency_share_coarse_wd22.csv
-rw------- 1 root root  81K Dec  8 14:51 lbl_english_proficiency_share_oa11.csv
-rw------- 1 root root 2.2K Dec  8 14:51 lbl_english_proficiency_share_wd22.csv
-rw------- 1 root root  980 Dec  8 14:22 lbl_english_proficiency_wd22.csv
-rw------- 1 root root 249K Dec  8 14:50 lbl_main_language_commonlangs_share_oa11.csv
-rw------- 1 root root 9.0K Dec  8 14:50 lbl_main_language_commonlangs_share_wd22.csv