In [2]:
import pandas as pd

# Load the UK census population table into a DataFrame.
# The first 5 lines of the file are skipped, so the 6th line supplies the column names.
df_pop = pd.read_csv(
    'L1_data/UK_census_population.csv',
    header=0,          # the first line left after skipping is the header row
    skiprows=5,        # Skip the first 5 rows. Why?
    thousands=',',     # values such as "1,234" are parsed as the number 1234
)

In [4]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets Jupyter render it as a table instead of plain text that wraps wide frames.
df_pop.head()

   Area code          Area name Area type  Population 2011  Population 2021  \
0  K04000001  England and Wales  National       56075912.0       59597542.0   
1  E92000001            England   Country       53012456.0       56490048.0   
2  W92000004              Wales   Country        3063456.0        3107494.0   
3  E12000001         North East    Region        2596886.0        2647013.0   
4  E12000002         North West    Region        7052177.0        7417397.0   

   Percentage change  
0                6.3  
1                6.6  
2                1.4  
3                1.9  
4                5.2  


In [7]:
# Column labels as a plain Python list.
df_pop.columns.tolist()

['Area code',
 'Area name',
 'Area type',
 'Population 2011',
 'Population 2021',
 'Percentage change']

In [6]:
# Summary statistics for the numeric columns. The frame is left as the cell's
# last expression so Jupyter renders it as a table rather than printed text.
df_pop.describe()

       Population 2011  Population 2021  Percentage change
count     3.670000e+02     3.670000e+02         367.000000
mean      6.563517e+05     6.981750e+05           6.063488
std       4.108409e+06     4.372456e+06           4.550581
min       2.203000e+03     2.054000e+03          -9.600000
25%       1.016095e+05     1.075035e+05           3.000000
50%       1.402020e+05     1.501580e+05           5.900000
75%       2.563950e+05     2.788290e+05           8.850000
max       5.607591e+07     5.959754e+07          22.100000


In [8]:
# Normalise the column labels: each run of whitespace becomes one underscore,
# so the columns can be used with attribute access (df_pop.Area_type).
df_pop.columns = df_pop.columns.str.replace(pat=r'\s+', repl='_', regex=True)
print(df_pop.columns.tolist())  # confirm the new labels

['Area_code', 'Area_name', 'Area_type', 'Population_2011', 'Population_2021', 'Percentage_change']


In [9]:
# Number of rows for each kind of area.
df_pop['Area_type'].value_counts()

Local Authority    355
Region               9
Country              2
National             1
Name: Area_type, dtype: int64

In [10]:
# Overview of the frame: row count, per-column non-null counts and dtypes.
df_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area_code          368 non-null    object 
 1   Area_name          367 non-null    object 
 2   Area_type          367 non-null    object 
 3   Population_2011    367 non-null    float64
 4   Population_2021    367 non-null    float64
 5   Percentage_change  367 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.4+ KB


In [14]:
# Summary of every column, including the non-numeric ones (count/unique/top/freq).
# The frame is left as the cell's last expression so Jupyter renders it as a
# table instead of plain text that wraps across several blocks.
df_pop.describe(include='all')

        Area_code          Area_name        Area_type  Population_2011  \
count         368                367              367     3.670000e+02   
unique        368                367                4              NaN   
top     K04000001  England and Wales  Local Authority              NaN   
freq            1                  1              355              NaN   
mean          NaN                NaN              NaN     6.563517e+05   
std           NaN                NaN              NaN     4.108409e+06   
min           NaN                NaN              NaN     2.203000e+03   
25%           NaN                NaN              NaN     1.016095e+05   
50%           NaN                NaN              NaN     1.402020e+05   
75%           NaN                NaN              NaN     2.563950e+05   
max           NaN                NaN              NaN     5.607591e+07   

        Population_2021  Percentage_change  
count      3.670000e+02         367.000000  
unique              N

In [5]:
# Summary statistics for the numeric columns, shown through Jupyter's rich
# table display (last expression of the cell) rather than print().
df_pop.describe()

       Population 2011  Population 2021  Percentage change
count     3.670000e+02     3.670000e+02         367.000000
mean      6.563517e+05     6.981750e+05           6.063488
std       4.108409e+06     4.372456e+06           4.550581
min       2.203000e+03     2.054000e+03          -9.600000
25%       1.016095e+05     1.075035e+05           3.000000
50%       1.402020e+05     1.501580e+05           5.900000
75%       2.563950e+05     2.788290e+05           8.850000
max       5.607591e+07     5.959754e+07          22.100000


In [15]:
# Replace each run of whitespace in the column labels with an underscore.
# Labels that contain no whitespace are left unchanged, so re-running is harmless.
df_pop.columns = df_pop.columns.str.replace(pat=r'\s+', repl='_', regex=True)

In [16]:
# Print the row count, per-column non-null counts and dtypes.
df_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area_code          368 non-null    object 
 1   Area_name          367 non-null    object 
 2   Area_type          367 non-null    object 
 3   Population_2011    367 non-null    float64
 4   Population_2021    367 non-null    float64
 5   Percentage_change  367 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.4+ KB


In [18]:
# Keep only the rows whose area type is 'Local Authority'.
df_pop_la = df_pop.loc[df_pop['Area_type'].eq('Local Authority')]

In [20]:
# Tukey's rule on the 2021 local-authority populations: a value is flagged as an
# outlier when it lies more than 1.5 * IQR below Q1 or above Q3.
pop_2021 = df_pop_la['Population_2021']

# Quartiles and interquartile range
Q1 = pop_2021.quantile(0.25)
Q3 = pop_2021.quantile(0.75)
IQR = Q3 - Q1

# Fences
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly outside either fence
is_outlier = pop_2021.lt(lower_bound) | pop_2021.gt(upper_bound)
outliers = df_pop_la.loc[is_outlier]

print("Lower bound:", lower_bound)
print("Upper bound:", upper_bound)
print("How many outliers?", len(outliers))
print("Outliers:\n", outliers)

Lower bound: -130408.0
Upper bound: 498868.0
How many outliers? 33
Outliers:
      Area_code        Area_name        Area_type  Population_2011  \
56   E06000047    County Durham  Local Authority         513242.0   
60   E06000052         Cornwall  Local Authority         532273.0   
62   E06000054        Wiltshire  Local Authority         470981.0   
68   E06000060  Buckinghamshire  Local Authority         505283.0   
254  E08000003       Manchester  Local Authority         503127.0   
270  E08000019        Sheffield  Local Authority         552698.0   
275  E08000025       Birmingham  Local Authority        1073045.0   
282  E08000032         Bradford  Local Authority         522452.0   
285  E08000035            Leeds  Local Authority         751485.0   
321  E10000003   Cambridgeshire  Local Authority         621210.0   
322  E10000006          Cumbria  Local Authority         499858.0   
323  E10000007       Derbyshire  Local Authority         769686.0   
324  E10000008           