In [2]:
import pandas as pd
import numpy as np

In [8]:
# Load the raw CSV. The file is badly formatted, so generic column names are
# supplied explicitly instead of letting pandas try to infer a header row.
column_names = [f'col_{i}' for i in range(5)]

df = pd.read_csv(
    'raw_data/Gaeltacht, Age.csv',
    names=column_names,
    encoding='iso8859_14',
)

In [9]:
# Preview the first 10 rows to see how the raw sheet is laid out
df.head(10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,Percentage of Irish Speakers and Non-Irish Spe...,,,,
1,"Over 2011 to 2016 by Gaeltacht Areas, Age Grou...",,,,
2,and Census Year,,,,
3,,,,2011.0,2016.0
4,State,,,,
5,,All ages,,,
6,,,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
7,,,Irish Speakers (Number),1774437.0,1761420.0
8,,,Non-Irish Speakers (Number),2507312.0,2667945.0
9,,,Not Stated (Number),88882.0,139896.0


In [10]:
# Discard the three title rows at the top of the sheet (index labels 0, 1 and 2)
df = df.drop(index=[0, 1, 2])
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
3,,,,2011.0,2016.0
4,State,,,,
5,,All ages,,,
6,,,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
7,,,Irish Speakers (Number),1774437.0,1761420.0


In [11]:
# Features will be built from the entries in col_0, col_1 and col_2, so first
# list the distinct values each of those columns holds, starting with col_0.
df['col_0'].unique()

array([' ', 'State', 'All Gaeltacht Areas', 'Cork Gaeltacht Areas',
       'Donegal Gaeltacht Areas', 'Galway City Gaeltacht Areas',
       'Galway County Gaeltacht Areas', 'Kerry Gaeltacht Areas',
       'Mayo Gaeltacht Areas', 'Meath Gaeltacht Areas',
       'Waterford  Gaeltacht Areas',
       'When excluding not stated, people with an ability to speak Irish made ',
       'up 39.8% of the population in 2016 '], dtype=object)

In [12]:
# Distinct values held in col_1
df['col_1'].unique()

array([' ', nan, 'All ages', '3 - 4 years', '5 - 9 years',
       '10 - 14 years', '15 - 19 years', '20 - 24 years', '25 - 29 years',
       '30 - 34 years', '35 - 39 years', '40 - 44 years', '45 - 49 years',
       '50 - 54 years', '55 - 59 years', '60 - 64 years', '65 - 69 years',
       '70 - 74 years', '75 - 79 years', '80 - 84 years',
       '85 years and over'], dtype=object)

In [13]:
# Distinct values held in col_2
df['col_2'].unique()

array([' ', nan, 'Population Aged 3 Years and Over (Number)',
       'Irish Speakers (Number)', 'Non-Irish Speakers (Number)',
       'Not Stated (Number)',
       'Irish speakers as a percentage of total (%)'], dtype=object)

In [14]:
# Turn every empty or whitespace-only cell into np.nan so that the missing-value
# methods (fillna and friends) can deal with them.
df = df.replace(r'^\s*$', np.nan, regex=True)
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
3,,,,2011.0,2016.0
4,State,,,,
5,,All ages,,,
6,,,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
7,,,Irish Speakers (Number),1774437.0,1761420.0


In [15]:
# Forward-fill the three label columns (col_0, col_1, col_2) so that every data
# row carries the most recent heading that appears above it in each column.
# .ffill() is used because fillna(method='ffill') is deprecated in pandas >= 2.1.
label_cols = ['col_0', 'col_1', 'col_2']
df[label_cols] = df[label_cols].ffill()
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
3,,,,2011.0,2016.0
4,State,,,,
5,State,All ages,,,
6,State,All ages,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
7,State,All ages,Irish Speakers (Number),1774437.0,1761420.0


In [16]:
# Drop every row that has no figure in either value column (col_3 / col_4),
# e.g. the heading-only rows labelled 4 and 5 shown above. Selecting the rows by
# content rather than by hard-coded index label removes all such rows at once
# and keeps this cell safe to re-run (drop([4, 5]) raises KeyError the second time).
df = df.dropna(subset=['col_3', 'col_4'], how='all')
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
3,,,,2011.0,2016.0
6,State,All ages,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
7,State,All ages,Irish Speakers (Number),1774437.0,1761420.0
8,State,All ages,Non-Irish Speakers (Number),2507312.0,2667945.0
9,State,All ages,Not Stated (Number),88882.0,139896.0


In [17]:
# The first remaining row (label 3) still has NaN in its three label columns;
# tag all of them as 'Year' by hand.
df.loc[3, ['col_0', 'col_1', 'col_2']] = 'Year'
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
3,Year,Year,Year,2011.0,2016.0
6,State,All ages,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
7,State,All ages,Irish Speakers (Number),1774437.0,1761420.0
8,State,All ages,Non-Irish Speakers (Number),2507312.0,2667945.0
9,State,All ages,Not Stated (Number),88882.0,139896.0


In [18]:
# Pivot with the pivot_table() method so that each (col_0, col_1, col_2) label
# combination becomes a column
df = df.pivot_table(columns=['col_0', 'col_1', 'col_2'])
df.head()

col_0,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,...,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Year
col_1,10 - 14 years,10 - 14 years,10 - 14 years,10 - 14 years,10 - 14 years,15 - 19 years,15 - 19 years,15 - 19 years,15 - 19 years,15 - 19 years,...,85 years and over,85 years and over,85 years and over,85 years and over,All ages,All ages,All ages,All ages,All ages,Year
col_2,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),...,Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Year
col_3,6222.0,88.0,765.0,86.0,7073.0,5520.0,82.6,1089.0,71.0,6680.0,...,53.8,6.0,0.0,13.0,1271.0,75.1,400.0,22.0,1693.0,2011.0
col_4,5820.0,87.1,739.0,124.0,6683.0,5641.0,83.9,985.0,96.0,6722.0,...,56.5,8.0,2.0,23.0,1286.0,74.0,433.0,19.0,1738.0,2016.0


In [19]:
# Promote the ('Year', 'Year', 'Year') column to the DataFrame index, name the
# index 'Year' and store the years as integers.
df = df.set_index(('Year', 'Year', 'Year'))
df.index = df.index.rename('Year').astype('int')
df.head()

col_0,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,...,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas,Waterford Gaeltacht Areas
col_1,10 - 14 years,10 - 14 years,10 - 14 years,10 - 14 years,10 - 14 years,15 - 19 years,15 - 19 years,15 - 19 years,15 - 19 years,15 - 19 years,...,85 years and over,85 years and over,85 years and over,85 years and over,85 years and over,All ages,All ages,All ages,All ages,All ages
col_2,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),...,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number)
Year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2011,6222.0,88.0,765.0,86.0,7073.0,5520.0,82.6,1089.0,71.0,6680.0,...,7.0,53.8,6.0,0.0,13.0,1271.0,75.1,400.0,22.0,1693.0
2016,5820.0,87.1,739.0,124.0,6683.0,5641.0,83.9,985.0,96.0,6722.0,...,13.0,56.5,8.0,2.0,23.0,1286.0,74.0,433.0,19.0,1738.0


In [20]:
# Stack the outermost column level twice (col_0, then col_1) so those labels
# move into a row MultiIndex and only the col_2 level remains as columns.
df = df.stack(level=0).stack(level=0)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,col_2,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number)
Year,col_0,col_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,All Gaeltacht Areas,10 - 14 years,6222.0,88.0,765.0,86.0,7073.0
2011,All Gaeltacht Areas,15 - 19 years,5520.0,82.6,1089.0,71.0,6680.0
2011,All Gaeltacht Areas,20 - 24 years,3873.0,68.2,1727.0,79.0,5679.0
2011,All Gaeltacht Areas,25 - 29 years,3609.0,58.0,2534.0,76.0,6219.0
2011,All Gaeltacht Areas,3 - 4 years,1410.0,50.3,1279.0,112.0,2801.0


In [21]:
# Move the col_0 and col_1 levels of the MultiIndex back out into ordinary
# columns with reset_index(), leaving Year as the only index level.
df = df.reset_index(level=['col_0', 'col_1'])
df

col_2,col_0,col_1,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,All Gaeltacht Areas,10 - 14 years,6222.0,88.0,765.0,86.0,7073.0
2011,All Gaeltacht Areas,15 - 19 years,5520.0,82.6,1089.0,71.0,6680.0
2011,All Gaeltacht Areas,20 - 24 years,3873.0,68.2,1727.0,79.0,5679.0
2011,All Gaeltacht Areas,25 - 29 years,3609.0,58.0,2534.0,76.0,6219.0
2011,All Gaeltacht Areas,3 - 4 years,1410.0,50.3,1279.0,112.0,2801.0
...,...,...,...,...,...,...,...
2016,Waterford Gaeltacht Areas,70 - 74 years,30.0,50.8,29.0,0.0,59.0
2016,Waterford Gaeltacht Areas,75 - 79 years,29.0,64.4,13.0,3.0,45.0
2016,Waterford Gaeltacht Areas,80 - 84 years,21.0,61.8,12.0,1.0,34.0
2016,Waterford Gaeltacht Areas,85 years and over,13.0,56.5,8.0,2.0,23.0


In [22]:
# Give the columns (and the column axis itself) more helpful names
friendly_names = {
    'col_0': 'Gaeltacht Area',
    'col_1': 'Age Group',
    'Irish Speakers (Number)': 'Irish Speakers',
    'Irish speakers as a percentage of total (%)': 'Irish Speakers(%)',
    'Non-Irish Speakers (Number)': 'Non-Irish Speakers',
    'Not Stated (Number)': 'Not Stated',
    'Population Aged 3 Years and Over (Number)': 'Population',
}
df = df.rename(columns=friendly_names)
df.columns.name = 'Statistics'
# The final DataFrame
df.head()

Statistics,Gaeltacht Area,Age Group,Irish Speakers,Irish Speakers(%),Non-Irish Speakers,Not Stated,Population
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,All Gaeltacht Areas,10 - 14 years,6222.0,88.0,765.0,86.0,7073.0
2011,All Gaeltacht Areas,15 - 19 years,5520.0,82.6,1089.0,71.0,6680.0
2011,All Gaeltacht Areas,20 - 24 years,3873.0,68.2,1727.0,79.0,5679.0
2011,All Gaeltacht Areas,25 - 29 years,3609.0,58.0,2534.0,76.0,6219.0
2011,All Gaeltacht Areas,3 - 4 years,1410.0,50.3,1279.0,112.0,2801.0


In [23]:
# Sanity check: count the missing values in each column
df.isna().sum()

Statistics
Gaeltacht Area        0
Age Group             0
Irish Speakers        0
Irish Speakers(%)     0
Non-Irish Speakers    0
Not Stated            0
Population            0
dtype: int64

In [24]:
# Save the cleaned table; the Year index is written out as the first column
df.to_csv('../clean_data/AgeGroup.csv', index=True)