In [2]:
import pandas as pd
import numpy as np

In [121]:
# Read in the CSV file. Because the file is badly formatted, the column names
# are specified explicitly instead of letting pandas try to infer them.
column_names = ['col_{}'.format(position) for position in range(6)]

df = pd.read_csv('raw_data/Socio Economic Sex Gaeltacht.csv', names=column_names)

In [118]:
# Preview the first ten rows of the raw table
df.head(n=10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
0,Irish Speakers and Non-Irish Speakers Aged 3 Y...,,,,,
1,"2016 by Gaeltacht Areas, Sex, Socio Economic G...",,,,,
2,indicator and Census Year,,,,,
3,,,,,2011.0,2016.0
4,State,,,,,
5,,Both sexes,,,,
6,,,All socio-economic groups,,,
7,,,,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
8,,,,Irish Speakers (Number),1774437.0,1761420.0
9,,,,Non-Irish Speakers (Number),2507312.0,2667945.0


In [122]:
# Remove the three leading rows (labels 0, 1 and 2); they only hold the table's title text
df.drop(index=[0, 1, 2], inplace=True)
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
3,,,,,2011.0,2016.0
4,State,,,,,
5,,Both sexes,,,,
6,,,All socio-economic groups,,,
7,,,,Population Aged 3 Years and Over (Number),4370631.0,4569261.0


In [108]:
# The entries in col_0, col_1, col_2 and col_3 will be turned into features.
# Start by listing the distinct values found in col_0.
pd.unique(df['col_0'])

array([' ', 'State', 'All Gaeltacht Areas',
       'When excluding not stated, people with an ability to speak Irish made ',
       'up 39.8% of the population in 2016 '], dtype=object)

In [16]:
# Distinct values found in col_1
pd.unique(df['col_1'])

array([' ', nan, 'Both sexes', 'Male', 'Female'], dtype=object)

In [17]:
# Distinct values found in col_2
pd.unique(df['col_2'])

array([' ', nan, 'All socio-economic groups', 'A. Employers and managers',
       'B. Higher professional', 'C. Lower professional', 'D. Non-manual',
       'E. Manual skilled', 'F. Semi-skilled', 'G. Unskilled',
       'H. Own account workers', 'I. Farmers', 'J. Agricultural workers',
       'Z. All others gainfully occupied and unknown'], dtype=object)

In [18]:
# Distinct values found in col_3
pd.unique(df['col_3'])

array([' ', nan, 'Population Aged 3 Years and Over (Number)',
       'Irish Speakers (Number)', 'Non-Irish Speakers (Number)',
       'Not Stated (Number)',
       'Irish speakers as a percentage of total (%)'], dtype=object)

In [123]:
# Turn every empty or whitespace-only cell into np.nan so that the gaps can be
# filled in afterwards with pandas' missing-value tools
blank_cell = r'^\s*$'
df.replace(regex=blank_cell, value=np.nan, inplace=True)
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
3,,,,,2011.0,2016.0
4,State,,,,,
5,,Both sexes,,,,
6,,,All socio-economic groups,,,
7,,,,Population Aged 3 Years and Over (Number),4370631.0,4569261.0


In [124]:
# Forward-fill the NaN values in the four label columns, so that each row
# carries the most recent label seen above it in every one of those columns.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna() is deprecated in recent pandas releases.
label_columns = ['col_0', 'col_1', 'col_2', 'col_3']
df[label_columns] = df[label_columns].ffill()
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
3,,,,,2011.0,2016.0
4,State,,,,,
5,State,Both sexes,,,,
6,State,Both sexes,All socio-economic groups,,,
7,State,Both sexes,All socio-economic groups,Population Aged 3 Years and Over (Number),4370631.0,4569261.0


In [125]:
# Drop the rows that carry labels but no figures, i.e. rows where both value
# columns (col_4 and col_5) are NaN. Rows 4, 5 and 6 shown above are of this kind.
# Selecting them by content rather than by the hard-coded labels 4, 5 and 6 also
# removes any further label-only rows lower down in the table (presumably the
# later section headers and the trailing footnote text - verify against the raw
# file), and the cell no longer raises a KeyError when it is run a second time.
# Row 3 is kept because it holds the two census years in col_4 and col_5.
df.dropna(subset=['col_4', 'col_5'], how='all', inplace=True)
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
3,,,,,2011.0,2016.0
7,State,Both sexes,All socio-economic groups,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
8,State,Both sexes,All socio-economic groups,Irish Speakers (Number),1774437.0,1761420.0
9,State,Both sexes,All socio-economic groups,Non-Irish Speakers (Number),2507312.0,2667945.0
10,State,Both sexes,All socio-economic groups,Not Stated (Number),88882.0,139896.0


In [126]:
# Row 3 holds the census years but has no labels of its own: write 'Year'
# into each of its four label cells by hand
df.loc[3, ['col_0', 'col_1', 'col_2', 'col_3']] = 'Year'
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
3,Year,Year,Year,Year,2011.0,2016.0
7,State,Both sexes,All socio-economic groups,Population Aged 3 Years and Over (Number),4370631.0,4569261.0
8,State,Both sexes,All socio-economic groups,Irish Speakers (Number),1774437.0,1761420.0
9,State,Both sexes,All socio-economic groups,Non-Irish Speakers (Number),2507312.0,2667945.0
10,State,Both sexes,All socio-economic groups,Not Stated (Number),88882.0,139896.0


In [127]:
# Pivot the table so that every (col_0, col_1, col_2, col_3) label combination
# becomes a column. No `values` or `aggfunc` is passed, so pandas' defaults
# apply; the remaining columns (col_4 and col_5) end up as the rows.
label_levels = ['col_0', 'col_1', 'col_2', 'col_3']
df = df.pivot_table(columns=label_levels)
df.head()

col_0,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,...,State,State,State,State,State,State,State,State,State,Year
col_1,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,...,Male,Male,Male,Male,Male,Male,Male,Male,Male,Year
col_2,A. Employers and managers,A. Employers and managers,A. Employers and managers,A. Employers and managers,A. Employers and managers,All socio-economic groups,All socio-economic groups,All socio-economic groups,All socio-economic groups,All socio-economic groups,...,J. Agricultural workers,J. Agricultural workers,J. Agricultural workers,J. Agricultural workers,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Year
col_3,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),...,Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Year
col_4,8716.0,70.0,3660.0,84.0,12460.0,66238.0,68.5,29114.0,1276.0,96628.0,...,25.0,11342.0,177.0,15358.0,94343.0,27.0,225172.0,29309.0,348824.0,2011.0
col_5,8138.0,67.6,3824.0,77.0,12039.0,63664.0,66.3,30723.0,1703.0,96090.0,...,23.9,11295.0,187.0,15097.0,85525.0,23.4,225338.0,55095.0,365958.0,2016.0


In [128]:
# Use the Year column as the index of the DataFrame. After the pivot its
# column key is the tuple ('Year', 'Year', 'Year', 'Year').
year_column = ('Year',) * 4
df.set_index(year_column, inplace=True)
# Convert the year values to integers and give the index the plain name 'Year'
df.index = df.index.astype('int').rename('Year')
df.head()

col_0,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,All Gaeltacht Areas,...,State,State,State,State,State,State,State,State,State,State
col_1,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,Both sexes,...,Male,Male,Male,Male,Male,Male,Male,Male,Male,Male
col_2,A. Employers and managers,A. Employers and managers,A. Employers and managers,A. Employers and managers,A. Employers and managers,All socio-economic groups,All socio-economic groups,All socio-economic groups,All socio-economic groups,All socio-economic groups,...,J. Agricultural workers,J. Agricultural workers,J. Agricultural workers,J. Agricultural workers,J. Agricultural workers,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown,Z. All others gainfully occupied and unknown
col_3,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),...,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number),Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number)
Year,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
2011,8716.0,70.0,3660.0,84.0,12460.0,66238.0,68.5,29114.0,1276.0,96628.0,...,3839.0,25.0,11342.0,177.0,15358.0,94343.0,27.0,225172.0,29309.0,348824.0
2016,8138.0,67.6,3824.0,77.0,12039.0,63664.0,66.3,30723.0,1703.0,96090.0,...,3615.0,23.9,11295.0,187.0,15097.0,85525.0,23.4,225338.0,55095.0,365958.0


In [130]:
# Use stack() to move the multilevel column names into a row MultiIndex.
# Each pass moves the current outermost column level (col_0, then col_1,
# then col_2) into the index; col_3 stays behind as the column labels.
for stack_pass in range(3):
    df = df.stack(level=0)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,col_3,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number)
Year,col_0,col_1,col_2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011,All Gaeltacht Areas,Both sexes,A. Employers and managers,8716.0,70.0,3660.0,84.0,12460.0
2011,All Gaeltacht Areas,Both sexes,All socio-economic groups,66238.0,68.5,29114.0,1276.0,96628.0
2011,All Gaeltacht Areas,Both sexes,B. Higher professional,3632.0,67.5,1719.0,30.0,5381.0
2011,All Gaeltacht Areas,Both sexes,C. Lower professional,9641.0,77.3,2769.0,57.0,12467.0
2011,All Gaeltacht Areas,Both sexes,D. Non-manual,10630.0,67.2,5082.0,109.0,15821.0


In [131]:
# Use reset_index() to turn the col_0, col_1 and col_2 levels of the MultiIndex
# back into ordinary columns, leaving Year as the only index level
df.reset_index(level=['col_0', 'col_1', 'col_2'], inplace=True)
df

col_3,col_0,col_1,col_2,Irish Speakers (Number),Irish speakers as a percentage of total (%),Non-Irish Speakers (Number),Not Stated (Number),Population Aged 3 Years and Over (Number)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011,All Gaeltacht Areas,Both sexes,A. Employers and managers,8716.0,70.0,3660.0,84.0,12460.0
2011,All Gaeltacht Areas,Both sexes,All socio-economic groups,66238.0,68.5,29114.0,1276.0,96628.0
2011,All Gaeltacht Areas,Both sexes,B. Higher professional,3632.0,67.5,1719.0,30.0,5381.0
2011,All Gaeltacht Areas,Both sexes,C. Lower professional,9641.0,77.3,2769.0,57.0,12467.0
2011,All Gaeltacht Areas,Both sexes,D. Non-manual,10630.0,67.2,5082.0,109.0,15821.0
...,...,...,...,...,...,...,...,...
2016,State,Male,G. Unskilled,17992.0,22.2,61753.0,1219.0,80964.0
2016,State,Male,H. Own account workers,33338.0,29.2,79692.0,1025.0,114055.0
2016,State,Male,I. Farmers,35946.0,33.8,69047.0,1416.0,106409.0
2016,State,Male,J. Agricultural workers,3615.0,23.9,11295.0,187.0,15097.0


In [132]:
# Give the columns more helpful names and label the column axis 'Statistics'
friendly_names = {
    'col_0': 'Area',
    'col_1': 'Sex',
    'col_2': 'SocioEconomic Class',
    'Irish Speakers (Number)': 'Irish Speakers',
    'Irish speakers as a percentage of total (%)': 'Irish Speakers(%)',
    'Non-Irish Speakers (Number)': 'Non-Irish Speakers',
    'Not Stated (Number)': 'Not Stated',
    'Population Aged 3 Years and Over (Number)': 'Population',
}
df.rename(columns=friendly_names, inplace=True)
df.columns.name = 'Statistics'
# The final DataFrame
df.head()

Statistics,Area,Sex,SocioEconomic Class,Irish Speakers,Irish Speakers(%),Non-Irish Speakers,Not Stated,Population
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011,All Gaeltacht Areas,Both sexes,A. Employers and managers,8716.0,70.0,3660.0,84.0,12460.0
2011,All Gaeltacht Areas,Both sexes,All socio-economic groups,66238.0,68.5,29114.0,1276.0,96628.0
2011,All Gaeltacht Areas,Both sexes,B. Higher professional,3632.0,67.5,1719.0,30.0,5381.0
2011,All Gaeltacht Areas,Both sexes,C. Lower professional,9641.0,77.3,2769.0,57.0,12467.0
2011,All Gaeltacht Areas,Both sexes,D. Non-manual,10630.0,67.2,5082.0,109.0,15821.0


In [137]:
# Count the missing values in each column of the final DataFrame
df.isna().sum()

Statistics
Area                   0
Sex                    0
SocioEconomic Class    0
Irish Speakers         0
Irish Speakers(%)      0
Non-Irish Speakers     0
Not Stated             0
Population             0
dtype: int64

In [138]:
# Save the cleaned table as CSV. With to_csv's default settings the Year index
# is written out as the first column.
# NOTE(review): the raw file was read from 'raw_data/' relative to the working
# directory, while this writes to '../clean_data/' one level up - confirm that
# both relative paths match the intended directory layout.
output_path = '../clean_data/SocioEconomic.csv'
df.to_csv(output_path)