In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read in the csv file. Because the file is badly formatted, the column names are
# specified explicitly (col_0 ... col_20) instead of letting pandas infer them,
# and the first two lines of the file are skipped.
column_names = ['col_{}'.format(idx) for idx in range(21)]

Province = pd.read_csv('raw_data/Province.csv', skiprows=2, names=column_names)

In [5]:
# Preview the first five rows of the freshly loaded frame
Province.head(n=5)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
0,,,1861.0,1871.0,1881.0,1891.0,1901.0,1911.0,1926.0,1936.0,...,1961.0,1971.0,1981.0,1986.0,1991.0,1996.0,2002.0,2006.0,2011.0,2016.0
1,State,,,,,,,,,,...,,,,,,,,,,
2,,Population of Irish Speakers (Number),1077087.0,804547.0,924781.0,664387.0,619710.0,553717.0,540802.0,666601.0,...,716420.0,789429.0,1018413.0,1042701.0,1095830.0,1430205.0,1570894.0,1656790.0,1774437.0,1761420.0
3,,Population of Non-Irish Speakers (Number),3325024.0,3248640.0,2945239.0,2804307.0,2602113.0,2585971.0,2261650.0,2140324.0,...,1919398.0,1998019.0,2208054.0,2310931.0,2271176.0,2049443.0,2180101.0,2400856.0,2596194.0,2807841.0
4,,Percentage of Irish Speakers (%),24.5,19.8,23.9,19.2,19.2,17.6,19.3,23.7,...,27.2,28.3,31.6,31.1,32.5,41.1,41.9,40.8,40.6,38.5


In [109]:
# The entries in col_0 and col_1 will be used to create features, so start by
# listing the distinct values found in col_1
Province.loc[:, 'col_1'].unique()

array([' ', nan, 'Population of Irish Speakers (Number)',
       'Population of Non-Irish Speakers (Number)',
       'Percentage of Irish Speakers (%)'], dtype=object)

In [110]:
# Distinct values found in col_0
Province.loc[:, 'col_0'].unique()

array([' ', 'State', 'Leinster', 'Munster', 'Connacht', 'Ulster (pt)'],
      dtype=object)

In [195]:
# Tidy up the empty and null values:
#   1. label the two blank leading cells of the first row as 'Year';
#   2. turn every whitespace-only entry into np.nan so that NaN-aware methods
#      such as fillna() can be used afterwards.

# Step 1: first row, first two columns
Province.iloc[0, :2] = 'Year'

# Step 2: whitespace-only strings -> NaN
Province = Province.replace(r'^\s+$', np.nan, regex=True)

Province.head(n=5)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
0,Year,Year,1861.0,1871.0,1881.0,1891.0,1901.0,1911.0,1926.0,1936.0,...,1961.0,1971.0,1981.0,1986.0,1991.0,1996.0,2002.0,2006.0,2011.0,2016.0
1,State,,,,,,,,,,...,,,,,,,,,,
2,,Population of Irish Speakers (Number),1077087.0,804547.0,924781.0,664387.0,619710.0,553717.0,540802.0,666601.0,...,716420.0,789429.0,1018413.0,1042701.0,1095830.0,1430205.0,1570894.0,1656790.0,1774437.0,1761420.0
3,,Population of Non-Irish Speakers (Number),3325024.0,3248640.0,2945239.0,2804307.0,2602113.0,2585971.0,2261650.0,2140324.0,...,1919398.0,1998019.0,2208054.0,2310931.0,2271176.0,2049443.0,2180101.0,2400856.0,2596194.0,2807841.0
4,,Percentage of Irish Speakers (%),24.5,19.8,23.9,19.2,19.2,17.6,19.3,23.7,...,27.2,28.3,31.6,31.1,32.5,41.1,41.9,40.8,40.6,38.5


In [196]:
# Forward-fill col_0 so that every NaN is replaced with the last valid entry above it.
# The filled Series is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on the Series returned by Province['col_0'] is chained
# in-place mutation, which newer pandas warns about and which does not update the
# frame under Copy-on-Write. fillna(method='ffill') is deprecated as well; ffill()
# is the direct equivalent.
Province['col_0'] = Province['col_0'].ffill()

Province.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
0,Year,Year,1861.0,1871.0,1881.0,1891.0,1901.0,1911.0,1926.0,1936.0,...,1961.0,1971.0,1981.0,1986.0,1991.0,1996.0,2002.0,2006.0,2011.0,2016.0
1,State,,,,,,,,,,...,,,,,,,,,,
2,State,Population of Irish Speakers (Number),1077087.0,804547.0,924781.0,664387.0,619710.0,553717.0,540802.0,666601.0,...,716420.0,789429.0,1018413.0,1042701.0,1095830.0,1430205.0,1570894.0,1656790.0,1774437.0,1761420.0
3,State,Population of Non-Irish Speakers (Number),3325024.0,3248640.0,2945239.0,2804307.0,2602113.0,2585971.0,2261650.0,2140324.0,...,1919398.0,1998019.0,2208054.0,2310931.0,2271176.0,2049443.0,2180101.0,2400856.0,2596194.0,2807841.0
4,State,Percentage of Irish Speakers (%),24.5,19.8,23.9,19.2,19.2,17.6,19.3,23.7,...,27.2,28.3,31.6,31.1,32.5,41.1,41.9,40.8,40.6,38.5


In [197]:
# Last clean-up step: drop the rows full of NaN, identified by a missing col_1
col_1_missing = pd.isnull(Province['col_1'])
Province = Province.drop(index=Province.loc[col_1_missing].index)
Province.head(n=5)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
0,Year,Year,1861.0,1871.0,1881.0,1891.0,1901.0,1911.0,1926.0,1936.0,...,1961.0,1971.0,1981.0,1986.0,1991.0,1996.0,2002.0,2006.0,2011.0,2016.0
2,State,Population of Irish Speakers (Number),1077087.0,804547.0,924781.0,664387.0,619710.0,553717.0,540802.0,666601.0,...,716420.0,789429.0,1018413.0,1042701.0,1095830.0,1430205.0,1570894.0,1656790.0,1774437.0,1761420.0
3,State,Population of Non-Irish Speakers (Number),3325024.0,3248640.0,2945239.0,2804307.0,2602113.0,2585971.0,2261650.0,2140324.0,...,1919398.0,1998019.0,2208054.0,2310931.0,2271176.0,2049443.0,2180101.0,2400856.0,2596194.0,2807841.0
4,State,Percentage of Irish Speakers (%),24.5,19.8,23.9,19.2,19.2,17.6,19.3,23.7,...,27.2,28.3,31.6,31.1,32.5,41.1,41.9,40.8,40.6,38.5
6,Leinster,Population of Irish Speakers (Number),35704.0,16247.0,27452.0,13677.0,26436.0,40225.0,101102.0,183378.0,...,274644.0,341702.0,473225.0,480227.0,511639.0,689703.0,768404.0,823555.0,890834.0,897357.0


In [120]:
# Looks like we have a clean DataFrame now. Let's confirm no NaNs are left.
# Count the True entries of the null mask. Indexing the frame with its own null
# mask and summing the result is not a valid check: the selected cells are all NaN
# and sum() skips NaN, so that expression yields 0 however many nulls exist.
null_count = pd.isnull(Province).sum().sum()
print('Number of Null Values: {}'.format(null_count))

Number of Null Values: 0.0


In [198]:
# With the frame clean, pivot it so that the entries in the first two columns
# (col_0, col_1) become the column labels, i.e. the features
Province = Province.pivot_table(columns=['col_0', 'col_1'])
Province.head(n=5)

col_0,Connacht,Connacht,Connacht,Leinster,Leinster,Leinster,Munster,Munster,Munster,State,State,State,Ulster (pt),Ulster (pt),Ulster (pt),Year
col_1,Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Year
col_10,33.2,154187.0,309638.0,15.1,180755.0,1017491.0,22.0,189395.0,672660.0,21.2,588725.0,2182932.0,26.0,64388.0,183143.0,1946.0
col_11,37.6,148708.0,246592.0,22.2,274644.0,964383.0,28.7,228726.0,567613.0,27.2,716420.0,1919398.0,31.4,64342.0,140810.0,1961.0
col_12,37.2,137372.0,231960.0,24.5,341702.0,1055160.0,30.6,252805.0,573308.0,28.3,789429.0,1998019.0,29.5,57550.0,137591.0,1971.0
col_13,38.8,155134.0,244264.0,28.2,473225.0,1202292.0,34.6,323704.0,612526.0,31.6,1018413.0,2208054.0,30.8,66350.0,148972.0,1981.0
col_14,38.7,158386.0,250474.0,27.4,480227.0,1274353.0,34.8,337043.0,630434.0,31.1,1042701.0,2310931.0,30.1,67045.0,155670.0,1986.0


In [199]:
# Use the 'Year' column as the index: cast it to int, make it the index,
# sort in ascending order and give the index a plain 'Year' name
year_key = ('Year', 'Year')
Province[year_key] = Province[year_key].astype('int')
Province = Province.set_index(year_key).sort_index()
Province.index.name = 'Year'
Province.head(n=5)

col_0,Connacht,Connacht,Connacht,Leinster,Leinster,Leinster,Munster,Munster,Munster,State,State,State,Ulster (pt),Ulster (pt),Ulster (pt)
col_1,Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number)
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1861,44.8,409482.0,503653.0,2.4,35704.0,1421931.0,36.0,545531.0,968027.0,24.5,1077087.0,3325024.0,16.7,86370.0,431413.0
1871,39.0,330211.0,516002.0,1.2,16247.0,1323204.0,27.7,386494.0,1006991.0,19.8,804547.0,3248640.0,15.1,71595.0,402443.0
1881,44.6,366191.0,455466.0,2.1,27452.0,1251537.0,33.5,445766.0,885349.0,23.9,924781.0,2945239.0,19.5,85372.0,352887.0
1891,37.9,274783.0,449991.0,1.2,13677.0,1174083.0,26.2,307633.0,864769.0,19.2,664387.0,2804307.0,17.8,68294.0,315464.0
1901,38.0,245580.0,401352.0,2.3,26436.0,1126393.0,25.7,276268.0,799920.0,19.2,619710.0,2602113.0,20.7,71426.0,274448.0


In [200]:
# Give the two column levels descriptive names
Province = Province.rename_axis(columns=['Province', 'Statistic'])
Province.head(n=5)

Province,Connacht,Connacht,Connacht,Leinster,Leinster,Leinster,Munster,Munster,Munster,State,State,State,Ulster (pt),Ulster (pt),Ulster (pt)
Statistic,Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number),Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number)
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1861,44.8,409482.0,503653.0,2.4,35704.0,1421931.0,36.0,545531.0,968027.0,24.5,1077087.0,3325024.0,16.7,86370.0,431413.0
1871,39.0,330211.0,516002.0,1.2,16247.0,1323204.0,27.7,386494.0,1006991.0,19.8,804547.0,3248640.0,15.1,71595.0,402443.0
1881,44.6,366191.0,455466.0,2.1,27452.0,1251537.0,33.5,445766.0,885349.0,23.9,924781.0,2945239.0,19.5,85372.0,352887.0
1891,37.9,274783.0,449991.0,1.2,13677.0,1174083.0,26.2,307633.0,864769.0,19.2,664387.0,2804307.0,17.8,68294.0,315464.0
1901,38.0,245580.0,401352.0,2.3,26436.0,1126393.0,25.7,276268.0,799920.0,19.2,619710.0,2602113.0,20.7,71426.0,274448.0


In [201]:
# Turn the outer level of the column labels (Province) into a regular column:
# stack() moves that level into the row MultiIndex, and reset_index() on the
# innermost index level then converts it to a column
Province = Province.stack(level=0).reset_index(level=-1)

# Our final DataFrame
Province.head(n=5)

Statistic,Province,Percentage of Irish Speakers (%),Population of Irish Speakers (Number),Population of Non-Irish Speakers (Number)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1861,Connacht,44.8,409482.0,503653.0
1861,Leinster,2.4,35704.0,1421931.0
1861,Munster,36.0,545531.0,968027.0
1861,State,24.5,1077087.0,3325024.0
1861,Ulster (pt),16.7,86370.0,431413.0


In [202]:
# Shorten the statistic column labels ('Province' maps to itself and is left as is)
short_labels = {
    'Province': 'Province',
    'Percentage of Irish Speakers (%)': 'Irish Speakers(%)',
    'Population of Irish Speakers (Number)': 'Irish Speakers',
    'Population of Non-Irish Speakers (Number)': 'Non-Irish Speakers',
}
Province = Province.rename(columns=short_labels)

# Total population is the sum of the two speaker counts
irish_speakers = Province['Irish Speakers']
non_irish_speakers = Province['Non-Irish Speakers']
Province['Total Population'] = irish_speakers + non_irish_speakers
Province.head(n=5)

Statistic,Province,Irish Speakers(%),Irish Speakers,Non-Irish Speakers,Total Population
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1861,Connacht,44.8,409482.0,503653.0,913135.0
1861,Leinster,2.4,35704.0,1421931.0,1457635.0
1861,Munster,36.0,545531.0,968027.0,1513558.0
1861,State,24.5,1077087.0,3325024.0,4402111.0
1861,Ulster (pt),16.7,86370.0,431413.0,517783.0


In [203]:
# Write the cleaned frame (including its Year index) to disk
clean_csv_path = '../clean_data/Province.csv'
Province.to_csv(clean_csv_path)