# Popultaion analysis 

In [249]:
# Importing necessary libraries

# testing git

import seaborn as sns 
import pandas as pd

In [250]:
df = pd.read_csv('population_data.csv')

In [251]:
df.head(5)

# An inital look at the data highlights some redundant columns; namely 'STATISTIC Label','Sex' and 'UNIT' which are identical 
# for each row and as such do not add any analytical value.

# A NaN value is also observed in row 1, column 'UNIT' which must be explored further. 

# Data repetition is also seen in the 'Age Group' column, with data redunacy seen with age groupings, i.e. the column 
# '0 - 14 years' is the culmination of the columns 'Under 1 year', '1 - 4 years', '5 - 9 years' and '10 - 14 years'.

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,UNIT,VALUE
0,Population Estimates (Persons in April),1950,Under 1 year,Both sexes,Thousand,61.1
1,Population Estimates (Persons in April),1950,0 - 4 years,Both sexes,Thousand,
2,Population Estimates (Persons in April),1950,0 - 14 years,Both sexes,Thousand,851.2
3,Population Estimates (Persons in April),1950,1 - 4 years,Both sexes,Thousand,249.1
4,Population Estimates (Persons in April),1950,5 - 9 years,Both sexes,Thousand,278.9


In [252]:
df.info() 

# This function demonstrates that all columns with the exception of the 'VALUE' column have data for each row. #
# This function only highlights NaN values and does not speak to data quality or format.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1776 entries, 0 to 1775
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  1776 non-null   object 
 1   Year             1776 non-null   int64  
 2   Age Group        1776 non-null   object 
 3   Sex              1776 non-null   object 
 4   UNIT             1776 non-null   object 
 5   VALUE            1738 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 83.4+ KB


In [253]:
df['STATISTIC Label'].unique()
df['UNIT'].unique()
df['Sex'].unique()

# From the df.head() analysis above, it appeared that the columns 'STATISTIC Label','Sex' and 'UNIT' were column constants and  
#  as such would be of little analytial value; however before dropping these rows, the above code was run to ensure there was  
#  no additional data in these columns. Once this was confirmed, the columns were dropped. 

irrelivant_columns = ['STATISTIC Label','UNIT', 'Sex']
df.drop(columns=irrelivant_columns, inplace=True)

In [254]:
df.head(3) # Confirms columns were successfully dropped.

Unnamed: 0,Year,Age Group,VALUE
0,1950,Under 1 year,61.1
1,1950,0 - 4 years,
2,1950,0 - 14 years,851.2


In [255]:
# Once the constant values were removed, null values were addressed. 

null_values = df['VALUE'].isnull()
null_df = df[null_values]
#null_df.tail(5)
null_df.tail(5)

Unnamed: 0,Year,Age Group,VALUE
961,1990,0 - 4 years,
1009,1992,0 - 4 years,
1033,1993,0 - 4 years,
1057,1994,0 - 4 years,
1081,1995,0 - 4 years,


The above code suggestests that before 1995, census data was not taken for individuals aged 0-4 years old, however this seems ilogical; 

In [259]:
nineteen_fifty_df = df[df['Year'] == '1993']
nineteen_fifty_df

Unnamed: 0,Year,Age Group,VALUE


In [258]:
age_possibilities = df['Year'].unique()
age_possibilities # discussed below 

array([1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023], dtype=int64)

When we demonstrate the age ranges possible, we can see that there is some overlap, for example, '15 - 19 years' and '15 - 24 years'. To deal with the issue of data replication, columns which would result in data replication were removed. 

We have already demonstarted above that there are a number of null values for individuals aged 0-4, however, the same data is contained within the two attributes 'Under 1 year' and '1-4' years. For this reason, the '0-4' column can be dropped. 

In [243]:
# df['Age Group'].value_counts()

In [244]:
df = pd.pivot_table(df, values='VALUE', index='Year', columns='Age Group')
zero_to_five_column = df[('Under 1 year')] + df[('1 - 4 years')]
df['0 - 5 years'] = zero_to_five_column
df

Age Group,0 - 14 years,0 - 4 years,1 - 4 years,10 - 14 years,15 - 19 years,15 - 24 years,20 - 24 years,25 - 29 years,25 - 44 years,30 - 34 years,...,50 - 54 years,55 - 59 years,60 - 64 years,65 - 69 years,70 - 74 years,75 - 79 years,80 - 84 years,85 years and over,Under 1 year,0 - 5 years
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1950,851.2,,249.1,262.1,244.0,452.6,208.6,201.1,773.6,194.8,...,158.4,130.4,121.9,109.3,100.4,64.0,30.1,13.3,61.1,310.2
1951,854.8,312.9,249.3,260.9,241.2,443.4,202.2,198.4,771.2,191.6,...,163.0,128.8,122.1,107.5,100.1,64.6,30.9,13.3,63.6,312.9
1952,859.6,,248.4,264.6,239.7,437.5,197.8,192.9,760.3,188.6,...,160.0,133.8,121.2,107.9,98.8,64.9,31.3,14.0,61.9,310.3
1953,865.3,,248.1,268.6,238.6,432.3,193.7,187.7,750.6,185.8,...,157.3,138.8,120.5,108.6,97.7,65.4,31.6,14.5,60.1,308.2
1954,870.0,,245.8,272.1,237.1,426.5,189.4,182.2,739.7,182.8,...,154.4,143.6,119.7,109.0,96.3,65.8,32.1,15.1,60.1,305.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,1015.5,315.0,254.6,344.7,322.1,623.8,301.7,291.5,1406.0,333.7,...,313.4,288.1,255.9,221.6,187.1,130.3,88.2,74.1,60.4,315.0
2020,1013.7,308.7,250.5,354.7,327.4,637.9,310.5,296.4,1412.0,331.6,...,319.9,296.0,261.6,228.7,192.5,138.5,90.8,75.6,58.2,308.7
2021,1007.6,300.9,245.0,364.0,329.0,640.4,311.4,294.8,1411.9,330.5,...,329.5,301.2,268.5,234.1,196.2,147.2,92.4,77.5,55.9,300.9
2022,1014.3,295.8,238.2,374.9,337.8,649.7,311.9,299.1,1434.2,334.5,...,342.9,309.7,275.8,240.5,204.7,155.0,96.8,84.4,57.7,295.9


In [245]:
#df = df.drop(df[df['Age Group'] == '0 - 4 years'].index)
#df = df.drop(df[df['Age Group'] == 'Under 1 year'].index)

In [246]:
df['Age Group'].value_counts()

KeyError: 'Age Group'

# Machine learning 

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df.shape


In [None]:
age_possibilities = df['Age Group'].unique()
age_possibilities

In the above cell, it is seen that data redundancy is seen, i.e. the column '0 - 14 years' is the culmination of several other columns 'Under 1 year', '1 - 4 years', '5 - 9 years' and '10 - 14 years'. For this reason, the column was dropped. 

In [None]:
df = df.drop(df[df['Age Group'] == '0 - 14 years'].index)
df = df.drop(df[df['Age Group'] == '15 - 24 years'].index)
df = df.drop(df[df['Age Group'] == '25 - 44 years'].index)
df = df.drop(df[df['Age Group'] == '45 - 64 years'].index)

In [None]:
df['Age Group'].unique()

In [261]:
git remote add origin https://github.com/joephelan410/msc_projects.git

SyntaxError: invalid syntax (109470751.py, line 1)

In [262]:

!git add .
!git commit -m "Your commit message"
!git push origin master


On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../.bash_history
	../.conda/
	../.condarc
	../.continuum/
	../.dotnet/
	../.gitconfig
	../.ipynb_checkpoints/
	../.ipython/
	../.jupyter/
	../.librarymanager/
	../.matplotlib/
	../.vscode/
	../3D Objects/
	../AppData/
	../Bootcamp day 1/
	../Contacts/
	../DataVisPrepMSc/
	../Desktop.ipynb
	../Desktop/
	../Documents/
	../Downloads/
	../Favorites/
	../GroupProjectDataWrangling.ipynb
	../Links/
	../Music/
	../NTUSER.DAT
	../NTUSER.DAT{240a98a4-74ca-11ed-b214-d44f7f6f863a}.TM.blf
	../NTUSER.DAT{240a98a4-74ca-11ed-b214-d44f7f6f863a}.TMContainer00000000000000000001.regtrans-ms
	../NTUSER.DAT{240a98a4-74ca-11ed-b214-d44f7f6f863a}.TMContainer00000000000000000002.regtrans-ms
	../OneDrive/
	../Pictures/
	../PycharmProjects/
	../Saved Games/
	../Searches/
	../Statistics/
	../Untitled.ipynb
	../Untitled1.ipynb
	../Untitled2.ipynb
	../Videos/
	../anaconda3/
	../individual_assignment_Joe_phelan.ipynb


fatal: 'origin' does not appear to be a git repository
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.
