# Processing real data with comprehensions and enumerators

In [27]:
import csv

In [28]:
# Read every row of the CSV file (header row included) into a list of string lists.
# newline="" hands newline handling over to the csv module, as its documentation
# requires for file objects passed to csv.reader.
with open("data/pop_year.csv", newline="") as f:
    reader = csv.reader(f)
    raw_data = list(reader)

In [29]:
# Show the parsed rows: the header comes first and every value is still a
# (whitespace-padded) string at this point
raw_data

[['year', 'total', 'male', 'female'],
 ['  1960', '7077190 ', '3633812 ', '3443378 '],
 ['  1965', '7907113 ', '4052583 ', '3854530 '],
 ['  1970', '8603165 ', '4410996 ', '4192169 '],
 ['  1975', '9365972 ', '4782387 ', '4583585 '],
 ['  1980', '9693907 ', '4899368 ', '4794539 '],
 ['  1985', '10138642 ', '5107555 ', '5031087 '],
 ['  1990', '10662148 ', '5364342 ', '5297806 '],
 ['  1991', '10756829 ', '5410665 ', '5346164 '],
 ['  1992', '10829320 ', '5444568 ', '5384752 '],
 ['  1993', '10895987 ', '5476547 ', '5419440 '],
 ['  1994', '10912924 ', '5483699 ', '5429225 '],
 ['  1995', '10947119 ', '5495292 ', '5451827 '],
 ['  1996', '10983326 ', '5501084 ', '5482242 '],
 ['  1997', '11033993 ', '5523195 ', '5510798 '],
 ['  1998', '11076817 ', '5539960 ', '5536857 '],
 ['  1999', '11113128 ', '5553233 ', '5559895 '],
 ['  2000', '11146203 ', '5580916 ', '5565287 '],
 ['  2001', '11168526 ', '5592762 ', '5575764 '],
 ['  2002 ', '11200388 ', '5608565 ', '5591823 '],
 ['  2003 ', '11

In [30]:
# Skip the header row, then parse every remaining cell as an integer
data = []
for raw_row in raw_data[1:]:
    numeric_row = []
    for cell in raw_row:
        # int() ignores the whitespace that surrounds the digits
        numeric_row.append(int(cell))
    data.append(numeric_row)
data

[[1960, 7077190, 3633812, 3443378],
 [1965, 7907113, 4052583, 3854530],
 [1970, 8603165, 4410996, 4192169],
 [1975, 9365972, 4782387, 4583585],
 [1980, 9693907, 4899368, 4794539],
 [1985, 10138642, 5107555, 5031087],
 [1990, 10662148, 5364342, 5297806],
 [1991, 10756829, 5410665, 5346164],
 [1992, 10829320, 5444568, 5384752],
 [1993, 10895987, 5476547, 5419440],
 [1994, 10912924, 5483699, 5429225],
 [1995, 10947119, 5495292, 5451827],
 [1996, 10983326, 5501084, 5482242],
 [1997, 11033993, 5523195, 5510798],
 [1998, 11076817, 5539960, 5536857],
 [1999, 11113128, 5553233, 5559895],
 [2000, 11146203, 5580916, 5565287],
 [2001, 11168526, 5592762, 5575764],
 [2002, 11200388, 5608565, 5591823],
 [2003, 11215388, 5613301, 5602087],
 [2004, 11217590, 5612160, 5605430],
 [2005, 11218623, 5617802, 5600821],
 [2006, 11202632, 5610040, 5592592],
 [2007, 11188028, 5601472, 5586556],
 [2008, 11173996, 5593875, 5580121],
 [2009, 11174952, 5595132, 5579820],
 [2010, 11167934, 5590326, 5577608],
 [2011

In [31]:
# ... or better: the same conversion as a single nested comprehension
data = [[int(cell) for cell in raw_row] for raw_row in raw_data[1:]]
data

[[1960, 7077190, 3633812, 3443378],
 [1965, 7907113, 4052583, 3854530],
 [1970, 8603165, 4410996, 4192169],
 [1975, 9365972, 4782387, 4583585],
 [1980, 9693907, 4899368, 4794539],
 [1985, 10138642, 5107555, 5031087],
 [1990, 10662148, 5364342, 5297806],
 [1991, 10756829, 5410665, 5346164],
 [1992, 10829320, 5444568, 5384752],
 [1993, 10895987, 5476547, 5419440],
 [1994, 10912924, 5483699, 5429225],
 [1995, 10947119, 5495292, 5451827],
 [1996, 10983326, 5501084, 5482242],
 [1997, 11033993, 5523195, 5510798],
 [1998, 11076817, 5539960, 5536857],
 [1999, 11113128, 5553233, 5559895],
 [2000, 11146203, 5580916, 5565287],
 [2001, 11168526, 5592762, 5575764],
 [2002, 11200388, 5608565, 5591823],
 [2003, 11215388, 5613301, 5602087],
 [2004, 11217590, 5612160, 5605430],
 [2005, 11218623, 5617802, 5600821],
 [2006, 11202632, 5610040, 5592592],
 [2007, 11188028, 5601472, 5586556],
 [2008, 11173996, 5593875, 5580121],
 [2009, 11174952, 5595132, 5579820],
 [2010, 11167934, 5590326, 5577608],
 [2011

In [32]:
# Split the table into one list per column.
# Each comprehension unpacks the full row, so a row that does not have exactly
# four values still raises an error.
all_years = [year for year, _total, _male, _female in data]
all_total = [total for _year, total, _male, _female in data]
all_male = [male for _year, _total, male, _female in data]
all_female = [female for _year, _total, _male, female in data]
print(all_years)

[1960, 1965, 1970, 1975, 1980, 1985, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]


In [33]:
# Check that male + female adds up to the reported total in every row
totals_consistent = all(
    male + female == total
    for male, female, total in zip(all_male, all_female, all_total)
)
print("No Errors" if totals_consistent else "ERRORS")

No Errors


In [34]:
# Largest total population found in any row of the dataset
max(all_total)

11239224

In [35]:
# Largest male population found in any row of the dataset
max(all_male)

5617802

In [36]:
# Smallest male population found in any row of the dataset
min(all_male)

3633812

In [37]:
# Year with the largest total population.
# enumerate() pairs each total with its position, and max() compares the pairs by
# their value. The maximum is bound to a named variable instead of `_`: in IPython,
# assigning to `_` overrides the automatic last-output variable.
idx_of_max, max_total = max(enumerate(all_total), key=lambda pair: pair[1])

max_total, all_years[idx_of_max]

(11239224, 2016)

In [38]:
# Largest absolute change in male population between two consecutive records
max(abs(later - earlier) for earlier, later in zip(all_male, all_male[1:]))

418771

In [39]:
# Between which consecutive years did the male population change the most?
# Each item is (absolute change, (earlier year, later year)); max() compares the
# items by the change only.
male_pairs = zip(all_male, all_male[1:])
year_pairs = zip(all_years, all_years[1:])
all_delta = (
    (abs(v - v2), years)
    for (v, v2), years in zip(male_pairs, year_pairs)
)
max(all_delta, key=lambda x: x[0])

(418771, (1960, 1965))

In [40]:
# What is the largest gap between the male and female population within one row?
max(abs(male - female) for male, female in zip(all_male, all_female))

218827