# Processing real data with sequences

In [1]:
import csv

In [2]:
# Read every row of the CSV file (header included) as a list of strings.
# newline="" leaves newline handling to the csv module, as its documentation
# recommends for file objects handed to csv.reader.
with open("data/pop_year.csv", newline="") as f:
    reader = csv.reader(f)
    raw_data = list(reader)

In [3]:
raw_data

[['year', 'total', 'male', 'female'],
 ['  1960', '7077190 ', '3633812 ', '3443378 '],
 ['  1965', '7907113 ', '4052583 ', '3854530 '],
 ['  1970', '8603165 ', '4410996 ', '4192169 '],
 ['  1975', '9365972 ', '4782387 ', '4583585 '],
 ['  1980', '9693907 ', '4899368 ', '4794539 '],
 ['  1985', '10138642 ', '5107555 ', '5031087 '],
 ['  1990', '10662148 ', '5364342 ', '5297806 '],
 ['  1991', '10756829 ', '5410665 ', '5346164 '],
 ['  1992', '10829320 ', '5444568 ', '5384752 '],
 ['  1993', '10895987 ', '5476547 ', '5419440 '],
 ['  1994', '10912924 ', '5483699 ', '5429225 '],
 ['  1995', '10947119 ', '5495292 ', '5451827 '],
 ['  1996', '10983326 ', '5501084 ', '5482242 '],
 ['  1997', '11033993 ', '5523195 ', '5510798 '],
 ['  1998', '11076817 ', '5539960 ', '5536857 '],
 ['  1999', '11113128 ', '5553233 ', '5559895 '],
 ['  2000', '11146203 ', '5580916 ', '5565287 '],
 ['  2001', '11168526 ', '5592762 ', '5575764 '],
 ['  2002 ', '11200388 ', '5608565 ', '5591823 '],
 ['  2003 ', '11

In [None]:
# Get the column names: the first row read from the CSV file is the header
column_names = raw_data[0]

In [None]:
# Skip the header row and convert the first four fields of every remaining
# row to integers. int() accepts the blanks that surround the digits in the
# raw strings.
data = [
    [int(row[0]), int(row[1]), int(row[2]), int(row[3])]
    for row in raw_data[1:]
]
data

In [None]:
# Alternative: convert every field of each data row to an integer, however
# many fields the row contains.
data = [list(map(int, row)) for row in raw_data[1:]]
data

In [None]:
# Split the table into one list per column, picking each field by position
all_years = [record[0] for record in data]
all_total = [record[1] for record in data]
all_male = [record[2] for record in data]
all_female = [record[3] for record in data]
print(all_years)

In [None]:
# Alternative: destructure each row into its four fields right in the
# loop header, then file every field into the matching column list.
all_years, all_total, all_male, all_female = [], [], [], []
for year, total, male, female in data:
    all_years.append(year)
    all_total.append(total)
    all_male.append(male)
    all_female.append(female)
print(all_years)

In [None]:
# Verify that male + female equals the reported total in every row.
# Only the first mismatching row (0-based position in the column lists)
# is reported.
first_bad = next(
    (
        idx
        for idx in range(len(all_years))
        if all_male[idx] + all_female[idx] != all_total[idx]
    ),
    None,
)
if first_bad is None:
    print("No errors")
else:
    print("ERROR in line", first_bad)

In [None]:
# Largest total population; stays at 0 when no total is greater than 0
max_pop = max((p for p in all_total if p > 0), default=0)
max_pop

In [None]:
# Alternative: hand the starting value 0 and all totals to max() in one go
max_pop = max([0, *all_total])
max_pop

In [None]:
import sys
# Smallest male population. sys.maxsize is the starting value, so it is
# also the result when there are no values to compare.
min_pop = min([sys.maxsize, *all_male])
min_pop

In [None]:
# Year with the largest total population (the earliest such year on ties;
# None when no total is greater than 0)
max_pop = 0
max_year = None
for idx, year in enumerate(all_years):
    total = all_total[idx]
    if total > max_pop:
        max_pop, max_year = total, year
max_year

In [None]:
# Alternative: locate the position of the largest total, then look up the
# year stored at that position. max() returns the first position holding
# the largest value; default=0 is the position used when there are no rows.
max_idx = max(
    range(len(all_years)),
    key=lambda idx: all_total[idx],
    default=0,
)
all_years[max_idx]

In [None]:
# Absolute change in male population between each pair of consecutive rows
# (the maximum of these is taken in the next step).
# NOTE: despite its name, all_total_dif is computed from all_male.
all_total_dif = [
    abs(all_male[idx - 1] - all_male[idx])
    for idx in range(1, len(all_years))
]
print(all_total_dif)

In [None]:
# Largest of the collected differences (0 when there are none)
max_total_dif = max([0, *all_total_dif])
max_total_dif

In [None]:
# Between which two consecutive years did the total population change most?
# max_period is the (earlier year, later year) pair of the first largest
# change, or None when there are fewer than two rows to compare.
max_dif = 0
# None means "no period seen yet". Starting at 0 would make the lookup below
# read all_years[-1] and silently pair the last year with the first one.
max_dif_idx = None
for idx in range(1, len(all_years)):
    current_dif = abs(all_total[idx - 1] - all_total[idx])
    if max_dif_idx is None or current_dif > max_dif:
        max_dif = current_dif
        max_dif_idx = idx
if max_dif_idx is None:
    max_period = None
else:
    max_period = (all_years[max_dif_idx - 1], all_years[max_dif_idx])
max_period

In [None]:
# What is the largest difference between the male and female population,
# and in which year does it occur? (The year is None when the two
# populations never differ.)
max_dif = 0
max_year = None
for idx, year in enumerate(all_years):
    gap = abs(all_male[idx] - all_female[idx])
    if gap > max_dif:
        max_dif, max_year = gap, year
max_dif, max_year