# Nobel Prize Winners (1900-2020)

In [None]:
import csv
# Load every row of the CSV into memory as a list of lists of strings.
# newline="" hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields that contain line breaks are parsed correctly.
with open("data/nobel_prize_by_winner.csv", encoding="cp1256", newline="") as f:
    reader = csv.reader(f)
    raw_data = list(reader)
# Preview the first two rows
raw_data[:2]

In [None]:
# Let's parse the information
from datetime import datetime 
import re

def parse_data(data, cols_to_use, convert_fn):
    """Project raw rows onto the selected columns, converting each value.

    The first element of `data` is taken as the header row. Every following
    row that has an empty string at position 1 or position 3 is dropped.
    For the remaining rows, each column named in `cols_to_use` is looked up
    by header name and passed through `convert_fn(column_name, raw_value)`.

    Returns a tuple `(rows, positions)` where `rows` is the list of converted
    rows and `positions` maps each name in `cols_to_use` to its index inside
    a converted row.
    """
    source_pos = dict((name, pos) for pos, name in enumerate(data[0]))
    parsed = []
    for record in data[1:]:
        # Only keep records where both checked fields are filled in
        if record[1] != "" and record[3] != "":
            parsed.append(
                [convert_fn(name, record[source_pos[name]]) for name in cols_to_use]
            )
    target_pos = {}
    for pos, name in enumerate(cols_to_use):
        target_pos[name] = pos
    return parsed, target_pos
            
def convert_field_fn(field_name, value):
    """Convert one raw CSV field to the type used by the analysis.

    * ``born`` / ``died``: return the year as an ``int``, or ``None`` when
      the date is missing (an empty string, or all zeros such as
      ``0000-00-00``). Both year-last (``3/27/1845``) and year-first
      (``1845-03-27``) layouts are accepted.
    * ``year``: return the value as an ``int``.
    * any other field: return the value unchanged.
    """
    if field_name in {'born', 'died'}:
        comps = re.split(r'[/-]', value.strip())
        # Empty strings and all-zero placeholders carry no date information.
        if not any(comp.strip('0') for comp in comps):
            return None
        # A four-character first component means the date is written
        # year-first; otherwise the year is the last component. Taking the
        # last component unconditionally would return the day for ISO dates.
        year_text = comps[0] if len(comps[0]) == 4 else comps[-1]
        return int(year_text)
    if field_name == 'year':
        return int(value)
    return value

# Keep only the columns used by the analysis, converting values on the way in.
# `col_idx` is the second value returned by parse_data and is used afterwards
# to index into the parsed rows by column name.
data, col_idx = parse_data(
    raw_data,
    [
        'firstname',
        'surname',
        'born',
        'died',
        'bornCountry',
        'bornCountryCode',
        'diedCountry',
        'diedCountryCode',
        'gender',
        'year',
        'category',
    ],
    convert_field_fn,
)
# Preview the first three parsed rows
data[:3]

In [None]:
# Show the column names available in the parsed rows
col_idx.keys()

In [None]:
# Some entries are missing the year of birth ... this always happens with real data
data = [row for row in data if row[col_idx['born']] is not None]

In [None]:
# Distribution by gender
from collections import Counter

def get_column_values(data, col_name):
    """Return the values of column `col_name`, one per row of `data`.

    The column position is looked up in the module-level `col_idx` mapping.
    """
    values = []
    for row in data:
        values.append(row[col_idx[col_name]])
    return values
    
# Count how many rows carry each value of the 'gender' column
Counter(get_column_values(data, 'gender'))

In [None]:
# Same, but before and after 1990
def filter_rows(data, fn_select):
    """Return a new list holding the rows of `data` for which `fn_select(row)` is truthy."""
    selected = []
    for row in data:
        if fn_select(row):
            selected.append(row)
    return selected

# Rows whose prize year is 1990 or earlier, followed by their gender counts
rows_before = filter_rows(data, lambda row: row[col_idx['year']] <= 1990)
Counter(get_column_values(rows_before, 'gender'))

In [None]:
# Rows whose prize year is later than 1990, followed by their gender counts
rows_after = filter_rows(data, lambda row: row[col_idx['year']] > 1990)
Counter(get_column_values(rows_after, 'gender'))

In [None]:
# Histogram of ages when receiving the Nobel Prize.
# Each age is the prize year minus the birth year.
all_ages = [
    laureate[col_idx['year']] - laureate[col_idx['born']]
    for laureate in data
]
# Preview the first five ages
all_ages[:5]

In [None]:
import matplotlib.pyplot as plt

def create_histogram(data, min_value, max_value):
    """Count integer values into unit-wide bins between two bounds.

    The result has ``max_value - min_value + 1`` bins. Bin ``i`` counts the
    values equal to ``min_value + i``. Values below ``min_value`` are added
    to the first bin and values at or above ``max_value`` to the last bin.

    Args:
        data: iterable of integers to count.
        min_value: value represented by the first bin.
        max_value: value represented by the last bin.

    Returns:
        A list of counts, one per bin.

    Raises:
        ValueError: if ``max_value`` is smaller than ``min_value``.
    """
    if max_value < min_value:
        raise ValueError("max_value must be greater than or equal to min_value")
    # The span is the difference of the bounds; adding them instead would
    # create bins that no value can ever reach.
    last_bin = max_value - min_value
    counts = [0] * (last_bin + 1)
    for value in data:
        # Clamp out-of-range values into the first / last bin
        idx = min(max(value - min_value, 0), last_bin)
        counts[idx] += 1
    return counts
                  
# Count the ages with bounds 1 and 90, then draw the counts as a line plot
age_hist = create_histogram(all_ages, 1, 90)
plt.plot(age_hist)


In [None]:
# For each category, the average age when receiving the Nobel Prize
from collections import defaultdict

# Group ages at award time (prize year minus birth year) by prize category.
# `list` is the idiomatic default factory when grouping values into lists.
by_category = defaultdict(list)
for d in data:
    category = d[col_idx['category']]
    age = d[col_idx['year']] - d[col_idx['born']]
    by_category[category].append(age)
# Display the grouped ages
by_category

In [None]:
# One tuple per category: (category, mean age, minimum age, maximum age)
[(cat, sum(ages) / len(ages), min(ages), max(ages)) for cat, ages in by_category.items()]