## Globals

In [42]:
import pandas as pd
import numpy as np
import os

# Short keys of the datasets handled in this notebook.
datasets=["edu","inc", "urb"]

# Project root, always ending with a path separator so that `root+"data/..."`
# yields a valid path. The final "sandbox" directory is dropped only when it is
# exactly the last path component of the working directory; launched from any
# other directory, the working directory itself is used as the root.
_cwd=os.getcwd()
root=os.path.join(os.path.dirname(_cwd) if os.path.basename(_cwd)=="sandbox" else _cwd,"")

## Loading clean datasets

In [51]:
# Folder with the cleaned frequency tables. The trailing "" keeps a separator at
# the end of the string, and os.path.join makes the result independent of whether
# `root` itself ends with a separator.
clean_folder=os.path.join(root,"data","datasets_clean","")
cleans={}

# Load each cleaned dataset into memory, keyed by the first three letters of its
# name ("education" -> "edu", "income" -> "inc", "urbanisation" -> "urb").
for dataset in ["education","income","urbanisation"]:
    ds_path=os.path.join(clean_folder,"freq_"+dataset+"_cleaned.csv")
    print(ds_path)
    # Files are ';'-separated; the first column is used as the row index.
    cleans[dataset[:3]]=pd.read_csv(ds_path,sep=";",index_col=0)

# Display one of the loaded frames as the cell output.
cleans["edu"]
    

/home/meru/University/5thYear/IDS/datascience-miniproject/data/datasets_clean/freq_education_cleaned.csv
/home/meru/University/5thYear/IDS/datascience-miniproject/data/datasets_clean/freq_income_cleaned.csv
/home/meru/University/5thYear/IDS/datascience-miniproject/data/datasets_clean/freq_urbanisation_cleaned.csv


Unnamed: 0,age,geo,sex,isced11,Every day,Every month,Every week,Less than once a month,Never or not in the last 12 months
0,75 years or over,Austria,Females,"Less than primary, primary and lower secondary...",5.0,22.8,12.3,16.8,43.1
1,75 years or over,Austria,Females,Tertiary education (levels 5-8),7.5,35.1,16.8,7.4,33.2
2,75 years or over,Austria,Females,Upper secondary and post-secondary non-tertiar...,8.1,23.3,23.4,19.9,25.3
6,75 years or over,Austria,Males,"Less than primary, primary and lower secondary...",29.7,17.4,37.5,2.9,12.4
7,75 years or over,Austria,Males,Tertiary education (levels 5-8),26.2,26.2,29.1,12.7,5.8
...,...,...,...,...,...,...,...,...,...
2389,From 65 to 74 years,United Kingdom,Females,Tertiary education (levels 5-8),13.4,11.7,51.0,11.8,12.0
2390,From 65 to 74 years,United Kingdom,Females,Upper secondary and post-secondary non-tertiar...,9.3,11.0,39.9,19.9,19.9
2391,From 65 to 74 years,United Kingdom,Males,"Less than primary, primary and lower secondary...",12.8,15.9,42.3,11.0,18.1
2392,From 65 to 74 years,United Kingdom,Males,Tertiary education (levels 5-8),23.6,8.9,57.5,4.5,5.5


## Loading clusters

In [20]:
# Folder with one CSV per cluster. The trailing "" keeps a separator at the end
# of the string, and os.path.join makes the result independent of whether `root`
# itself ends with a separator.
clu_folder=os.path.join(root,"data","clusters","")
clusters={}

# number_of_clusters[i] is the number of cluster files found for datasets[i].
number_of_clusters=[]

# Load all clusters to memory, keyed by (dataset, cluster_number).
# Files are probed as <dataset>_clu_0.csv, <dataset>_clu_1.csv, ...; the scan for
# a dataset stops at the first index whose file does not exist.
for dataset in datasets:
    cluster_number=0
    while True:
        cluster_path=os.path.join(clu_folder,dataset+"_clu_"+str(cluster_number)+".csv")
        if not os.path.isfile(cluster_path):
            break
        # Files are ';'-separated; the first column is used as the row index.
        clusters[(dataset,cluster_number)]=pd.read_csv(cluster_path,sep=";",index_col=0)
        cluster_number+=1
    number_of_clusters.append(cluster_number)

# Display one of the loaded clusters as the cell output.
clusters[("edu",3)]

Unnamed: 0,age,geo,sex,isced11,Every day,Every month,Every week,Less than once a month,Never or not in the last 12 months,clusters
13,75 years or over,Belgium,Females,Tertiary education (levels 5-8),12.0,29.8,27.9,9.2,21.1,3
110,75 years or over,Germany,Females,Upper secondary and post-secondary non-tertiar...,8.3,26.5,25.2,13.8,26.2,3
342,From 15 to 24 years,Austria,Females,"Less than primary, primary and lower secondary...",0.0,41.4,17.6,14.9,26.1,3
343,From 15 to 24 years,Austria,Females,Tertiary education (levels 5-8),0.0,52.9,22.1,15.7,9.3,3
344,From 15 to 24 years,Austria,Females,Upper secondary and post-secondary non-tertiar...,0.0,43.0,23.1,18.8,15.1,3
...,...,...,...,...,...,...,...,...,...,...
2280,From 65 to 74 years,Norway,Males,"Less than primary, primary and lower secondary...",1.7,43.2,42.0,0.0,13.1,3
2293,From 65 to 74 years,Poland,Males,Tertiary education (levels 5-8),4.1,34.0,19.4,25.8,16.8,3
2318,From 65 to 74 years,Romania,Males,Upper secondary and post-secondary non-tertiar...,10.0,31.8,22.7,7.1,28.4,3
2335,From 65 to 74 years,Slovakia,Males,Tertiary education (levels 5-8),2.6,24.1,26.2,28.1,19.0,3


## Verifying sex ratios

In [56]:
def get_sex_count(df):
    """Count the rows of `df` per sex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "sex" column holding the labels "Females" / "Males".

    Returns
    -------
    tuple of (int, int)
        ``(females, males)`` row counts. A label that does not occur in the
        frame is reported as 0.
    """
    counts=df["sex"].value_counts()
    # .get(..., 0) instead of [...]: a frame without any row of one sex
    # must give a count of 0 rather than raise KeyError.
    F=int(counts.get("Females",0))
    M=int(counts.get("Males",0))
    return F,M

In [61]:
# For every dataset: print the female/male row counts of the full cleaned frame,
# then the same counts for each of its clusters, and finally the per-sex totals
# over all clusters so they can be compared with the full-frame counts.
for dataset,n in zip(datasets,number_of_clusters):
    print(dataset)
    F,M=get_sex_count(cleans[dataset])
    print("Sex counts: ",F,M)

    Fs,Ms=[],[]
    for key in ((dataset,k) for k in range(n)):
        print(key,end=":  ")
        f,m=get_sex_count(clusters[key])
        print(f,m)
        Fs.append(f)
        Ms.append(m)

    # Totals across the clusters of this dataset.
    print(sum(Fs), sum(Ms))
    print()

edu
Sex counts:  588 588
('edu', 0):  23 17
('edu', 1):  87 298
('edu', 2):  104 28
('edu', 3):  176 163
('edu', 4):  198 82
588 588

inc
Sex counts:  980 980
('inc', 0):  49 138
('inc', 1):  173 41
('inc', 2):  127 413
('inc', 3):  284 251
('inc', 4):  347 137
980 980

urb
Sex counts:  588 588
('urb', 0):  246 141
('urb', 1):  108 260
('urb', 2):  23 25
('urb', 3):  15 134
('urb', 4):  196 28
588 588



## Verifying ages

In [73]:
# Age-group labels, in the order used for every list returned by get_age_count.
labels=[
    "From 15 to 24 years",
    "From 25 to 34 years",
    "From 35 to 44 years",
    "From 45 to 54 years",
    "From 55 to 64 years",
    "From 65 to 74 years",
    "75 years or over",
]

def get_age_count(df):
    """Count the rows of `df` per age group.

    Returns a list of ints with one entry per element of `labels`, in the same
    order; an age group that does not occur in ``df["age"]`` is counted as 0.
    """
    per_label=df["age"].value_counts()
    return [int(per_label.get(label,0)) for label in labels]


# For every dataset: print the per-age-group row counts of the full cleaned
# frame, then the same counts for each of its clusters, and finally the
# per-age-group totals over all clusters for comparison with the full frame.
for dataset,n in zip(datasets,number_of_clusters):
    print(dataset)
    ages_total=get_age_count(cleans[dataset])
    print("Age counts: ",ages_total)

    # One list of per-age-group counts for each cluster, printed as it is computed.
    ages=[]
    for key in ((dataset,k) for k in range(n)):
        print(key,end=":  ")
        counts=get_age_count(clusters[key])
        print(counts)
        ages.append(counts)

    # Add up each age group across all clusters (all zeros if there are no clusters).
    sums=[sum(row[age_group] for row in ages) for age_group in range(len(ages_total))]
    print("Sums are: ", sums)
    print()

edu
Age counts:  [168, 168, 168, 168, 168, 168, 168]
('edu', 0):  [13, 8, 2, 5, 0, 2, 10]
('edu', 1):  [22, 39, 57, 66, 82, 74, 45]
('edu', 2):  [25, 12, 11, 11, 16, 21, 36]
('edu', 3):  [76, 82, 59, 57, 36, 27, 2]
('edu', 4):  [32, 27, 39, 29, 34, 44, 75]
Sums are:  [168, 168, 168, 168, 168, 168, 168]

inc
Age counts:  [280, 280, 280, 280, 280, 280, 280]
('inc', 0):  [11, 1, 6, 7, 30, 45, 87]
('inc', 1):  [29, 15, 15, 16, 24, 41, 74]
('inc', 2):  [37, 72, 93, 115, 107, 84, 32]
('inc', 3):  [104, 140, 112, 83, 53, 39, 4]
('inc', 4):  [99, 52, 54, 59, 66, 71, 83]
Sums are:  [280, 280, 280, 280, 280, 280, 280]

urb
Age counts:  [168, 168, 168, 168, 168, 168, 168]
('urb', 0):  [96, 80, 63, 49, 37, 36, 26]
('urb', 1):  [31, 69, 77, 84, 66, 35, 6]
('urb', 2):  [7, 6, 7, 6, 6, 6, 10]
('urb', 3):  [0, 0, 8, 15, 32, 50, 44]
('urb', 4):  [34, 13, 13, 14, 27, 41, 82]
Sums are:  [168, 168, 168, 168, 168, 168, 168]

