# Data Processing

## Data Summary
- Eurovision votes - votes.csv
    - Eurovisionworld fan website
    - ISO alpha-2 country codes
- Country codes
    - TODO: Find dataset
    - Can webscrape from official [ISO website](https://www.iso.org/obp/ui/#search)
    - Made country_codes_raw.csv from copy-pasting
    - Need ISO alpha-2 and ISO alpha-3
- Country capitals
    - TODO: Find dataset
- Religious demographic data
    - WRP_national.csv
    - ISO alpha-3 country codes
- GDP data
    - World_Bank_Data.csv
    - ISO alpha-3 country codes

# Utilities
- TODO: Move to .py files?

In [4]:
import pandas as pd

## Clean Country Codes Data

In [5]:
codes_raw = pd.read_csv("../data/country_codes_raw.csv")
codes_raw.head()

Unnamed: 0,name_fr,iso-alpha-2,iso-alpha-3,numeric
0,Afghanistan (l'),AF,AFG,4.0
1,,,,
2,Albanie (l'),AL,ALB,8.0
3,,,,
4,Algérie (l'),DZ,DZA,12.0


In [9]:
def get_clean_codes(codes_raw):
    codes = codes_raw.dropna()
    codes = codes[["iso-alpha-2", "iso-alpha-3"]]
    codes["iso-alpha-2"] = codes["iso-alpha-2"].str.lower()
    return codes

In [10]:
country_codes = get_clean_codes(codes_raw)
country_codes.head()

Unnamed: 0,iso-alpha-2,iso-alpha-3
0,af,AFG
2,al,ALB
4,dz,DZA
6,as,ASM
8,ad,AND


In [11]:
countries = pd.read_csv("../data/country_info.csv")
countries.head()

Unnamed: 0,name,code,capital
0,Albania,al,Tirana
1,Armenia,am,Yerevan
2,Australia,au,Canberra
3,Austria,at,Vienna
4,Azerbaijan,az,Baku


In [14]:
merged_countries = countries.merge(country_codes, left_on="code", right_on="iso-alpha-2")
merged_countries = merged_countries.drop("code", axis=1)
merged_countries.head()

Unnamed: 0,name,capital,iso-alpha-2,iso-alpha-3
0,Albania,Tirana,al,ALB
1,Armenia,Yerevan,am,ARM
2,Australia,Canberra,au,AUS
3,Austria,Vienna,at,AUT
4,Azerbaijan,Baku,az,AZE


## GDP Statistics

In [45]:
gdp_raw = pd.read_csv("../data/World_Bank_Data.csv")
gdp_raw

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,9035043,9214083,9404406,9604487,9814318,10036008,...,32792523,33831764,34700612,35688935,36743039,37856121,39068979,40000412,40578842,41454761
1,"Population, total",SP.POP.TOTL,Algeria,DZA,11424922,11628883,11800771,11982118,12179813,12365976,...,39205031,40019529,40850721,41689299,42505035,43294546,44042091,44761099,45477389,46164219
2,"Population, total",SP.POP.TOTL,Angola,AGO,5231654,5301583,5354310,5408320,5464187,5521981,...,27160769,28157798,29183070,30234839,31297155,32375632,33451132,34532429,35635029,36749906
3,"Population, total",SP.POP.TOTL,Antigua and Barbuda,ATG,55603,56540,57336,58138,59020,59970,...,88765,89409,89969,90468,90926,91364,91846,92349,92840,93316
4,"Population, total",SP.POP.TOTL,Argentina,ARG,20386045,20726276,21072538,21421705,21769453,22112629,...,43024071,43477012,43900313,44288894,44654882,44973465,45191965,45312281,45407904,45538401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9050,,,,,,,,,,,...,,,,,,,,,,
9051,,,,,,,,,,,...,,,,,,,,,,
9052,,,,,,,,,,,...,,,,,,,,,,
9053,Data from database: World Development Indicators,,,,,,,,,,...,,,,,,,,,,


In [67]:
gdp_data = gdp_raw.drop(["Country Name", "Series Code"], axis=1)
gdp_data = gdp_data.iloc[:-5]
# gdp_data.unstack()
gdp_data = gdp_data.set_index(["Country Code","Series Name"])
gdp_data = gdp_data.stack().unstack(1)
# gdp_data["Series Name"].unique()
# gdp_raw["Series Name"].unique()

gdp_data = gdp_data[['GDP per capita (current US$)', 'Population, total']].reset_index()
gdp_data["Year"] = gdp_data["level_1"].str.split().str.get(0)
gdp_data["Year"] = pd.to_numeric(gdp_data["Year"])
gdp_data = gdp_data.drop("level_1", axis=1)
gdp_data
# gdp_data[gdp_data["Country Code"] == "DEU"]

Series Name,Country Code,GDP per capita (current US$),"Population, total",Year
0,ABW,..,54922,1960
1,ABW,..,55578,1961
2,ABW,..,56320,1962
3,ABW,..,57002,1963
4,ABW,..,57619,1964
...,...,...,...,...
11579,ZWE,1684.02790388182,15271368,2019
11580,ZWE,1730.41348946953,15526888,2020
11581,ZWE,1724.38773104902,15797210,2021
11582,ZWE,2040.55245910726,16069056,2022


In [68]:
europe_gdp_data = merged_countries.merge(gdp_data, left_on="iso-alpha-3", right_on="Country Code")

In [69]:
europe_gdp_data

Unnamed: 0,name,capital,iso-alpha-2,iso-alpha-3,Country Code,GDP per capita (current US$),"Population, total",Year
0,Albania,Tirana,al,ALB,ALB,..,1608800,1960
1,Albania,Tirana,al,ALB,ALB,..,1659800,1961
2,Albania,Tirana,al,ALB,ALB,..,1711319,1962
3,Albania,Tirana,al,ALB,ALB,..,1762621,1963
4,Albania,Tirana,al,ALB,ALB,..,1814135,1964
...,...,...,...,...,...,...,...,...
3195,Turkiye,Ankara,tr,TUR,TUR,9215.44049888114,82579440,2019
3196,Turkiye,Ankara,tr,TUR,TUR,8638.73903848102,83384680,2020
3197,Turkiye,Ankara,tr,TUR,TUR,9743.21277804855,84147318,2021
3198,Turkiye,Ankara,tr,TUR,TUR,10674.504157865,84979913,2022


In [72]:
# votes.merge(europe_gdp_data, left_on=["from_country_id", "year"], right_on=["iso-alpha-2", "Year"])

Unnamed: 0,year,round,from_country_id,to_country_id,from_country,to_country,total_points,tele_points,jury_points,name,capital,iso-alpha-2,iso-alpha-3,Country Code,GDP per capita (current US$),"Population, total",Year
0,1960,final,at,fr,at,fr,1,,,Austria,Vienna,at,AUT,AUT,939.914814681174,7047539,1960
1,1960,final,at,gb,at,gb,3,,,Austria,Vienna,at,AUT,AUT,939.914814681174,7047539,1960
2,1960,final,at,mc,at,mc,0,,,Austria,Vienna,at,AUT,AUT,939.914814681174,7047539,1960
3,1960,final,at,no,at,no,1,,,Austria,Vienna,at,AUT,AUT,939.914814681174,7047539,1960
4,1960,final,at,de,at,de,2,,,Austria,Vienna,at,AUT,AUT,939.914814681174,7047539,1960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50362,2023,final,gb,al,gb,al,0,0.0,0.0,United Kingdom,London,gb,GBR,GBR,49463.8554617343,68350000,2023
50363,2023,final,gb,pt,gb,pt,0,0.0,0.0,United Kingdom,London,gb,GBR,GBR,49463.8554617343,68350000,2023
50364,2023,final,gb,rs,gb,rs,0,0.0,0.0,United Kingdom,London,gb,GBR,GBR,49463.8554617343,68350000,2023
50365,2023,final,gb,gb,gb,gb,0,0.0,0.0,United Kingdom,London,gb,GBR,GBR,49463.8554617343,68350000,2023


## Religious Demographic Data

In [73]:
wrp_raw = pd.read_csv("../data/WRP_national.csv")
wrp_raw.head()

Unnamed: 0,year,state,name,chrstprot,chrstcat,chrstorth,chrstang,chrstothr,chrstgen,judorth,...,othrgenpct,sumreligpct,total,dualrelig,datatype,sourcereliab,recreliab,reliabilevel,Version,sourcecode
0,1945,2,USA,66069671,38716742,1121898,2400000,1956807,110265118,821489,...,0.0039,0.9961,1.0,0,34,2,10,Medium,1.1,13
1,1950,2,USA,73090083,42635882,3045420,3045420,1177214,122994019,1078078,...,0.0041,0.9959,1.0,0,34,6,28,Low,1.1,18
2,1955,2,USA,79294628,46402368,3454916,2572767,2277091,134001770,944000,...,0.0193,0.9807,0.9999,0,134,5,10,Medium,1.1,15
3,1960,2,USA,90692928,50587880,3334535,2710065,2908939,150234347,973500,...,0.0076,0.9924,0.9999,0,134,2,10,Medium,1.1,13
4,1965,2,USA,94165803,64761783,4792868,2822149,973155,167515758,991200,...,0.003,0.997,1.0001,0,134,8,28,Low,1.1,20


In [112]:
europe_religion_data = merged_countries.merge(wrp_raw, left_on='iso-alpha-3', right_on="name")
test_data = europe_religion_data[["year", "iso-alpha-3", "chrstprot", "chrstcat"]]
# europe_religion_data

In [113]:
test_merged = test_data.merge(europe_gdp_data, how="right", left_on=["year", "iso-alpha-3"], right_on=["Year" ,"iso-alpha-3"])
test_merged = test_merged.drop("year", axis=1)
test_merged = test_merged.set_index("Year")
test_merged = test_merged.interpolate()
# test_merged["chrstprot"] = test_merged["chrstprot"].interpolate()
test_merged

  test_merged = test_merged.interpolate()


Unnamed: 0_level_0,iso-alpha-3,chrstprot,chrstcat,name,capital,iso-alpha-2,Country Code,GDP per capita (current US$),"Population, total"
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1960,ALB,0.0,119200.0,Albania,Tirana,al,ALB,..,1608800
1961,ALB,0.0,117800.0,Albania,Tirana,al,ALB,..,1659800
1962,ALB,0.0,116400.0,Albania,Tirana,al,ALB,..,1711319
1963,ALB,0.0,115000.0,Albania,Tirana,al,ALB,..,1762621
1964,ALB,0.0,113600.0,Albania,Tirana,al,ALB,..,1814135
...,...,...,...,...,...,...,...,...,...
2019,TUR,64498.0,35747.0,Turkiye,Ankara,tr,TUR,9215.44049888114,82579440
2020,TUR,64498.0,35747.0,Turkiye,Ankara,tr,TUR,8638.73903848102,83384680
2021,TUR,64498.0,35747.0,Turkiye,Ankara,tr,TUR,9743.21277804855,84147318
2022,TUR,64498.0,35747.0,Turkiye,Ankara,tr,TUR,10674.504157865,84979913


Unnamed: 0,year,round,from_country_id,to_country_id,from_country,to_country,total_points,tele_points,jury_points
0,1957,final,at,nl,at,nl,6,,
1,1957,final,at,fr,at,fr,0,,
2,1957,final,at,dk,at,dk,0,,
3,1957,final,at,lu,at,lu,3,,
4,1957,final,at,de,at,de,0,,
...,...,...,...,...,...,...,...,...,...
51349,2023,final,wld,al,wld,al,6,6.0,
51350,2023,final,wld,pt,wld,pt,0,0.0,
51351,2023,final,wld,rs,wld,rs,0,0.0,
51352,2023,final,wld,gb,wld,gb,0,0.0,
