In [1]:
import pandas as pd
import numpy as np

# 1. Importing and first look at data

In [2]:
population = pd.read_csv('LGBT_Survey_SubsetSize.csv')
display(population.shape)
population.head()

(29, 7)

Unnamed: 0,CountryID,N,Lesbian women,Gay men,Bisexual women,Bisexual men,Transgender
0,EU Total,93079,15236,57448,6424,7200,6771
1,AT,2543,437,1558,164,217,167
2,BE,2901,424,1998,132,182,165
3,BG,1033,226,474,139,88,106
4,CY,265,41,164,18,17,25


In [3]:
population['CountryID'].nunique()

29

In [4]:
population['validate_n'] = population['N'] == (population['Lesbian women'] + population['Gay men'] + population['Bisexual women'] + population['Bisexual men'] + population['Transgender'])
population

Unnamed: 0,CountryID,N,Lesbian women,Gay men,Bisexual women,Bisexual men,Transgender,validate_n
0,EU Total,93079,15236,57448,6424,7200,6771,True
1,AT,2543,437,1558,164,217,167,True
2,BE,2901,424,1998,132,182,165,True
3,BG,1033,226,474,139,88,106,True
4,CY,265,41,164,18,17,25,True
5,CZ,2469,565,1432,163,139,170,True
6,DE,20271,2445,13907,590,2000,1329,True
7,DK,1710,393,930,136,108,143,True
8,EE,374,99,170,61,20,24,True
9,GR,2760,399,1590,219,288,264,True


In [5]:
#There are 28 unique countries of origin in the study and 
#a row with the sum of all countries (EU Total)
#I've checked and validated that the column 'N' is the sum of the subsets per country (ie total population per country)

In [6]:
#Country codes: some input uses CountryID, others use Country Code, will uniformise and use CountryCode everywhere. 

In [7]:
country_codes = pd.read_excel('QES_country_codes.xlsx', sheet_name='Country_code')
display(country_codes.shape)
country_codes

(29, 3)

Unnamed: 0,CountryID,CountryCode,country_code
0,AT,Austria,austria
1,BE,Belgium,belgium
2,BG,Bulgaria,bulgaria
3,CY,Cyprus,cyprus
4,CZ,Czech_Republic,czech_republic
5,DE,Germany,germany
6,DK,Denmark,denmark
7,EE,Estonia,estonia
8,GR,Greece,greece
9,ES,Spain,spain


In [8]:
population['CountryID'] = population['CountryID'].map(country_codes.set_index('CountryID')['CountryCode'])
population = population.drop(['validate_n'],axis=1)


In [9]:
#I was going to clean up the column names (lower case/no space, etc.) but other dataframes use these columns names so doesn't make sense to rename it everywhere!
#Instead just changing names where needed. 

In [10]:
population_clean = population.rename(columns={'CountryID':'CountryCode','Lesbian women': 'Lesbian', 'Gay men':'Gay'})
population_clean.head()

Unnamed: 0,CountryCode,N,Lesbian,Gay,Bisexual women,Bisexual men,Transgender
0,EU_Total,93079,15236,57448,6424,7200,6771
1,Austria,2543,437,1558,164,217,167
2,Belgium,2901,424,1998,132,182,165
3,Bulgaria,1033,226,474,139,88,106
4,Cyprus,265,41,164,18,17,25


In [11]:
#I need to melt this table to get the count per country/subset per row, not as pivot table. 

In [12]:
population_vert = population_clean.melt(id_vars=['CountryCode'])
population_vert

Unnamed: 0,CountryCode,variable,value
0,EU_Total,N,93079
1,Austria,N,2543
2,Belgium,N,2901
3,Bulgaria,N,1033
4,Cyprus,N,265
...,...,...,...
169,Sweden,Transgender,374
170,Slovenia,Transgender,29
171,Slovakia,Transgender,115
172,United_Kingdom,Transgender,813


In [13]:
population_vert['country_subset'] = population_vert['CountryCode'] + ' ' +population_vert['variable']
population_vert

Unnamed: 0,CountryCode,variable,value,country_subset
0,EU_Total,N,93079,EU_Total N
1,Austria,N,2543,Austria N
2,Belgium,N,2901,Belgium N
3,Bulgaria,N,1033,Bulgaria N
4,Cyprus,N,265,Cyprus N
...,...,...,...,...
169,Sweden,Transgender,374,Sweden Transgender
170,Slovenia,Transgender,29,Slovenia Transgender
171,Slovakia,Transgender,115,Slovakia Transgender
172,United_Kingdom,Transgender,813,United_Kingdom Transgender


In [14]:
population_vert['country_subset'].isna().sum()

0

In [15]:
#Now I import all "response" data (except for trans specific questions as the "reference population" is different)

In [16]:
daily_life = pd.read_csv('LGBT_Survey_DailyLife.csv')
discrimination = pd.read_csv('LGBT_Survey_Discrimination.csv')
rights_awareness = pd.read_csv('LGBT_Survey_RightsAwareness.csv')
transgender_specific = pd.read_csv('LGBT_Survey_TransgenderSpecificQuestions.csv')
violence_harassment = pd.read_csv('LGBT_Survey_ViolenceAndHarassment.csv')
data=pd.concat([daily_life,discrimination,rights_awareness,violence_harassment],axis=0)
display(data.shape)
data.head()

(98920, 7)

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,notes
0,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very widespread,8,
1,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly widespread,34,
2,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly rare,45,
3,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very rare,9,
4,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Don`t know,4,[0]


In [None]:
data = data.drop(['notes'],axis=1)

In [None]:
#column notes is mostly NaNs and can't be cleaned up, so dropping it. 

In [17]:
#checking if country codes all match our population_vert (needed to match later)

In [18]:
data['CountryCode'].value_counts()

Average           3583
Germany           3566
United Kingdom    3537
Italy             3534
Spain             3529
France            3525
Sweden            3512
Finland           3491
Austria           3487
Poland            3481
Netherlands       3475
Greece            3468
Czech Republic    3454
Romania           3452
Hungary           3446
Croatia           3442
Ireland           3437
Belgium           3432
Portugal          3432
Bulgaria          3429
Denmark           3390
Slovakia          3368
Lithuania         3360
Latvia            3283
Estonia           3236
Slovenia          3222
Cyprus            3160
Malta             3153
Luxembourg        3036
Name: CountryCode, dtype: int64

In [19]:
#I can see United_kingdom and Czech_republic haven not converted and still have a space in data, need to reformat.
#There's also a CountryCode average that I will drop, as can't calculate response on it. 

In [20]:
data['CountryCode'] = np.where(data['CountryCode'].isin (["United Kingdom"]), "United_Kingdom",data['CountryCode'])
data['CountryCode'] = np.where(data['CountryCode'].isin (["Czech Republic"]), "Czech_Republic",data['CountryCode'])
data = data[data.CountryCode != "Average"]

In [21]:
data['CountryCode'].value_counts()

Germany           3566
United_Kingdom    3537
Italy             3534
Spain             3529
France            3525
Sweden            3512
Finland           3491
Austria           3487
Poland            3481
Netherlands       3475
Greece            3468
Czech_Republic    3454
Romania           3452
Hungary           3446
Croatia           3442
Ireland           3437
Belgium           3432
Portugal          3432
Bulgaria          3429
Denmark           3390
Slovakia          3368
Lithuania         3360
Latvia            3283
Estonia           3236
Slovenia          3222
Cyprus            3160
Malta             3153
Luxembourg        3036
Name: CountryCode, dtype: int64

In [None]:
#just checking

In [None]:
#Now to get actual response figures! 

In [30]:
data['country_subset'] = data['CountryCode'] + ' ' +data['subset']

In [31]:
data['pop_subset'] = data['country_subset'].map(population_vert.set_index('country_subset')['value'])

In [32]:
data.head()

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,country_subset,pop_subset
0,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very widespread,8,Austria Lesbian,437
1,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly widespread,34,Austria Lesbian,437
2,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly rare,45,Austria Lesbian,437
3,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very rare,9,Austria Lesbian,437
4,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Don`t know,4,Austria Lesbian,437


In [36]:
data.dtypes

CountryCode       object
subset            object
question_code     object
question_label    object
answer            object
percentage        object
country_subset    object
pop_subset         int64
response          object
dtype: object

In [None]:
#trying to transform percentage to integer but I have a number of rows with ":". Unfortunately I can't do anything with these (it's a large group and I can't just assign them to something without majorely skewing the data). 


In [41]:
data['percentage'].value_counts()

:      10073
2       5268
1       4869
3       4867
0       4433
       ...  
96       145
97        91
98        78
100       45
99        29
Name: percentage, Length: 102, dtype: int64

In [44]:
data = data[data.percentage != ":"]

In [45]:
data['percentage'].value_counts()

2      5268
1      4869
3      4867
0      4433
4      4241
       ... 
96      145
97       91
98       78
100      45
99       29
Name: percentage, Length: 101, dtype: int64

In [46]:
data.percentage = data['percentage'].astype(int)

In [47]:
data.dtypes

CountryCode       object
subset            object
question_code     object
question_label    object
answer            object
percentage         int64
country_subset    object
pop_subset         int64
response          object
dtype: object

In [54]:
data['response'] = round(data['pop_subset']*data['percentage']/100)

In [56]:
data.response = data['response'].astype(int)

In [57]:
data.head()

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,country_subset,pop_subset,response
0,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very widespread,8,Austria Lesbian,437,35
1,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly widespread,34,Austria Lesbian,437,149
2,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly rare,45,Austria Lesbian,437,197
3,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very rare,9,Austria Lesbian,437,39
4,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Don`t know,4,Austria Lesbian,437,17


In [58]:
#Now I have a clean set of data with no more NaNs/odd values and the actual number of responses (instead of percentage we had at the start).

(85264, 9)

In [59]:
round(data.isna().sum()/len(data),4)*100

CountryCode       0.0
subset            0.0
question_code     0.0
question_label    0.0
answer            0.0
percentage        0.0
country_subset    0.0
pop_subset        0.0
response          0.0
dtype: float64

In [60]:
data_def = data.copy()
data_def

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,country_subset,pop_subset,response
0,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very widespread,8,Austria Lesbian,437,35
1,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly widespread,34,Austria Lesbian,437,149
2,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly rare,45,Austria Lesbian,437,197
3,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very rare,9,Austria Lesbian,437,39
4,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Don`t know,4,Austria Lesbian,437,17
...,...,...,...,...,...,...,...,...,...
45320,United_Kingdom,Transgender,fb2_9,MOST SERIOUS incident of harassment - Do you t...,Lesbian,1,United_Kingdom Transgender,813,8
45321,United_Kingdom,Transgender,fb2_9,MOST SERIOUS incident of harassment - Do you t...,Gay,3,United_Kingdom Transgender,813,24
45322,United_Kingdom,Transgender,fb2_9,MOST SERIOUS incident of harassment - Do you t...,Bisexual,1,United_Kingdom Transgender,813,8
45323,United_Kingdom,Transgender,fb2_9,MOST SERIOUS incident of harassment - Do you t...,Mixed sexual orientation,3,United_Kingdom Transgender,813,24


# Saving data to.csv

In [108]:
data_def.to_csv('data_def_20220207.csv')
population_vert.to_csv('population_vert_20220207.csv')

# 2. Starting EDA with high level stats

In [101]:
country_counts = population_vert.groupby(['CountryCode']).agg({'value':sum})
country_level = pd.DataFrame(country_counts.reset_index())
country_level = country_level.sort_values(by=['value'],ascending=False)
country_level

Unnamed: 0,CountryCode,value
7,EU_Total,186158
11,Germany,40542
15,Italy,26510
10,France,16750
28,United_Kingdom,13518
26,Spain,12776
9,Finland,6878
20,Netherlands,6350
1,Belgium,5802
21,Poland,5580


In [104]:
genpop = pd.read_excel('QES_genpop.xlsx')
genpop = genpop.sort_values(by=['Population_2012'], ascending=False)
genpop

Unnamed: 0,CountryCode,Population_2012
5,Germany,80425823
11,France,65659814
27,United_Kingdom,63700215
15,Italy,59539717
9,Spain,46773055
21,Poland,38063164
23,Romania,20058035
20,Netherlands,16754962
1,Belgium,11106932
8,Greece,11045011


In [107]:
genpop['sample'] = genpop['CountryCode'].map(country_level.set_index('CountryCode')['value'])
genpop['respondents_country']= genpop['sample']/genpop['Population_2012']*100
genpop = genpop.sort_values(by=['respondents_country'], ascending=False)
genpop

Unnamed: 0,CountryCode,Population_2012,sample,respondents_country
19,Malta,420028,716,0.170465
10,Finland,5413971,6878,0.127042
17,Luxembourg,530946,636,0.119786
14,Ireland,4599533,3250,0.070659
26,Slovenia,2057159,1272,0.061833
6,Denmark,5591572,3420,0.061163
0,Austria,8429991,5086,0.060332
7,Estonia,1322696,748,0.056551
13,Croatia,4267558,2394,0.056098
16,Lithuania,2987773,1642,0.054957


In [None]:
#create visualization in tableau for background (map?)

In [99]:
subset_counts = population_vert.groupby(['variable']).agg({'value':sum})
subset_counts = pd.DataFrame(subset_counts.reset_index())
subset_counts = subset_counts.sort_values(by=['value'],ascending=False)
subset_counts

Unnamed: 0,variable,value
4,N,186158
2,Gay,114896
3,Lesbian,30472
0,Bisexual men,14400
5,Transgender,13542
1,Bisexual women,12848


In [None]:
#create visualization in tableau for background (explain assumptions about bias)