In [1]:
# Sample-size threshold for keeping a country in the analysis: the filtering
# cell compares each country's row count against this value with a strict `>`,
# so a country with exactly this many rows is excluded.
country_min_sample_cutoff = 3000

In [2]:
import numpy as np
import pandas as pd
import time
import pickle

from helpers import get_multiple_choice_columns, clone_and_convert_multiple_choice_columns

# Raise pandas' display limits so large frames and long text cells are shown
# rather than truncated.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', 255)

# Wall-clock reference taken before any data is loaded.
script_start = time.time()

# Load the two CSV files shipped under ./assets (results and schema).
df = pd.read_csv('./assets/survey_results_public.csv')
schema = pd.read_csv('./assets/survey_results_schema.csv')

# Comes from the project-local `helpers` module; its return value is not
# visible in this notebook.
multiple_choice_columns = get_multiple_choice_columns()

In [3]:
# Number of rows per country, most frequent first.
countries = df['Country'].value_counts()

# Names of the countries whose row count is strictly above the cutoff.
countries_with_enough_samples = countries.loc[
    countries.gt(country_min_sample_cutoff)
].index.values

In [4]:
# Time the construction of the per-country subset.
start = time.time()
# The multiple-choice conversion step is currently disabled; only the country filter runs:
# country_subset = clone_and_convert_multiple_choice_columns(df.query('Country in @countries_with_enough_samples'))
country_subset = df.query('Country in @countries_with_enough_samples')
end = time.time()

# Report the elapsed time and the (rows, columns) shape before and after filtering.
print('convert multi choice time %ss' % round(end - start, 3))
print("original data set has %s rows and %s columns" % df.shape)
print("convert multi choice dataset has %s rows and %s columns" % country_subset.shape)

convert multi choice time 0.022s
original data set has 64461 rows and 61 columns
convert multi choice dataset has 28658 rows and 61 columns


In [5]:
# {'index': 'answer', 'column': 'count'}
# ed_0_perc = answers_for_other_countries
# ed_1_perc = answers_for_country

# comp_df = pd.merge(ed_1_perc, ed_0_perc, left_index=True, right_index=True)
# comp_df.columns = ['ed_1_perc', 'ed_0_perc']
# comp_df['Diff_HigherEd_Vals'] = comp_df['ed_1_perc'] - comp_df['ed_0_perc']
# comp_df.style.bar(subset=['Diff_HigherEd_Vals'], align='mid', color=['#d65f5f', '#5fba7d'])

In [7]:
def split_results_by_on_country_v2(df, mc_column):
    """Count the answers to a multiple-choice question for every country.

    Each non-missing cell of ``df[mc_column]`` is split on ``';'`` and every
    resulting answer is tallied against the ``'Country'`` value of its row.
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Country'`` column and the ``mc_column`` column.
    mc_column : str
        Name of the column holding ``';'``-separated answer strings.
        Missing cells (NaN / None / pd.NA) contribute nothing.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``indexed_by_country`` has one row per country and one column per
        answer; ``indexed_by_answer`` has one row per answer and one column
        per country. Cells are integer counts, and a (country, answer) pair
        that never occurs is 0 rather than NaN.
    """
    counts_by_answer = {}   # answer  -> {country -> count}
    counts_by_country = {}  # country -> {answer  -> count}

    # Walk the two columns directly instead of tallying inside a callback
    # passed to DataFrame.apply(axis=1): apply builds a Series per row, and
    # older pandas releases may invoke the callback twice for the first row,
    # which would double-count it.
    for country, answer_string in zip(df['Country'], df[mc_column]):
        if pd.isna(answer_string):
            continue

        for answer in answer_string.split(';'):
            per_country = counts_by_answer.setdefault(answer, {})
            per_country[country] = per_country.get(country, 0) + 1

            per_answer = counts_by_country.setdefault(country, {})
            per_answer[answer] = per_answer.get(answer, 0) + 1

    # Pairs that were never seen are absent from the nested dicts and would
    # surface as NaN; a count that was never incremented is 0.
    indexed_by_country = pd.DataFrame(data=counts_by_answer).fillna(0).astype(int)
    indexed_by_answer = pd.DataFrame(data=counts_by_country).fillna(0).astype(int)

    return indexed_by_country, indexed_by_answer

# Tally the 'DatabaseDesireNextYear' answers for the filtered countries.
indexed_by_country, indexed_by_answer = split_results_by_on_country_v2(
    country_subset,
    'DatabaseDesireNextYear',
)

# Across all countries: selections per answer, the grand total, and each
# answer's share of that total rounded to four decimals.
counts_per_answer = indexed_by_answer.sum(axis='columns')
total_answer_count = counts_per_answer.sum()
proportion_per_answer = counts_per_answer.div(total_answer_count).round(4)

In [8]:
# Same share-of-selections calculation, restricted to a single country.
country = 'United States'
country_counts_per_answer = indexed_by_answer.loc[:, country]
country_total_answer_count = country_counts_per_answer.sum()
country_proportion_per_answer = country_counts_per_answer.div(country_total_answer_count).round(4)
country_proportion_per_answer

Microsoft SQL Server    0.1036
Cassandra               0.0338
PostgreSQL              0.1703
Elasticsearch           0.0813
MongoDB                 0.1072
Redis                   0.1107
MySQL                   0.1141
Couchbase               0.0152
DynamoDB                0.0530
MariaDB                 0.0364
SQLite                  0.0914
Firebase                0.0504
Oracle                  0.0258
IBM DB2                 0.0068
Name: United States, dtype: float64

In [10]:
# Overall share minus the selected country's share, per answer
# (positive values mean the answer is less common in that country).
diff = proportion_per_answer.sub(country_proportion_per_answer)
# diff.plot.bar(align='center')
diff

pandas.core.series.Series

In [None]:
# data set : questions : (question) -> total_respondents, total_answers
# data set 2 : answers : (question, answer) -> answer_count, respondent_proportion, respondent_proportion_stddev
# data set 1 : answers by country : (country, question, answer) -> country_answers, country_respondent_proportion, country_respondent_diff_proportion 