In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Load the CSV file with results from the Python survey into a data frame.
# The path is relative, so it resolves against the kernel's working directory.
py_filename = '../data/2020_sharing_data_outside.csv'

# low_memory=False has pandas parse the file in one pass instead of in chunks,
# so each column's dtype is inferred from all of its values at once.
py_df = pd.read_csv(py_filename, low_memory=False)

In [None]:
# Preview the first rows of the freshly loaded survey frame.
py_df.head()

In [None]:
# Rename some columns

# Column names that are filed under the top-level label 'general' instead of
# being split on their last dot.
general_columns = ['age',
                   'are.you.datascientist',
                   'company.size',
                   'country.live',
                   'employment.status',
                   'first.learn.about.main.ide',
                   'how.often.use.main.ide',
                   'is.python.main',
                   'main.purposes',
                   'missing.features.main.ide',
                   'nps.main.ide',
                   'python.version.most',
                   'python.years',
                   'python2.version.most',
                   'python3.version.most',
                   'several.projects',
                   'team.size',
                   'use.python.most',
                   'years.of.coding'
                  ]

# Use the function `pd.MultiIndex.from_tuples` to create the multi-index,
# and then reassign it back to `df.columns`.

def column_multi_name(column_name):
    """Translate a flat column name into a two-level ``(group, item)`` key.

    Parameters
    ----------
    column_name : str or tuple
        A flat column label such as ``'other.lang.Java'``. A tuple is
        accepted as well and treated as an already-converted key.

    Returns
    -------
    tuple
        * the tuple itself, if ``column_name`` is already a tuple;
        * ``('general', column_name)`` if the name is listed in
          ``general_columns`` or contains no ``'.'`` at all;
        * otherwise ``(prefix, suffix)``, split on the LAST ``'.'``.
    """
    # Re-running the notebook cell hands us the tuples produced by the first
    # run; pass them through so the conversion is idempotent.
    if isinstance(column_name, tuple):
        return column_name

    if column_name in general_columns:
        return ('general', column_name)

    first, separator, rest = column_name.rpartition('.')
    if not separator:
        # No dot to split on: file the column under 'general' rather than
        # failing on tuple unpacking.
        return ('general', column_name)
    return (first, rest)
    
# Swap the flat column labels for a two-level MultiIndex, one
# (group, item) tuple per existing column, in the existing order.
py_df.columns = pd.MultiIndex.from_tuples(
    list(map(column_multi_name, py_df.columns))
)

In [None]:
# Preview the frame again to check the new two-level column labels.
py_df.head()

In [None]:
# Sort the columns, such that they're in alphabetical order.
# sort_index(axis=1) orders the column MultiIndex directly; selecting with a
# sorted list of labels instead goes through label lookup, which would repeat
# columns if any label were duplicated.
py_df = py_df.sort_index(axis=1)

In [None]:
# Preview the frame with its columns in sorted order.
py_df.head()

In [None]:
# What are the 10 most popular IDEs used for editing Python?
# Keep only respondents who use Python, as their main or as a secondary language.
mask = py_df[('general', 'is.python.main')].isin(
    ['Yes', 'No, I use Python as a secondary language']
)

py_df.loc[mask, ('ide', 'main')].value_counts().head(10)

In [None]:
# Same ranking, restricted to respondents whose main language is Python.
mask = py_df[('general', 'is.python.main')].eq('Yes')

py_df.loc[mask, ('ide', 'main')].value_counts().head(10)

In [None]:
# Which 10 other programming languages are most commonly used by Python developers?
# Every column under 'other.lang' is ranked by its number of non-null cells,
# over all rows of the survey (no respondent filter here).
py_df['other.lang'].notna().sum().sort_values(ascending=False).head(10)

파이썬을 주요 또는 보조 언어로 사용하는 개발자들이 가장 많이 사용하는 다른 프로그래밍 언어 10가지는 다음과 같다.

In [None]:
# Which 10 other programming languages are most commonly used by Python developers?
# This time only rows of respondents who use Python (main or secondary) are tallied.
mask = py_df[('general', 'is.python.main')].isin(
    ['Yes', 'No, I use Python as a secondary language']
)

py_df.loc[mask, 'other.lang'].notna().sum().sort_values(ascending=False).head(10)

In [None]:
# What were the 10 most common countries from which survey participants came?
py_df.loc[:, ('general', 'country.live')].value_counts().head(10)

In [None]:
# According to the Python survey, what proportion of Python developers have each level of experience?
# normalize=True turns the per-level counts into fractions of the non-null answers.
py_df.loc[:, ('general', 'python.years')].value_counts(normalize=True)

In [None]:
# Which country has the greatest number of Python developers with 11+ years of experience?
# Filter the rows first, take the 'general' sub-frame, then count per country.
(
    py_df
    .loc[py_df[('general', 'python.years')].eq('11+ years'), 'general']
    .groupby('country.live')['python.years']
    .count()
    .sort_values(ascending=False)
    .head(1)
)

In [None]:
# Proportion of Python devs per country with 11+years experience
# Step 1: the two columns needed, and the number of respondents per country.
country_experience = py_df['general'].loc[:, ['country.live', 'python.years']]
all_per_country = country_experience.loc[:, 'country.live'].value_counts()

In [None]:
# Display the respondent count per country.
all_per_country

In [None]:
# Number of respondents per country whose 'python.years' answer is '11+ years'.
expert_per_country = (
    country_experience
    .loc[country_experience['python.years'].eq('11+ years'), 'country.live']
    .value_counts()
)

In [None]:
# Display the count of '11+ years' respondents per country.
expert_per_country

In [None]:
# Share of '11+ years' respondents per country, ten highest.
# The division aligns on country; countries absent from expert_per_country
# come out as NaN and are dropped.
(
    expert_per_country
    .div(all_per_country)
    .sort_values(ascending=False)
    .dropna()
    .head(10)
)

In [None]:
# Load the CSV file with results from the Stack Overflow survey into a data frame.
# The path is relative, so it resolves against the kernel's working directory.
so_filename = '../data/so_2021_survey_results.csv'

# low_memory=False has pandas parse the file in one pass instead of in chunks,
# so each column's dtype is inferred from all of its values at once.
so_df = pd.read_csv(so_filename, low_memory=False)

In [None]:
# Preview the first rows of the Stack Overflow survey frame.
so_df.head()

한국, 영국, 미국의 이름을 이후에 불러오는 OECD 데이터에서 사용된 국가명과 통일시킨다.

In [None]:
# Country spellings to rewrite, keyed by the spelling to replace.
# Values in 'Country' that are not keys here are left unchanged.
to_replace_dict = {
    'South Korea': 'Korea',
    'Republic of Korea': 'Korea',
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
}
so_df['Country'] = so_df['Country'].replace(to_replace_dict)

In [None]:
# Show the average salary for different types of employment.
# Contractors and freelancers like to say that they earn more than full-time employees.
# What does the data here show us?
#
# The means are sorted from highest to lowest, groups without a mean are
# dropped, and each number is rendered as text with thousands separators.
(
    so_df
    .groupby('Employment')['ConvertedCompYearly']
    .mean()
    .sort_values(ascending=False)
    .dropna()
    .map('{:,.2f}'.format)
)

In [None]:
# Display floats with thousands separators and two decimals.
# This is a global pandas display option, so it applies to every output
# rendered after this cell runs; the underlying values are not changed.
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Display the country, education level and yearly compensation columns.
so_df[['Country', 'EdLevel', 'ConvertedCompYearly']]

In [None]:
# Create a pivot table in which the index contains countries,
# the columns are education levels,
# and the cells contain the average salary for each education level per country.
# aggfunc='mean' is the pivot_table default, spelled out here for the reader.
so_df.pivot_table(
    values='ConvertedCompYearly',
    index='Country',
    columns='EdLevel',
    aggfunc='mean',
)

In [None]:
# Load the CSV file with OECD data.
# The file is read without a header row: its two columns are named here,
# and the second one ('Country') becomes the index.
oecd_filename = '../data/oecd_locations.csv'

oecd_df = pd.read_csv(
    oecd_filename,
    header=None,
    names=['abbrev', 'Country'],
    index_col='Country',
)

In [None]:
# Display the OECD lookup table (indexed by country, one 'abbrev' column).
oecd_df

In [None]:
# Display the survey re-indexed by country. The result is not assigned,
# so so_df itself keeps its original index.
so_df.set_index('Country')

In [None]:
# Display the join of the OECD table with the survey on the country index.
# DataFrame.join defaults to a left join, so every oecd_df country is kept.
oecd_df.join(so_df.set_index('Country'))

In [None]:
# Create this pivot table again, only including countries in our OECD subset.
# Joining from oecd_df keeps only the countries present in its index.
(
    oecd_df
    .join(so_df.set_index('Country'))
    .pivot_table(
        values='ConvertedCompYearly',
        index='Country',
        columns='EdLevel',
        aggfunc='mean',
    )
)

In [None]:
# Display the OECD/survey join once more (same expression as shown earlier).
oecd_df.join(so_df.set_index('Country'))

In [None]:
# In which of these countries does someone with an associate degree earn the most?
# Build the country x education pivot, keep the associate-degree column,
# and list it from highest to lowest average salary.
(
    oecd_df
    .join(so_df.set_index('Country'))
    .pivot_table(
        values='ConvertedCompYearly',
        index='Country',
        columns='EdLevel',
        aggfunc='mean',
    )
    .loc[:, 'Associate degree (A.A., A.S., etc.)']
    .sort_values(ascending=False)
)

In [None]:
# In which of them does someone with a doctoral degree earn the most?
# Build the country x education pivot, keep the doctoral-degree column,
# and list it from highest to lowest average salary.
(
    oecd_df
    .join(so_df.set_index('Country'))
    .pivot_table(
        values='ConvertedCompYearly',
        index='Country',
        columns='EdLevel',
        aggfunc='mean',
    )
    .loc[:, 'Other doctoral degree (Ph.D., Ed.D., etc.)']
    .sort_values(ascending=False)
)

In [None]:
# Remove rows from `so_df` in which `LanguageHaveWorkedWith` is `NaN`.
# Note: so_df is rebound here, so later cells see the reduced frame.
so_df = so_df.loc[so_df['LanguageHaveWorkedWith'].notna()]

In [None]:
# Remove rows from `so_df` in which Python isn't included
# as a commonly used language (`LanguageHaveWorkedWith`).
# regex=False matches 'Python' as a plain substring; na=False treats a missing
# answer as "no Python", so this cell also works when NaN rows are still present
# instead of failing on a mask that contains NaN.
so_df = so_df.loc[
    so_df['LanguageHaveWorkedWith'].str.contains('Python', regex=False, na=False)
]

In [None]:
# Remove rows from `so_df` in which `YearsCode` is `NaN`.
so_df = so_df.dropna(subset=['YearsCode'])

In [None]:
# Replace the two textual answers in 'YearsCode' with numeric stand-ins:
# 0 for under a year, 51 for anything beyond 50 years.
so_df.loc[so_df['YearsCode'].eq('Less than 1 year'), 'YearsCode'] = 0
so_df.loc[so_df['YearsCode'].eq('More than 50 years'), 'YearsCode'] = 51

In [None]:
# Convert 'YearsCode' to integers.
# NOTE(review): this presumably relies on the two text answers having been
# replaced with 0 and 51 beforehand, and on no missing values remaining —
# any value that cannot be parsed as an integer makes astype(int) raise.
so_df['YearsCode'] = so_df['YearsCode'].astype(int)

In [None]:
# Create a new column in `so_df`, called `experience`, which will categorize the values in `YearsCode`
# Bin edges are right-inclusive (the pd.cut default, made explicit here):
# (-1, 0], (0, 2], (2, 5], (5, 10], (10, 100].
so_df['experience'] = pd.cut(
    x=so_df['YearsCode'],
    bins=[-1, 0, 2, 5, 10, 100],
    right=True,
    labels=['Less than 1 year', '1-2 years', '3-5 years', '6-10 years', '11+ years'],
)

In [None]:
# According to the Stack Overflow survey (the rows remaining in `so_df`),
# what proportion of respondents fall into each `experience` category?
so_df['experience'].value_counts(normalize=True)

In [None]:
# Display the columns grouped under 'other.lang' in the Python survey frame.
py_df['other.lang']