In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Load the CSV file with results from the Python survey into a data frame.
py_filename = '../data/2020_sharing_data_outside.csv'

py_df = pd.read_csv(py_filename, low_memory=False)

In [4]:
# rename some columns

general_columns = ['age',
                   'are.you.datascientist',
                   'is.python.main',
                   'company.size',
                   'country.live',
                   'employment.status',
                   'first.learn.about.main.ide',
                   'how.often.use.main.ide',
                   'is.python.main',
                   'main.purposes'
                   'missing.features.main.ide'
                   'nps.main.ide',
                   'python.version.most',
                   'python.years',
                   'python2.version.most',
                   'python3.version.most',
                   'several.projects',
                   'team.size',
                   'use.python.most',
                   'years.of.coding'
                  ]

# Use the function `pd.MultiIndex.from_tuples` to create the multi-index, 
# and then reassign it back to `df.columns`. 

def column_multi_name(column_name):
    if column_name in general_columns:
        return ('general', column_name)
    else:
        first, rest = column_name.rsplit('.', 1)
        return (first, rest)
    
py_df.columns = pd.MultiIndex.from_tuples([column_multi_name(one_column_name)
                  for one_column_name in py_df.columns    ])

In [5]:
# Sort the columns, such that they're in alphabetical order. 

py_df = py_df[sorted(py_df.columns)]

In [6]:
py_df[('ide', 'main')].value_counts().head(10)

(ide, main)
VS Code                         8010
PyCharm Professional Edition    5144
PyCharm Community Edition       3815
Vim                             2176
Sublime Text                    1201
Jupyter Notebook                1167
Atom                             784
Other                            711
Emacs                            636
Spyder                           580
Name: count, dtype: int64

In [7]:
# Which 10 other programming languages are most commonly used by Python developers?

(
    py_df['other.lang']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

JavaScript      16662
HTML/CSS        15469
Bash / Shell    13793
SQL             13391
C/C++           11623
Java             8109
C#               4460
PHP              4060
TypeScript       3717
Other            3592
dtype: int64

In [8]:
# What were the 10 most common countries from which survey participants came?
(
    py_df[('general', 'country.live')]
    .value_counts()
    .head(10)
)

(general, country.live)
United States         3975
India                 2800
Germany               1807
China                 1155
United Kingdom        1110
France                1078
Russian Federation     935
Other country          880
Brazil                 812
Canada                 644
Name: count, dtype: int64

In [9]:
(
    py_df[('general', 'python.years')]
    .value_counts(normalize=True)
)

(general, python.years)
3–5 years           0.284272
Less than 1 year    0.239542
1–2 years           0.224834
6–10 years          0.154939
11+ years           0.096413
Name: proportion, dtype: float64

In [10]:
# Which country has the greatest number of Python developers with 11+ years of experience?
(
    py_df['general']
    [py_df[('general','python.years')] == '11+ years']
    .groupby('country.live')['python.years']
    .count()
    .sort_values(ascending=False)
    .head(1)
)

country.live
United States    691
Name: python.years, dtype: int64

In [11]:
# Proportion of Python devs per country with 11+years experience

country_experience = py_df['general'][['country.live', 'python.years']]
all_per_country = country_experience['country.live'].value_counts() 

In [12]:
expert_per_country = (
    country_experience.loc[
        country_experience['python.years'] == '11+ years',
        'country.live'
    ]
    .value_counts()
)

country.live
United States         3975
India                 2800
Germany               1807
China                 1155
United Kingdom        1110
                      ... 
Kazakhstan              36
Dominican Republic      34
Uruguay                 33
Armenia                 31
Uzbekistan              31
Name: count, Length: 76, dtype: int64

In [13]:
(expert_per_country / all_per_country).sort_values(ascending=False).dropna().head(10)


country.live
Norway            0.265432
Ireland           0.225490
Australia         0.225420
Belgium           0.225108
Slovenia          0.224490
New Zealand       0.197917
Sweden            0.194030
Finland           0.190141
United Kingdom    0.186486
Austria           0.186170
Name: count, dtype: float64

In [14]:
# Load the CSV file with results from the Stack Overflow survey into a data frame.
so_filename = '../data/so_2021_survey_results.csv'

so_df = pd.read_csv(so_filename, low_memory=False)

In [16]:
so_df.head(3)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",7.0,...,18-24 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,
2,3,"I am not primarily a developer, but I write co...","Student, full-time",Russian Federation,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",,...,18-24 years old,Man,No,Prefer not to say,Prefer not to say,None of the above,None of the above,Appropriate in length,Easy,


In [17]:
# Show the average salary for different types of employment. 
# Contractors and freelancers like to say that they earn more than full-time employees. 
# What does the data here show us?

(
    so_df
    .groupby('Employment')['ConvertedCompYearly'].mean()
    .sort_values(ascending=False)
    .dropna()
    .apply(lambda n: f'{n:,.2f}')
)

Employment
I prefer not to say                                     1,455,643.25
Employed full-time                                        121,369.67
Independent contractor, freelancer, or self-employed      107,433.97
Retired                                                    69,533.25
Employed part-time                                         41,136.12
Name: ConvertedCompYearly, dtype: object

In [18]:
# Create a pivot table in which the index contains countries, 
# the columns are education levels, 
# and the cells contain the average salary for each education level per country.

pd.options.display.float_format = '{:,.2f}'.format
(
    so_df # 原始数据
    .pivot_table(index='Country',   # 行索引：按国家分组
                 columns='EdLevel',  # 列索引：按教育水平分组
                 values='ConvertedCompYearly') # 值：转换后的年薪
)

EdLevel,"Associate degree (A.A., A.S., etc.)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Other doctoral degree (Ph.D., Ed.D., etc.)",Primary/elementary school,"Professional degree (JD, MD, etc.)","Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",Some college/university study without earning a degree,Something else
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,,30288.00,10176704.00,,,,100.00,,
Albania,,19152.86,80127.62,,,5298.00,19890.00,22884.00,128522.00
Algeria,,21770.67,15052.57,,,12912.00,,6288.00,
Andorra,,94045.50,22056.00,146981.00,,,,,
Angola,,31500.00,,,,,18678.00,6904.00,
...,...,...,...,...,...,...,...,...,...
"Venezuela, Bolivarian Republic of...",,30108.77,28680.00,,7200.00,14833.29,10200.00,17720.57,
Viet Nam,7827.00,18463.11,50599.80,2592.00,10479.00,30000.00,,18866.19,
Yemen,,5628.67,,,,,,,
Zambia,,40173.00,4908.00,,,,4482.00,12105.33,8184.00


用 pandas 的 pivot_table 创建一个透视表，并设置数字的显示格式为 带千位分隔符的两位小数（如 1,234.56

- 设置数字显示格式: `pd.options.display.float_format = '{:,.2f}'.format`

In [19]:

# Load the CSV file with OECD data
oecd_filename = '../data/oecd_locations.csv'

oecd_df = pd.read_csv(oecd_filename, header=None, index_col=1, names=['abbrev', 'Country'])

In [21]:
# Create this pivot table again, only including countries in our OECD subset.
(
    oecd_df
    .join(so_df.set_index('Country'))
    .pivot_table(index='Country',
                 columns='EdLevel', 
                 values='ConvertedCompYearly')
)

EdLevel,"Associate degree (A.A., A.S., etc.)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Other doctoral degree (Ph.D., Ed.D., etc.)",Primary/elementary school,"Professional degree (JD, MD, etc.)","Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",Some college/university study without earning a degree,Something else
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,117049.64,180794.16,106794.32,150234.96,153327.5,108725.0,158931.28,127063.53,231987.17
Austria,43623.38,66096.13,77645.66,74783.17,86877.0,38915.33,47438.83,53906.05,45772.53
Belgium,35664.0,68474.7,88580.41,80832.44,11342349.25,71000.2,43224.13,88006.29,27035.5
Brazil,25347.42,47681.23,42056.01,43123.21,7880.0,25449.67,15072.89,39978.02,20288.71
Canada,87930.35,140668.24,144733.35,102989.35,73787.83,82953.4,180584.81,155090.08,60795.17
Denmark,80217.33,80430.94,115751.85,102785.19,91056.57,114144.89,90411.74,85131.35,95479.86
Finland,282353.67,69381.34,74360.98,61508.25,77832.0,83016.0,59424.32,89255.56,35024.0
France,54394.89,65062.38,94585.73,140402.86,34181.0,78342.27,53981.81,55119.45,42884.04
Germany,98530.52,102751.23,110611.98,108718.46,74943.67,97330.14,82865.84,95737.77,109333.99
Hungary,51041.0,48756.45,46771.71,52833.6,48100.0,9852.0,33440.8,37102.89,28911.0


In [22]:
# In which of these countries does someone with an associate degree earn the most? 
(
    oecd_df
    .join(so_df
          .set_index('Country'))
    .pivot_table(index='Country',
                 columns='EdLevel', 
                 values='ConvertedCompYearly')['Associate degree (A.A., A.S., etc.)']
    .sort_values(ascending=False)
)

Country
Finland     282,353.67
Israel      146,420.90
Japan       143,196.83
Australia   117,049.64
Germany      98,530.52
Canada       87,930.35
Denmark      80,217.33
France       54,394.89
Hungary      51,041.00
Austria      43,623.38
Italy        36,427.94
Belgium      35,664.00
Brazil       25,347.42
Name: Associate degree (A.A., A.S., etc.), dtype: float64

In [23]:
# In which of them does someone with a doctoral degree earn the most?
# In which of these countries does someone with an associate degree earn the most? 
(
    oecd_df
    .join(so_df
          .set_index('Country'))
    .pivot_table(index='Country',
                 columns='EdLevel', 
                 values='ConvertedCompYearly')['Other doctoral degree (Ph.D., Ed.D., etc.)']
    .sort_values(ascending=False)
)

Country
Japan       157,239.40
Australia   150,234.96
France      140,402.86
Israel      131,812.62
Germany     108,718.46
Canada      102,989.35
Denmark     102,785.19
Italy        93,490.78
Belgium      80,832.44
Austria      74,783.17
Finland      61,508.25
Hungary      52,833.60
Brazil       43,123.21
Name: Other doctoral degree (Ph.D., Ed.D., etc.), dtype: float64

In [24]:
# Remove rows from `so_df` in which `LanguageHaveWorkedWith` is `NaN`.
so_df = (so_df
         .dropna(subset=['LanguageHaveWorkedWith'])
        )

In [25]:
# Remove rows from `so_df` in which Python isn't included 
# as a commonly used language (`LanguageHaveWorkedWith`).
so_df = (
    so_df
    .loc[so_df['LanguageHaveWorkedWith']
    .str.contains('Python')]
)

In [26]:
# Remove rows from `so_df` in which `YearsCode` is `NaN`.
so_df = (so_df
         .dropna(subset=['YearsCode'])
        )

In [28]:
so_df.loc[so_df['YearsCode'] == 'Less than 1 year', 'YearsCode'] = 0
so_df.loc[so_df['YearsCode'] == 'More than 50 years', 'YearsCode'] = 51

In [29]:
so_df['YearsCode'] = so_df['YearsCode'].astype(int)


In [30]:
# Create a new column in `so_df`, called `experience`, which will categorize the values in `YearsCode`

so_df['experience'] = pd.cut(so_df['YearsCode'],
       bins=[-1, 1, 2, 5, 10, 100],
      labels=['Less than 1 year', 
      '1-2 years', 
      '3-5 years',
      '6-10 years',
      '11+ years'])

In [31]:

# According to the Python survey, what proportion of Python developers have each level of experience?
so_df['experience'].value_counts(normalize=True)

experience
11+ years          0.37
6-10 years         0.32
3-5 years          0.22
1-2 years          0.05
Less than 1 year   0.04
Name: proportion, dtype: float64