# Setup

In [1]:
import pandas as pd
import numpy as np

import requests
from io import StringIO

import string

import utils_midproject as utils

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

%load_ext autoreload
%autoreload 2

# 01 Python survey

## 01 Loading the data

In [2]:
file_name = 'data/2020_sharing_data_outside.csv'

py_df = pd.read_csv(file_name, low_memory=False)

In [3]:
py_df.head()

Unnamed: 0,is.python.main,other.lang.None,other.lang.Java,other.lang.JavaScript,other.lang.C/C++,other.lang.PHP,other.lang.C#,other.lang.Ruby,other.lang.Bash / Shell,other.lang.Objective-C,...,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other,age,country.live
0,Yes,,,,,,,,Bash / Shell,,...,,,Business analyst,,,,,,30–39,
1,Yes,,Java,JavaScript,,,C#,,,,...,,,,,,,,,21–29,India
2,Yes,,,,C/C++,,,,Bash / Shell,,...,Technical support,Data analyst,,Team lead,,,,,30–39,United States
3,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
4,Yes,,Java,JavaScript,C/C++,,,,Bash / Shell,,...,,,,,,,,,21–29,Italy


In [5]:
py_df.shape

(54462, 264)

## 02 Turn the columns into a multi-index

In [11]:
miu = utils.MultiIndexUtils(df=py_df)
df_multi = miu.with_multi_index_columns()

In [12]:
df_multi.head()

Unnamed: 0_level_0,general,other.lang,other.lang,other.lang,other.lang,other.lang,other.lang,other.lang,other.lang,other.lang,...,job.role,job.role,job.role,job.role,job.role,job.role,job.role,job.role,general,general
Unnamed: 0_level_1,is.python.main,None,Java,JavaScript,C/C++,PHP,C#,Ruby,Bash / Shell,Objective-C,...,Technical support,Data analyst,Business analyst,Team lead,Product manager,CIO / CEO / CTO,Systems analyst,Other,age,country.live
0,Yes,,,,,,,,Bash / Shell,,...,,,Business analyst,,,,,,30–39,
1,Yes,,Java,JavaScript,,,C#,,,,...,,,,,,,,,21–29,India
2,Yes,,,,C/C++,,,,Bash / Shell,,...,Technical support,Data analyst,,Team lead,,,,,30–39,United States
3,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
4,Yes,,Java,JavaScript,C/C++,,,,Bash / Shell,,...,,,,,,,,,21–29,Italy


In [13]:
py_df.shape, df_multi.shape

((54462, 264), (54462, 264))

## 03 Sort the columns so they’re in alphabetical order

In [14]:
# SOrted columns
df_multi = df_multi.reindex(sorted(df_multi.columns), axis=1)

In [15]:
df_multi.head()

Unnamed: 0_level_0,bigdata,bigdata,bigdata,bigdata,bigdata,bigdata,bigdata,bigdata,bigdata,bigdata,...,web.frameworks,web.frameworks,web.frameworks,web.frameworks,web.frameworks,web.frameworks,web.frameworks,web.frameworks,web.frameworks,web.frameworks
Unnamed: 0_level_1,Apache Beam,Apache Flink,Apache Hadoop/MapReduce,Apache Hive,Apache Kafka,Apache Samza,Apache Spark,Apache Tez,ClickHouse,Dask,...,Django,Falcon,FastAPI,Flask,Hug,None,Other,Pyramid,Tornado,web2py
0,Apache Beam,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,Tornado,
2,,,,,,,,,,,...,,,,Flask,,,,,,
3,,,,,,,,,,,...,Django,,,,,,,,,
4,,,,,,,,,,,...,,,,Flask,,,,,,


## 04 Answer these questions:

### What are the 10 most popular Python IDEs?

In [23]:
for first, second in df_multi.columns:
    if 'ide' in first.lower():
        print(f'{first} - {second}')

ide - main
ide.editor - Atom
ide.editor - Eclipse + Pydev
ide.editor - Emacs
ide.editor - IDLE
ide.editor - IntelliJ IDEA
ide.editor - Jupyter Notebook
ide.editor - JupyterLab
ide.editor - None
ide.editor - NotePad++
ide.editor - Other
ide.editor - PyCharm Community Edition
ide.editor - PyCharm Professional Edition
ide.editor - Python Tools for Visual Studio (PTVS)
ide.editor - Spyder
ide.editor - Sublime Text
ide.editor - VS Code
ide.editor - Vim


In [25]:
df_multi[('ide', 'main')]

0           PyCharm Community Edition
1                             VS Code
2                                 Vim
3        PyCharm Professional Edition
4                             VS Code
                     ...             
54457                             Vim
54458                             NaN
54459    PyCharm Professional Edition
54460                           Other
54461                         VS Code
Name: (ide, main), Length: 54462, dtype: object

In [26]:
df_multi[('ide.editor', 'VS Code')]

0            NaN
1            NaN
2        VS Code
3            NaN
4            NaN
          ...   
54457        NaN
54458        NaN
54459    VS Code
54460        NaN
54461        NaN
Name: (ide.editor, VS Code), Length: 54462, dtype: object

In [27]:
df_multi[('ide', 'main')].value_counts().head(10)

(ide, main)
VS Code                         8010
PyCharm Professional Edition    5144
PyCharm Community Edition       3815
Vim                             2176
Sublime Text                    1201
Jupyter Notebook                1167
Atom                             784
Other                            711
Emacs                            636
Spyder                           580
Name: count, dtype: int64

### Which 10 other programming languages (`other.lang`) are most commonly used by Python developers?

In [29]:
for first, second in df_multi.columns:
    if 'other.lang' in first.lower():
        print(f'{first} - {second}')

other.lang - Bash / Shell
other.lang - C#
other.lang - C/C++
other.lang - Clojure
other.lang - CoffeeScript
other.lang - Go
other.lang - Groovy
other.lang - HTML/CSS
other.lang - Java
other.lang - JavaScript
other.lang - Kotlin
other.lang - None
other.lang - Objective-C
other.lang - Other
other.lang - PHP
other.lang - Perl
other.lang - R
other.lang - Ruby
other.lang - Rust
other.lang - SQL
other.lang - Scala
other.lang - Swift
other.lang - TypeScript
other.lang - Visual Basic


In [32]:
df_multi['other.lang'].count().sort_values(ascending=False).head(10)

JavaScript      16662
HTML/CSS        15469
Bash / Shell    13793
SQL             13391
C/C++           11623
Java             8109
C#               4460
PHP              4060
TypeScript       3717
Other            3592
dtype: int64

### What were the 10 most common countries from which survey participants came?

In [34]:
df_multi[('general', 'country.live')].value_counts().head(10)

(general, country.live)
United States         3975
India                 2800
Germany               1807
China                 1155
United Kingdom        1110
France                1078
Russian Federation     935
Other country          880
Brazil                 812
Canada                 644
Name: count, dtype: int64

### According to the Python survey, what proportion of Python developers have each level of experience?

In [35]:
df_multi[('general', 'python.years')].value_counts().head(10)

(general, python.years)
3–5 years           10302
Less than 1 year     8681
1–2 years            8148
6–10 years           5615
11+ years            3494
Name: count, dtype: int64

### Which country has the greatest number of Python developers with 11+ years of experience?

In [37]:
# Which country has the greatest number of Python developers with 11+ years of experience?
mask = df_multi[('general', 'python.years')] == '11+ years'
df_multi[mask][('general', 'country.live')].value_counts().head(10)

(general, country.live)
United States     691
Germany           308
United Kingdom    207
France            166
Australia          94
Canada             94
Netherlands        78
Italy              71
Spain              67
Other country      63
Name: count, dtype: int64

### Which country has the greatest proportion of Python developers with 11+ years of experience?

In [38]:
# Which country has the greatest proportion of Python developers with 11+ years of experience?
country_experience = df_multi['general'][['country.live', 'python.years']]

In [39]:
country_experience

Unnamed: 0,country.live,python.years
0,,3–5 years
1,India,3–5 years
2,United States,3–5 years
3,,11+ years
4,Italy,1–2 years
...,...,...
54457,Russian Federation,6–10 years
54458,,3–5 years
54459,Russian Federation,3–5 years
54460,Spain,6–10 years


In [40]:
all_per_country = country_experience['country.live'].value_counts() 

In [41]:
all_per_country

country.live
United States         3975
India                 2800
Germany               1807
China                 1155
United Kingdom        1110
                      ... 
Kazakhstan              36
Dominican Republic      34
Uruguay                 33
Armenia                 31
Uzbekistan              31
Name: count, Length: 76, dtype: int64

In [42]:
expert_per_country = country_experience[mask]['country.live'].value_counts()

In [43]:
expert_per_country

country.live
United States     691
Germany           308
United Kingdom    207
France            166
Australia          94
                 ... 
Viet Nam            1
Tunisia             1
Pakistan            1
Bangladesh          1
Egypt               1
Name: count, Length: 70, dtype: int64

In [44]:
expert_per_country / all_per_country

country.live
Algeria          0.068966
Argentina        0.092308
Armenia               NaN
Australia        0.225420
Austria          0.186170
                   ...   
United States    0.173836
Uruguay          0.060606
Uzbekistan            NaN
Venezuela        0.045455
Viet Nam         0.013158
Name: count, Length: 76, dtype: float64

# 02 Stack Overflow data

## 05 Load the Stack Overflow data

In [2]:
file_name_so = 'data/so_2021_survey_results.csv'
df_so = pd.read_csv(file_name_so, low_memory=False)

In [3]:
df_so.shape

(83439, 48)

In [4]:
df_so.head()

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Slovakia,,,"Secondary school (e.g. American high school, G...",18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,62268.0
1,2,I am a student who is learning to code,"Student, full-time",Netherlands,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",7.0,...,18-24 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,None of the above,Appropriate in length,Easy,
2,3,"I am not primarily a developer, but I write co...","Student, full-time",Russian Federation,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,"Other online resources (ex: videos, blogs, etc...",,...,18-24 years old,Man,No,Prefer not to say,Prefer not to say,None of the above,None of the above,Appropriate in length,Easy,
3,4,I am a developer by profession,Employed full-time,Austria,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,,,...,35-44 years old,Man,No,Straight / Heterosexual,White or of European descent,I am deaf / hard of hearing,,Appropriate in length,Neither easy nor difficult,
4,5,I am a developer by profession,"Independent contractor, freelancer, or self-em...",United Kingdom of Great Britain and Northern I...,,England,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5 - 10 years,Friend or family member,17.0,...,25-34 years old,Man,No,,White or of European descent,None of the above,,Appropriate in length,Easy,


## 06 Average salary for different types of employment

In [62]:
df_so['Employment'].value_counts()

Employment
Employed full-time                                      53584
Student, full-time                                      11781
Independent contractor, freelancer, or self-employed     8041
Not employed, but looking for work                       2961
Employed part-time                                       2461
Student, part-time                                       2051
Not employed, and not looking for work                   1228
I prefer not to say                                       890
Retired                                                   326
Name: count, dtype: int64

In [61]:
df_so['ConvertedCompYearly']

0         62268.0
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
83434    160500.0
83435      3960.0
83436     90000.0
83437    816816.0
83438     21168.0
Name: ConvertedCompYearly, Length: 83439, dtype: float64

In [None]:
# Contractors and
# freelancers like to say that they earn more than full-time employees. What does
# the data here show you?
salary_column = 'ConvertedCompYearly'
employment_column = 'Employment'
(
    df_so[salary_column]
        .dropna()
        .groupby(df_so[employment_column])
        .mean()
        .sort_values(ascending=False)
        .apply(lambda n: f'{n:,.0f}')
)

Employment
I prefer not to say                                     1,455,643
Employed full-time                                        121,370
Independent contractor, freelancer, or self-employed      107,434
Retired                                                    69,533
Employed part-time                                         41,136
Name: ConvertedCompYearly, dtype: object

## 07 Pivot table

In [5]:
df_so.columns

Index(['ResponseId', 'MainBranch', 'Employment', 'Country', 'US_State',
       'UK_Country', 'EdLevel', 'Age1stCode', 'LearnCode', 'YearsCode',
       'YearsCodePro', 'DevType', 'OrgSize', 'Currency', 'CompTotal',
       'CompFreq', 'LanguageHaveWorkedWith', 'LanguageWantToWorkWith',
       'DatabaseHaveWorkedWith', 'DatabaseWantToWorkWith',
       'PlatformHaveWorkedWith', 'PlatformWantToWorkWith',
       'WebframeHaveWorkedWith', 'WebframeWantToWorkWith',
       'MiscTechHaveWorkedWith', 'MiscTechWantToWorkWith',
       'ToolsTechHaveWorkedWith', 'ToolsTechWantToWorkWith',
       'NEWCollabToolsHaveWorkedWith', 'NEWCollabToolsWantToWorkWith', 'OpSys',
       'NEWStuck', 'NEWSOSites', 'SOVisitFreq', 'SOAccount', 'SOPartFreq',
       'SOComm', 'NEWOtherComms', 'Age', 'Gender', 'Trans', 'Sexuality',
       'Ethnicity', 'Accessibility', 'MentalHealth', 'SurveyLength',
       'SurveyEase', 'ConvertedCompYearly'],
      dtype='object')

In [8]:
# Create a pivot table in which the index contains countries, the columns are edu-
# cation levels, and the cells contain the average salary for each education level
# per country.
country_column = 'Country'
edlevel_column = 'EdLevel'
salary_column = 'ConvertedCompYearly'

pd.options.display.float_format = '{:,.0f}'.format

df_so.pivot_table(
    index=country_column,
    columns=edlevel_column,
    values=salary_column,
    aggfunc='mean'
)

EdLevel,"Associate degree (A.A., A.S., etc.)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Other doctoral degree (Ph.D., Ed.D., etc.)",Primary/elementary school,"Professional degree (JD, MD, etc.)","Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",Some college/university study without earning a degree,Something else
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,,30288,10176704,,,,100,,
Albania,,19153,80128,,,5298,19890,22884,128522
Algeria,,21771,15053,,,12912,,6288,
Andorra,,94046,22056,146981,,,,,
Angola,,31500,,,,,18678,6904,
...,...,...,...,...,...,...,...,...,...
"Venezuela, Bolivarian Republic of...",,30109,28680,,7200,14833,10200,17721,
Viet Nam,7827,18463,50600,2592,10479,30000,,18866,
Yemen,,5629,,,,,,,
Zambia,,40173,4908,,,,4482,12105,8184


## 08 Pivot table 2

In [33]:
oecd_filename = 'data/oecd_locations.csv'
df_oecd = pd.read_csv(oecd_filename, header=None, names=['country_code', 'country'])

In [34]:
df_oecd

Unnamed: 0,country_code,country
0,AUS,Australia
1,AUT,Austria
2,BEL,Belgium
3,CAN,Canada
4,DNK,Denmark
5,FIN,Finland
6,FRA,France
7,DEU,Germany
8,HUN,Hungary
9,ITA,Italy


In [21]:
df_oecd.shape

(16, 2)

In [22]:
df_so[country_column].nunique(), df_so[country_column].unique()[:10]

(181,
 array(['Slovakia', 'Netherlands', 'Russian Federation', 'Austria',
        'United Kingdom of Great Britain and Northern Ireland',
        'United States of America', 'Malaysia', 'India', 'Sweden', 'Spain'],
       dtype=object))

In [32]:
oecd = utils.OECDUtils(df_oecd=df_oecd, df_so=df_so)
oecd.oecd_countries_list, len(oecd.oecd_countries_list)

(['Australia',
  'Austria',
  'Belgium',
  'Canada',
  'Denmark',
  'Finland',
  'France',
  'Germany',
  'Hungary',
  'Italy',
  'Japan',
  'South Korea',
  'Republic of Korea',
  'United Kingdom of Great Britain and Northern Ireland',
  'United States of America',
  'Brazil',
  'Israel'],
 17)

In [36]:
# Create this pivot table again, only including countries in the OECD subset
country_column = 'Country'
edlevel_column = 'EdLevel'
salary_column = 'ConvertedCompYearly'

# Create a mask for OECD countries
mask_oecd = df_so[country_column].isin(oecd.oecd_countries_list)
# Create a new DataFrame with only OECD countries
df_so_oecd = df_so[mask_oecd]

pivot_table_oecd = df_so_oecd.pivot_table(
    index=country_column,
    columns=edlevel_column,
    values=salary_column,
    aggfunc='mean'
)

pivot_table_oecd.head()

EdLevel,"Associate degree (A.A., A.S., etc.)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Other doctoral degree (Ph.D., Ed.D., etc.)",Primary/elementary school,"Professional degree (JD, MD, etc.)","Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",Some college/university study without earning a degree,Something else
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,117050,180794,106794,150235,153328,108725,158931,127064,231987
Austria,43623,66096,77646,74783,86877,38915,47439,53906,45773
Belgium,35664,68475,88580,80832,11342349,71000,43224,88006,27036
Brazil,25347,47681,42056,43123,7880,25450,15073,39978,20289
Canada,87930,140668,144733,102989,73788,82953,180585,155090,60795


In [38]:
# In which of these countries does someone with an associate’s degree earn the most?
associate_degree_column = None
for column in df_so[edlevel_column]:
    if 'associate' in column.lower():
        print(f'Associate degree column: {column}')
        associate_degree_column = column
        break

if associate_degree_column:
    associate_degree_salaries = pivot_table_oecd[associate_degree_column].dropna()
    max_salary_country = associate_degree_salaries.idxmax()
    max_salary_value = associate_degree_salaries.max()
    print(f'Country with highest salary for associate degree: {max_salary_country} - {max_salary_value:,.0f}')
else:
    print('Associate degree column not found in the pivot table.')

Associate degree column: Associate degree (A.A., A.S., etc.)
Country with highest salary for associate degree: Finland - 282,354


In [39]:
# Extract the salaries for the associate degree column per country and sort them
pivot_table_oecd[associate_degree_column].dropna().sort_values(ascending=False).head(10)

Country
Finland                                                282,354
United States of America                               206,568
Israel                                                 146,421
Japan                                                  143,197
United Kingdom of Great Britain and Northern Ireland   139,155
Australia                                              117,050
Germany                                                 98,531
Canada                                                  87,930
Denmark                                                 80,217
France                                                  54,395
Name: Associate degree (A.A., A.S., etc.), dtype: float64

In [40]:
# In which of them does someone with a doctoral degree earn the most?
doctoral_degree_column = None
for column in df_so[edlevel_column]:
    if 'doctoral' in column.lower():
        print(f'Doctoral degree column: {column}')
        doctoral_degree_column = column
        break

doctoral_degree_column

Doctoral degree column: Other doctoral degree (Ph.D., Ed.D., etc.)


'Other doctoral degree (Ph.D., Ed.D., etc.)'

In [41]:
pivot_table_oecd[doctoral_degree_column].dropna().sort_values(ascending=False).head(10)

Country
United States of America                               208,657
Japan                                                  157,239
Australia                                              150,235
France                                                 140,403
Israel                                                 131,813
United Kingdom of Great Britain and Northern Ireland   123,226
Germany                                                108,718
Canada                                                 102,989
Denmark                                                102,785
Italy                                                   93,491
Name: Other doctoral degree (Ph.D., Ed.D., etc.), dtype: float64

## 09 Remove rows from `so_df` in which `LanguageHaveWorkedWith` is `NaN`

In [44]:
column_language = 'LanguageHaveWorkedWith'
column_language in df_so.columns

True

In [46]:
# Remove rows from `df_so` in which `LanguageHaveWorkedWith` is `NaN`
df_so_landuage_clean = df_so.dropna(subset=[column_language])

In [47]:
# Check that the column is clean
df_so_landuage_clean[column_language].isna().sum()

np.int64(0)

## 10 Remove rows from `df_so` in which `Python` isn’t included as a commonly used language (`LanguageHaveWorkedWith`)

In [48]:
column_language2 = 'LanguageHaveWorkedWith'
column_language2 in df_so.columns

True

In [50]:
# Check the values in the `LanguageHaveWorkedWith` column
df_so_landuage_clean[column_language2].value_counts().head(20)

LanguageHaveWorkedWith
Python                                    1337
HTML/CSS;JavaScript;Node.js;TypeScript     811
Java                                       715
HTML/CSS;JavaScript;PHP;SQL                674
C#                                         597
C#;HTML/CSS;JavaScript;SQL                 558
HTML/CSS;JavaScript                        531
HTML/CSS;JavaScript;Node.js                494
HTML/CSS;JavaScript;TypeScript             491
HTML/CSS;JavaScript;Python                 432
HTML/CSS;JavaScript;PHP                    417
Python;SQL                                 416
C#;HTML/CSS;JavaScript;SQL;TypeScript      405
C++;Python                                 348
C#;SQL                                     347
Java;Kotlin                                299
Java;Python                                290
HTML/CSS;JavaScript;Node.js;PHP;SQL        280
C++                                        277
JavaScript                                 276
Name: count, dtype: int64

In [51]:
# Remove rows from `df_so` in which `Python` isn’t included as a commonly used language (`LanguageHaveWorkedWith`)

# Drop nans from the `LanguageHaveWorkedWith` column    
df_so_language2_clean = df_so.dropna(subset=[column_language2])

# Create a mask for rows where 'Python' is mentioned in the `LanguageHaveWorkedWith` column
mask_python = df_so_language2_clean[column_language2].str.lower().str.contains('python')

# Filter the DataFrame using the mask
# and create a new DataFrame with only the rows where 'Python' is mentioned
df_so_python = df_so_language2_clean[mask_python]

In [54]:
df_so_python[column_language2].value_counts().head(20)

LanguageHaveWorkedWith
Python                                               1337
HTML/CSS;JavaScript;Python                            432
Python;SQL                                            416
C++;Python                                            348
Java;Python                                           290
Bash/Shell;Python                                     274
HTML/CSS;JavaScript;Node.js;Python                    235
C;C++;Python                                          226
HTML/CSS;JavaScript;Python;SQL                        223
HTML/CSS;Python                                       215
HTML/CSS;JavaScript;Node.js;Python;TypeScript         210
JavaScript;Python                                     200
Bash/Shell;Python;SQL                                 169
C#;Python                                             157
C;Python                                              157
Python;R                                              154
HTML/CSS;JavaScript;Node.js;Python;SQL;TypeScript

In [55]:
df_so_python.shape

(39792, 48)

## 11 Remove rows from `df_so` in which `YearsCode` is `NaN`

In [66]:
column_years_code = 'YearsCode'
column_years_code in df_so.columns

True

In [67]:
df_so_years_code_clean = df_so.dropna(subset=[column_years_code])

In [68]:
df_so.shape, df_so_years_code_clean.shape

((83439, 48), (81641, 48))

## 12 Replace the string value Less than 1 year in YearsCode with 0. Replace the string value More than 50 years with 51.

In [73]:
df_so_years_code_clean[column_years_code].sort_values(ascending=True).unique()

array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
       '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
       '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39',
       '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49',
       '5', '50', '6', '7', '8', '9', 'Less than 1 year',
       'More than 50 years'], dtype=object)

In [74]:
# Replace the string value `Less than 1 year`` in YearsCode with 0
column_less_than_1_year = 'Less than 1 year'
df_so_years_code_clean.loc[:, column_years_code] = df_so_years_code_clean.loc[:, column_years_code].replace(
    to_replace=column_less_than_1_year,
    value='0'
)

In [75]:
df_so_years_code_clean[column_years_code].sort_values(ascending=True).unique()

array(['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28',
       '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38',
       '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49', '5', '50', '6', '7', '8', '9', 'More than 50 years'],
      dtype=object)

In [76]:
# Replace the string value More than 50 years with 51.
column_more_than_50_years = 'More than 50 years'
df_so_years_code_clean.loc[:, column_years_code] = df_so_years_code_clean.loc[:, column_years_code].replace(
    to_replace=column_more_than_50_years,
    value='51'
)

In [77]:
df_so_years_code_clean[column_years_code].sort_values(ascending=True).unique()

array(['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28',
       '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38',
       '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49', '5', '50', '51', '6', '7', '8', '9'], dtype=object)

## 13 Turn YearsCode into an integer column.

In [79]:
# Turn YearsCode into an integer column.
df_so_years_code_clean.loc[:, column_years_code] = df_so_years_code_clean.loc[:, column_years_code].astype(int)

In [None]:
df_so_years_code_clean.loc[:, column_years_code] = df_so_years_code_clean.loc[:, column_years_code].astype(int)

In [82]:
df_so_years_code_clean.loc[:, column_years_code].dtype

dtype('int64')

## 14 Create a new column in `df_so`, called experience, which will categorize the values in the `YearsCode` 

In [83]:
# Values can be
# – Less than 1 year
# – 1–2 years
# – 3–5 years
# – 6–10 years
# – 11+ years

utils.categorize_experience(2)

'1–2 years'

In [85]:
# Create a new column in `df_so`, called experience, which will categorize the values in the `YearsCode`
column_experience = 'experience'
df_so_years_code_clean.loc[:, column_experience] = df_so_years_code_clean[column_years_code].apply(
    utils.categorize_experience
)

In [86]:
df_so_years_code_clean[column_experience].value_counts()

experience
11+ years           35132
6–10 years          24295
3–5 years           15688
1–2 years            5037
Less than 1 year     1489
Name: count, dtype: int64

## 15 According to the Stack Overflow survey, what proportion of Python developers have each level of experience?

In [88]:
pd.options.display.float_format = '{:,.4f}'.format
df_so_years_code_clean[column_experience].value_counts(normalize=True)

experience
11+ years          0.4303
6–10 years         0.2976
3–5 years          0.1922
1–2 years          0.0617
Less than 1 year   0.0182
Name: proportion, dtype: float64

This was the midway project...