## Filtering - Using Conditionals to Filter Rows and Columns

In [1]:
import pandas as pd
import numpy as np

以下為 Python dictionary 各種寫法

In [2]:
# A single record as a plain dict: one scalar value per key.
person = {
    "first": "Corey",
    "last": "Schafer",
    "email": "CoreySchafer@gmail.com",
}

In [3]:
# The same record in column-oriented form: every key maps to a list of values
# (here each list holds exactly one entry).
people = {
    "first": ["Corey"],
    "last": ["Schafer"],
    "email": ["CoreySchafer@gmail.com"],
}

In [4]:
# Column-oriented dict with three records: each key maps to a list holding
# one value per person.
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Doe"],
    "email": [
        "CoreySchafer@gmail.com",
        "JaneDoe@gmail.com",
        "JohnDoe@gmail.com",
    ],
}

In [5]:
# Look up the 'email' key of the dict; the value is the list of addresses.
people['email']

['CoreySchafer@gmail.com', 'JaneDoe@gmail.com', 'JohnDoe@gmail.com']

In [7]:
# pandas can build a DataFrame directly from a dict: each key becomes a column.
df = pd.DataFrame(people)

In [8]:
# Display the DataFrame built from the dict.
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [9]:
# Use a boolean comparison as the search condition. The result is a Series of
# True/False values, one per row index, marking which rows match.
df['last'] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

In [10]:
# Store the condition (last name equals 'Doe') in a variable so it can be
# reused as a row mask.
filt = (df['last'] == 'Doe')

In [11]:
# Index the DataFrame with the mask to get the complete rows where the
# condition is True.
df[filt]

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [12]:
# Passing the same mask to .loc gives the same result.
df.loc[filt]

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [17]:
# An advantage of .loc: a column label can be given together with the row mask.
df.loc[filt, 'email']

1    JaneDoe@gmail.com
2    JohnDoe@gmail.com
Name: email, dtype: object

In [18]:
# Conditions can be combined with logical operators. Note that pandas uses the
# element-wise operators &, | and ~ here, which differ from Python's
# and / or / not. Each comparison is parenthesised because & binds more
# tightly than ==.
# & means AND.
filt = (df['last'] == 'Doe') & (df['first'] == 'Jane')

In [19]:
# Select the 'email' column for the rows where the current mask is True.
df.loc[filt, 'email']

1    JaneDoe@gmail.com
Name: email, dtype: object

In [20]:
# | means OR: a row matches when either condition is True.
filt = (df['last'] == 'Schafer') | (df['first'] == 'Jane')

In [21]:
# Select the 'email' column for the rows where the current mask is True.
df.loc[filt, 'email']

0    CoreySchafer@gmail.com
1         JaneDoe@gmail.com
Name: email, dtype: object

In [22]:
# ~ negates the mask, selecting the rows where the condition is False.
df.loc[~filt, 'email']

2    JohnDoe@gmail.com
Name: email, dtype: object

In [23]:
# Load the 2020 developer survey results and the accompanying schema file.
# Both paths are relative to the notebook's working directory.
public_df = pd.read_csv('../pandas_dataset/developer_survey_2020/survey_results_public.csv')
schema_df = pd.read_csv('../pandas_dataset/developer_survey_2020/survey_results_schema.csv')

In [24]:
# Raise the display limits so up to 85 columns and 85 rows are shown when a
# DataFrame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 85)

In [25]:
# Preview the first few rows of the survey results.
public_df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment,Ethnicity,Gender,JobFactors,JobSat,JobSeek,LanguageDesireNextYear,LanguageWorkedWith,MiscTechDesireNextYear,MiscTechWorkedWith,NEWCollabToolsDesireNextYear,NEWCollabToolsWorkedWith,NEWDevOps,NEWDevOpsImpt,NEWEdImpt,NEWJobHunt,NEWJobHuntResearch,NEWLearn,NEWOffTopic,NEWOnboardGood,NEWOtherComms,NEWOvertime,NEWPurchaseResearch,NEWPurpleLink,NEWSOSites,NEWStuck,OpSys,OrgSize,PlatformDesireNextYear,PlatformWorkedWith,PurchaseWhat,Sexuality,SOAccount,SOComm,SOPartFreq,SOVisitFreq,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em...",White or of European descent,Man,"Languages, frameworks, and other technologies ...",Slightly satisfied,I am not interested in new job opportunities,C#;HTML/CSS;JavaScript,C#;HTML/CSS;JavaScript,.NET Core;Xamarin,.NET;.NET Core,Microsoft Teams;Microsoft Azure;Trello,Confluence;Jira;Slack;Microsoft Azure;Trello,No,Somewhat important,Fairly important,,,Once a year,Not sure,,No,Often: 1-2 days per week or more,Start a free trial;Ask developers I know/work ...,Amused,Stack Overflow (public Q&A for anyone who codes),Visit Stack Overflow;Go for a walk or other ph...,Windows,2 to 9 employees,Android;iOS;Kubernetes;Microsoft Azure;Windows,Windows,,Straight / Heterosexual,No,"No, not at all",,Multiple times per day,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,,,,Very dissatisfied,I am not interested in new job opportunities,Python;Swift,JavaScript;Swift,React Native;TensorFlow;Unity 3D,React Native,Github;Slack,Confluence;Jira;Github;Gitlab;Slack,,,Fairly important,,,Once a year,Not sure,,No,,,Amused,Stack Overflow (public Q&A for anyone who code...,Visit Stack Overflow;Go for a walk or other ph...,MacOS,"1,000 to 4,999 employees",iOS;Kubernetes;Linux;MacOS,iOS,I have little or no influence,,Yes,"Yes, definitely",Less than once per month or monthly,Multiple times per day,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,,,,,,,,,,,,Objective-C;Python;Swift,Objective-C;Python;Swift,,,,,,,,,,Once a decade,,,No,,,,Stack Overflow (public Q&A for anyone who codes),,Linux-based,,,,,,Yes,"Yes, somewhat",A few times per month or weekly,Daily or almost daily,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,ALL,,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",,White or of European descent,Man,Flex time or a flexible schedule;Office enviro...,Slightly dissatisfied,"I’m not actively looking, but I am open to new...",,,,,,,No,,Not at all important/not necessary,Curious about other opportunities;Wanting to w...,,Once a year,Not sure,Yes,Yes,Occasionally: 1-2 days per quarter but less th...,,,Stack Overflow (public Q&A for anyone who code...,,Linux-based,20 to 99 employees,,,I have a great deal of influence,Straight / Heterosexual,Yes,"Yes, definitely",A few times per month or weekly,Multiple times per day,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,,MySQL;PostgreSQL,MySQL;PostgreSQL;Redis;SQLite,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,White or of European descent,Man,,,,Java;Ruby;Scala,HTML/CSS;Ruby;SQL,Ansible;Chef,Ansible,"Github;Google Suite (Docs, Meet, etc)",Confluence;Jira;Github;Slack;Google Suite (Doc...,,,Very important,,,Once a year,No,,Yes,,Start a free trial;Ask developers I know/work ...,"Hello, old friend",Stack Overflow (public Q&A for anyone who code...,Call a coworker or friend;Visit Stack Overflow...,Windows,,Docker;Google Cloud Platform;Heroku;Linux;Windows,AWS;Docker;Linux;MacOS;Windows,,Straight / Heterosexual,Yes,"Yes, somewhat",Less than once per month or monthly,A few times per month or weekly,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [28]:
# Same kind of condition as used earlier: True for rows whose ConvertedComp
# is greater than 70000.
high_salary = (public_df['ConvertedComp'] > 70000)

In [27]:
# .loc accepts a row condition and a list of column labels at the same time.
public_df.loc[high_salary, ['Country', 'LanguageWorkedWith', 'ConvertedComp']]

Unnamed: 0,Country,LanguageWorkedWith,ConvertedComp
7,United States,Python;SQL,116000.0
15,United Kingdom,Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...,108576.0
16,United States,C#;HTML/CSS;JavaScript;Python;SQL;VBA,79000.0
17,United States,Bash/Shell/PowerShell;HTML/CSS;Perl,1260000.0
18,United States,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;S...,83400.0
...,...,...,...
64113,United States,,225000.0
64116,United States,,150000.0
64127,United States,,140000.0
64129,United States,,150000.0


In [31]:
# Country names to keep when filtering the survey by the 'Country' column.
countries = [
    'United States',
    'United Kingdom',
    'India',
    'Germany',
    'Canada',
]

In [32]:
# Another way to express a condition: isin() is True where the column value
# is one of the entries in the given list.
filt = public_df['Country'].isin(countries)

In [33]:
# Show Country and LanguageWorkedWith for the rows selected by the mask.
public_df.loc[filt, ['Country', 'LanguageWorkedWith']]

Unnamed: 0,Country,LanguageWorkedWith
0,Germany,C#;HTML/CSS;JavaScript
1,United Kingdom,JavaScript;Swift
4,United States,HTML/CSS;Ruby;SQL
5,Germany,HTML/CSS;Java;JavaScript
6,India,C#;HTML/CSS;PHP
...,...,...
64441,Canada,C;C#;C++;Java
64442,India,SQL
64443,United States,C++;HTML/CSS;Java;JavaScript;Python;SQL
64452,India,


In [39]:
# Another way to express a condition: search for a substring in a text column.
# Caution: without na=False, missing values stay NaN in the result, and using
# it with .loc later fails with
# "Cannot mask with non-boolean array containing NA / NaN values".
filt = public_df['LanguageWorkedWith'].str.contains('Python')

In [40]:
# A mask passed to .loc may only contain True/False, so the NaN entries in
# this result have to be dealt with first.
filt

0        False
1        False
2         True
3          NaN
4        False
         ...  
64456      NaN
64457     True
64458      NaN
64459    False
64460    False
Name: LanguageWorkedWith, Length: 64461, dtype: object

In [41]:
# Try isin() with a one-element list instead. isin() compares the whole cell
# value against the list, so multi-language strings such as
# 'Objective-C;Python;Swift' do not match 'Python'.
python = ["Python"]
filt = public_df['LanguageWorkedWith'].isin(python)

In [42]:
# Inspect the isin() mask: the dtype is bool.
filt

0        False
1        False
2        False
3        False
4        False
         ...  
64456    False
64457    False
64458    False
64459    False
64460    False
Name: LanguageWorkedWith, Length: 64461, dtype: bool

In [43]:
# Correct form when the column contains NaN: na=False turns missing values
# into False, so the result is a pure boolean mask.
filt = public_df['LanguageWorkedWith'].str.contains('Python', na=False)

In [44]:
# Inspect the mask: entries that were NaN are now False and the dtype is bool.
filt

0        False
1        False
2         True
3        False
4        False
         ...  
64456    False
64457     True
64458    False
64459    False
64460    False
Name: LanguageWorkedWith, Length: 64461, dtype: bool

In [45]:
# Show the LanguageWorkedWith values of the rows whose text contains 'Python'.
public_df.loc[filt, 'LanguageWorkedWith']

2                                 Objective-C;Python;Swift
7                                               Python;SQL
9                      HTML/CSS;Java;JavaScript;Python;SQL
12                                     C;JavaScript;Python
14        Bash/Shell/PowerShell;C;HTML/CSS;Java;Python;SQL
                               ...                        
64433    Bash/Shell/PowerShell;HTML/CSS;JavaScript;Perl...
64438       C++;HTML/CSS;JavaScript;Python;Ruby;TypeScript
64443              C++;HTML/CSS;Java;JavaScript;Python;SQL
64446    Bash/Shell/PowerShell;C;C#;C++;HTML/CSS;Java;J...
64457    Assembly;Bash/Shell/PowerShell;C;C#;C++;Dart;G...
Name: LanguageWorkedWith, Length: 25287, dtype: object