## DataFrame and Series Basics - Selecting Rows and Columns

In [1]:
import pandas as pd
import numpy as np

以下為 Python dictionary 各種寫法

In [2]:
# A single record: every key maps to one scalar value.
person = {
    "first": "Corey",
    "last": "Schafer",
    "email": "CoreySchafer@gmail.com",
}

In [3]:
# Same record, but every key now maps to a list (here of length one),
# so more records can be added by extending each list.
people = {
    "first": ["Corey"],
    "last": ["Schafer"],
    "email": ["CoreySchafer@gmail.com"],
}

In [35]:
# Three records stored column-wise: position i of every list belongs to person i.
people = {
    "first": ["Corey", "Jane", "John"],
    "last": ["Schafer", "Doe", "Dog"],
    "email": [
        "CoreySchafer@gmail.com",
        "JaneDoe@gmail.com",
        "JohnDog@gmail.com",
    ],
}

In [36]:
# Plain dict lookup: returns the list stored under the 'email' key.
people['email']

['CoreySchafer@gmail.com', 'JaneDoe@gmail.com', 'JohnDog@gmail.com']

In [37]:
df = pd.DataFrame(people) # pandas can build a DataFrame directly from a dict; the keys become the column names

In [38]:
# Display the whole DataFrame (one row per person, one column per dict key).
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Dog,JohnDog@gmail.com


In [39]:
# Select a single column with bracket notation, just like a dict lookup.
df["email"]

0    CoreySchafer@gmail.com
1         JaneDoe@gmail.com
2         JohnDog@gmail.com
Name: email, dtype: object

In [40]:
# Check what kind of object a single selected column is.
type(df["email"])

pandas.core.series.Series

In [41]:
df.email # attribute access gives the same result as df["email"]

0    CoreySchafer@gmail.com
1         JaneDoe@gmail.com
2         JohnDog@gmail.com
Name: email, dtype: object

In [42]:
# Attribute access returns the same type of object as bracket access.
type(df.email)

pandas.core.series.Series

In [43]:
# Pass a list of column names (note the inner brackets) to select several columns at once.
df[['last','email']]

Unnamed: 0,last,email
0,Schafer,CoreySchafer@gmail.com
1,Doe,JaneDoe@gmail.com
2,Dog,JohnDog@gmail.com


In [44]:
# List the column labels of the DataFrame.
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [45]:
df.iloc[0] # first row (integer position 0): one value per column

first                     Corey
last                    Schafer
email    CoreySchafer@gmail.com
Name: 0, dtype: object

In [47]:
df.iloc[[0, 2], [2, 1]] # pick rows and columns by integer position; the list order also sets the output order (here email before last)

Unnamed: 0,email,last
0,CoreySchafer@gmail.com,Schafer
2,JohnDog@gmail.com,Dog


In [50]:
df.loc[[0, 1], ['email', 'last']] # iloc cannot take labels, only integer positions; loc addresses rows and columns by their labels instead

Unnamed: 0,email,last
0,CoreySchafer@gmail.com,Schafer
1,JaneDoe@gmail.com,Doe


In [52]:
# Load the 2020 developer survey from paths relative to the notebook's working directory:
# public_df holds the survey results, schema_df the accompanying schema file.
public_df = pd.read_csv('../pandas_dataset/developer_survey_2020/survey_results_public.csv')
schema_df = pd.read_csv('../pandas_dataset/developer_survey_2020/survey_results_schema.csv')

In [53]:
# Raise pandas' display limits so frames render up to 85 columns and 85 rows.
pd.options.display.max_columns = 85  # maximum number of columns shown
pd.options.display.max_rows = 85  # maximum number of rows shown

In [54]:
# List the column labels of the survey results.
public_df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [55]:
# Select the 'Hobbyist' column of the survey results.
public_df['Hobbyist']

0        Yes
1         No
2        Yes
3        Yes
4        Yes
        ... 
64456    Yes
64457    Yes
64458    Yes
64459    Yes
64460    Yes
Name: Hobbyist, Length: 64461, dtype: object

In [56]:
public_df['Hobbyist'].value_counts() # count how often each distinct value occurs (null entries are not counted)

Yes    50388
No     14028
Name: Hobbyist, dtype: int64

In [57]:
# Same frequency count for the 'NEWOffTopic' column.
public_df['NEWOffTopic'].value_counts()

Not sure    20213
No          18528
Yes         12063
Name: NEWOffTopic, dtype: int64

In [58]:
public_df.loc[[0, 1, 2], 'Hobbyist'] # 'Hobbyist' values for the rows labelled 0, 1 and 2 (the first three rows here)

0    Yes
1     No
2    Yes
Name: Hobbyist, dtype: object

In [59]:
public_df.loc[0:2, 'Hobbyist':'Age1stCode'] # unlike a Python list, where list[0:2] yields only items 0 and 1, a .loc slice includes its end label; the same holds for the column slice

Unnamed: 0,Hobbyist,Age,Age1stCode
0,Yes,,13
1,No,,19
2,Yes,,15
