## 대소문자 처리하기

### 개념
str 접근자를 사용하여 lower(), upper(), capitalize() 등의 함수를 활용해 대소문자 처리 가능

### 방법
1. 모든 문자 소문자로 통일 `lower()`
2. 모든 문자 대문자로 통일 `upper()`
3. 앞글자만 대문자, 나머지는 소문자 `capitalize()`

### 주의사항
1. DataFrame의 컬럼에 변환된 데이터를 저장(할당)해야 실제로 값들이 변경된다.
2. pandas의 Series 형식은 위 세가지 문자처리함수를 메소드로 갖고 있지 않으므로, .str 접근자를 통해 시리즈 안의 각 값에 문자열 메소드를 적용해야 한다. (.str은 값을 문자열로 변환하지는 않으며, 변환이 필요하면 `astype(str)`을 사용한다.)
    - str은 문자열 뿐만  아니라 리스트나 튜플처럼 순서가 있는 데이터 묶음에도 적용이 가능한 접근자!
    - 따라서 .str로 접근하고 인덱싱하면 분리된 데이터의 각 부분에 접근 가능해짐

In [None]:
import pandas as pd

# Load the Airbnb listings data
airbnb_df = pd.read_csv('./data/airbnb.csv')

# Preview the first three rows
airbnb_df.head(3)

Unnamed: 0,id,state,guests,beds,property_type,rating,n_reviews,location,price
0,BF1406,Illinois,2,1,Apartment,96.0,55,"Avondale, Chicago.",48
1,DB7921,MASSACHUSETTS,4,1,Apartment,,0,"South End, Boston.",115
2,OZ5857,ILLINOIS,4,3,Apartment,100.0,51,"Avondale, Chicago.",150


In [None]:
# Summarise each column's non-null count and dtype
airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 713 entries, 0 to 712
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             713 non-null    object 
 1   state          713 non-null    object 
 2   guests         713 non-null    int64  
 3   beds           713 non-null    int64  
 4   property_type  713 non-null    object 
 5   rating         700 non-null    float64
 6   n_reviews      713 non-null    int64  
 7   location       713 non-null    object 
 8   price          713 non-null    int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 50.3+ KB


In [None]:
airbnb_df['state'].unique() # check whether the state strings use inconsistent letter case

array(['Illinois', 'MASSACHUSETTS', 'ILLINOIS', 'california',
       'massachusetts', 'California', 'Massachusetts', 'CALIFORNIA',
       'illinois'], dtype=object)

In [None]:
# Upper-case every state name; the result is not assigned, so airbnb_df itself is unchanged
airbnb_df['state'].str.upper()

0           ILLINOIS
1      MASSACHUSETTS
2           ILLINOIS
3         CALIFORNIA
4      MASSACHUSETTS
           ...      
708       CALIFORNIA
709       CALIFORNIA
710       CALIFORNIA
711         ILLINOIS
712    MASSACHUSETTS
Name: state, Length: 713, dtype: object

In [None]:
# Lower-case every state name; the result is not assigned, so airbnb_df itself is unchanged
airbnb_df['state'].str.lower()

0           illinois
1      massachusetts
2           illinois
3         california
4      massachusetts
           ...      
708       california
709       california
710       california
711         illinois
712    massachusetts
Name: state, Length: 713, dtype: object

In [None]:
# Upper-case the first character and lower-case the rest; the result is not assigned, so airbnb_df itself is unchanged
airbnb_df['state'].str.capitalize()

0           Illinois
1      Massachusetts
2           Illinois
3         California
4      Massachusetts
           ...      
708       California
709       California
710       California
711         Illinois
712    Massachusetts
Name: state, Length: 713, dtype: object

---

## 문자열 분리하기

### 개념
`split()` 함수는 문자열을 특정 문자 기준으로 나누어 리스트로 반환 / 이때 str접근자를 사용해 Series에 문자열 처리 적용

### 방법
`split()` 사용해서 괄호안에 분리 기준이 되는 string 입력

In [None]:
import pandas as pd

# Load the Airbnb listings data, using the first CSV column as the row index
airbnb_df = pd.read_csv('./data/airbnb.csv', index_col=0)

# Preview the first three rows
airbnb_df.head(3)

Unnamed: 0_level_0,state,guests,beds,property_type,rating,n_reviews,location,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BF1406,Illinois,2,1,Apartment,96.0,55,"Avondale, Chicago.",48
DB7921,MASSACHUSETTS,4,1,Apartment,,0,"South End, Boston.",115
OZ5857,ILLINOIS,4,3,Apartment,100.0,51,"Avondale, Chicago.",150


In [None]:
# Split each location into a [neighborhood, city] list at the comma.
# The second element still carries its leading space and trailing period.
airbnb_df['location'].str.split(',')

id
BF1406             [Avondale,  Chicago.]
DB7921             [South End,  Boston.]
OZ5857             [Avondale,  Chicago.]
HY8797           [SoMa,  San Francisco.]
VN7858          [West Roxbury,  Boston.]
                       ...              
XG3289           [Venice,  Los Angeles.]
ZR5298     [Mid-Wilshire,  Los Angeles.]
XZ5531    [Redondo Beach,  Los Angeles.]
EE8333             [Lakeview,  Chicago.]
AA6024           [East Boston,  Boston.]
Name: location, Length: 713, dtype: object

In [None]:
# Store the piece before the comma in a new 'neighborhood' column
airbnb_df['neighborhood'] = airbnb_df['location'].str.split(',').str.get(0)
airbnb_df[['neighborhood']]

Unnamed: 0_level_0,neighborhood
id,Unnamed: 1_level_1
BF1406,Avondale
DB7921,South End
OZ5857,Avondale
HY8797,SoMa
VN7858,West Roxbury
...,...
XG3289,Venice
ZR5298,Mid-Wilshire
XZ5531,Redondo Beach
EE8333,Lakeview


In [None]:
# Take the piece after the comma and strip the surrounding space and period.
# The raw piece looks like ' Chicago.', so slicing off only the last character
# would leave a leading space in every city name.
airbnb_df['city'] = airbnb_df['location'].str.split(',').str[1].str.strip(' .')
airbnb_df[['city']]

Unnamed: 0_level_0,city
id,Unnamed: 1_level_1
BF1406,Chicago
DB7921,Boston
OZ5857,Chicago
HY8797,San Francisco
VN7858,Boston
...,...
XG3289,Los Angeles
ZR5298,Los Angeles
XZ5531,Los Angeles
EE8333,Chicago


In [None]:
# Preview the first three rows, now including the neighborhood and city columns
airbnb_df.head(3)

Unnamed: 0_level_0,state,guests,beds,property_type,rating,n_reviews,location,price,neighborhood,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BF1406,Illinois,2,1,Apartment,96.0,55,"Avondale, Chicago.",48,Avondale,Chicago
DB7921,MASSACHUSETTS,4,1,Apartment,,0,"South End, Boston.",115,South End,Boston
OZ5857,ILLINOIS,4,3,Apartment,100.0,51,"Avondale, Chicago.",150,Avondale,Chicago


In [None]:
# drop() removes rows with axis=0 and columns with axis=1; columns= is the keyword form of axis=1
airbnb_df = airbnb_df.drop(columns='location')
airbnb_df.head(3)

Unnamed: 0_level_0,state,guests,beds,property_type,rating,n_reviews,price,neighborhood,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BF1406,Illinois,2,1,Apartment,96.0,55,48,Avondale,Chicago
DB7921,MASSACHUSETTS,4,1,Apartment,,0,115,South End,Boston
OZ5857,ILLINOIS,4,3,Apartment,100.0,51,150,Avondale,Chicago


---

## 불필요한 문자 제거하기

### 개념
문자열에서 공백이나 문장부호 등 불필요한 문자를 제거

In [None]:
import pandas as pd

# Load the Airbnb listings data
airbnb_df = pd.read_csv('./data/airbnb.csv')
# Split location at the comma: piece 0 is the neighborhood, piece 1 the city.
# The city piece is stored as-is, still with its leading space and trailing period.
airbnb_df['neighborhood'] = airbnb_df['location'].str.split(',').str.get(0)
airbnb_df['city'] = airbnb_df['location'].str.split(',').str.get(1)
# The original location column is no longer needed
airbnb_df = airbnb_df.drop(columns='location')
airbnb_df.head(3)

Unnamed: 0,id,state,guests,beds,property_type,rating,n_reviews,price,neighborhood,city
0,BF1406,Illinois,2,1,Apartment,96.0,55,48,Avondale,Chicago.
1,DB7921,MASSACHUSETTS,4,1,Apartment,,0,115,South End,Boston.
2,OZ5857,ILLINOIS,4,3,Apartment,100.0,51,150,Avondale,Chicago.


### 방법
#### 1. 앞뒤 공백(또는 지정한 문자) 제거 : `strip()`

In [None]:
# List the distinct city values to see which stray characters they contain
airbnb_df['city'].unique()

array([' Chicago.', ' Boston.', ' San Francisco.', ' Los Angeles.'],
      dtype=object)

In [None]:
# Strip both the leading space and the trailing period from each city.
# strip('.') alone removes only the period and leaves values like ' Chicago'.
airbnb_df['city'] = airbnb_df['city'].str.strip(' .')
airbnb_df['city']

0             Chicago
1              Boston
2             Chicago
3       San Francisco
4              Boston
            ...      
708       Los Angeles
709       Los Angeles
710       Los Angeles
711           Chicago
712            Boston
Name: city, Length: 713, dtype: object

#### 2. 문자 바꾸기 : `replace()`
- regex=False를 설정하면 단순한 문자 치환 가능

In [None]:
# Remove every literal '.'; regex=False treats the pattern as a plain string, not a regular expression
airbnb_df['city'] = airbnb_df['city'].str.replace(pat='.', repl='', regex=False)
airbnb_df['city']

0             Chicago
1              Boston
2             Chicago
3       San Francisco
4              Boston
            ...      
708       Los Angeles
709       Los Angeles
710       Los Angeles
711           Chicago
712            Boston
Name: city, Length: 713, dtype: object

* 정규표현식 : `regex`(regular expression)
    - True : 정규표현식으로 특정 패턴 처리 가능 (pandas 1.x에서는 기본값이었으나, 2.0부터는 명시적으로 지정해야 함)
    - False : 정규표현식 사용 X, 단순 문자 치환만 (pandas 2.0 이상의 기본값)

---

## 새로운 값 계산하기

### 1. 허리-엉덩이 비율 계산하기
- dataframe의 값을 사용하여 사칙 연산을 통해 새로운 컬럼 생성

In [None]:
import pandas as pd

# Load the patient measurements data
patient_df = pd.read_csv("./data/patient.csv")

In [None]:
# Preview the first three rows
patient_df.head(3)

Unnamed: 0,age,sex,glucose,cholesterol,height,weight,hip,waist,diabetes
0,50,Male,385,140,1.75,78.0,41,37,Y
1,37,Female,67,214,1.63,65.8,42,34,N
2,43,Female,100,160,1.63,63.5,40,37,N


In [None]:
# Summarise each column's non-null count and dtype
patient_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          348 non-null    int64  
 1   sex          348 non-null    object 
 2   glucose      348 non-null    int64  
 3   cholesterol  348 non-null    int64  
 4   height       348 non-null    float64
 5   weight       348 non-null    float64
 6   hip          348 non-null    int64  
 7   waist        348 non-null    int64  
 8   diabetes     348 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 24.6+ KB


In [None]:
# Keep only the columns whose dtype is int64 or float64
patient_df.select_dtypes(include=['int64','float64'])

Unnamed: 0,age,glucose,cholesterol,height,weight,hip,waist
0,50,385,140,1.75,78.0,41,37
1,37,67,214,1.63,65.8,42,34
2,43,100,160,1.63,63.5,40,37
3,32,90,176,1.60,114.3,58,45
4,20,71,164,1.83,65.8,36,29
...,...,...,...,...,...,...,...
343,40,87,218,1.85,90.7,41,38
344,66,174,188,1.73,95.3,48,45
345,31,69,183,1.68,86.2,47,41
346,44,84,202,1.73,71.2,37,33


In [None]:
# Series arithmetic is element-wise, so one column can be divided by the other directly.
# The ratio is rounded to two decimal places before being stored.
patient_df['waist_hip_ratio'] = (patient_df['waist'] / patient_df['hip']).round(2)
patient_df[['waist_hip_ratio']]

Unnamed: 0,waist_hip_ratio
0,0.90
1,0.81
2,0.92
3,0.78
4,0.81
...,...
343,0.93
344,0.94
345,0.87
346,0.89


In [None]:
# Obesity rate by sex: percentage of each sex whose waist_hip_ratio is at least 0.9.
# NOTE(review): the same 0.9 cut-off is applied to both sexes - confirm that is intended.
gender_whr_df = patient_df[['sex','waist_hip_ratio']]
male_df = gender_whr_df[gender_whr_df['sex'] == 'Male']
female_df = gender_whr_df[gender_whr_df['sex'] == 'Female']

male_obesity_rate = len(male_df[male_df['waist_hip_ratio'] >= 0.9]) / len(male_df) * 100
female_obesity_rate = len(female_df[female_df['waist_hip_ratio'] >= 0.9]) / len(female_df) * 100

# A notebook cell echoes only its last expression, so the male rate must be
# printed explicitly; as a bare expression it was computed and then discarded.
print("Male:", male_obesity_rate)
# Female rate, echoed as the cell result
female_obesity_rate

30.88235294117647