In [2]:
# import 모음
import pandas as pd
import shutil
import numpy as np
import matplotlib.pyplot as plt # 파이썬에서 데이터 시각화를 위해 사용하는 기본 라이브러리
import seaborn as sns # 그래프의 디자인을 다듬어주는 라이브러리


# 1. 판다스 Series와 DataFrame

In [1]:
import pandas as pd

In [8]:
# 1. 판다스 Series

# 1차원 데이터 구조로 인덱스를 포함하는 배열 형태
series_example = pd.Series([10, 20, 30, 40], index=['A', 'B', 'C', 'D'])
print(series_example)
print('')
print(series_example['A'])
print(series_example['C'])

A    10
B    20
C    30
D    40
dtype: int64

10
30


In [8]:
# 2. 판다스 DataFrame

# DataFrame은 행과 열로 구성된 테이블 형태의 2차원 데이터 구조
# 여러 개의 Series가 모여서 DataFrame을 구성한다.

dataframe_example = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['뉴욕', '로스엔젤레스', '시카고', '휴스톤']
})

display(dataframe_example)
print(dataframe_example)
print('')
print(dataframe_example['Name'])
print('')
print(dataframe_example['City'])

Unnamed: 0,Name,Age,City
0,Alice,25,뉴욕
1,Bob,30,로스엔젤레스
2,Charlie,35,시카고
3,David,40,휴스톤


      Name  Age    City
0    Alice   25      뉴욕
1      Bob   30  로스엔젤레스
2  Charlie   35     시카고
3    David   40     휴스톤

0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object

0        뉴욕
1    로스엔젤레스
2       시카고
3       휴스톤
Name: City, dtype: object


### 2. california_housing_train 사용하기

In [5]:
# csv 파일을 DataFrame 형태로 읽기
train = pd.read_csv('sample_data/california_housing_train.csv')
train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15,5612,1283,1015,472,1.4936,66900
1,-114.47,34.40,19,7650,1901,1129,463,1.8200,80100
2,-114.56,33.69,17,720,174,333,117,1.6509,85700
3,-114.57,33.64,14,1501,337,515,226,3.1917,73400
4,-114.57,33.57,20,1454,326,624,262,1.9250,65500
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52,2217,394,907,369,2.3571,111400
16996,-124.27,40.69,36,2349,528,1194,465,2.5179,79000
16997,-124.30,41.84,17,2677,531,1244,456,3.0313,103600
16998,-124.30,41.80,19,2672,552,1298,478,1.9797,85800


In [6]:
# 특정 열 가져오기
train['housing_median_age']

0        15
1        19
2        17
3        14
4        20
         ..
16995    52
16996    36
16997    17
16998    19
16999    52
Name: housing_median_age, Length: 17000, dtype: int64

In [11]:
train['population']

0        1015
1        1129
2         333
3         515
4         624
         ... 
16995     907
16996    1194
16997    1244
16998    1298
16999     806
Name: population, Length: 17000, dtype: int64

In [12]:
# 조건에 따른 필터링
# housing_median_age가 10미만 데이터만 가져오기
train_min_10 = train[train['housing_median_age'] < 10]
train_min_10

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
58,-115.52,32.67,6,2804,581,2807,594,2.0625,67700
75,-115.55,32.78,5,2652,606,1767,536,2.8025,84300
95,-115.58,32.81,5,805,143,458,143,4.4750,96300
98,-115.58,32.78,5,2494,414,1416,421,5.7843,110100
100,-115.59,32.79,8,2183,307,1000,287,6.3814,159900
...,...,...,...,...,...,...,...,...,...
16679,-122.79,38.48,7,6837,1417,3468,1405,3.1662,191000
16680,-122.79,38.42,9,4967,885,2581,915,5.0380,185600
16692,-122.82,38.55,8,6190,1088,2967,1000,3.8616,195100
16776,-123.00,38.33,8,3223,637,851,418,5.6445,364800


### 3. 데이터 프레임을 활용해서 housing_median_age가 10 미만인 데이터 삭제

In [7]:
train = pd.read_csv('sample_data/california_housing_train.csv')

new_data = [] # housing_median_age가 10미만인 데이터를 삭제한 결과 저장

# iterrows(): 데이터프레임의 각 행을 반복해서 행별 데이터를 가져옴
# index: 행의 인덱스
# row: 행의 데이터를 판다스 Series 형태로 반환

for index, row in train.iterrows():
    # print(index, row)
    if row.housing_median_age >= 10: # housing_median_age가 10이상인 데이터만
        new_data.append(row)

new_data

[longitude              -114.3100
 latitude                 34.1900
 housing_median_age       15.0000
 total_rooms            5612.0000
 total_bedrooms         1283.0000
 population             1015.0000
 households              472.0000
 median_income             1.4936
 median_house_value    66900.0000
 Name: 0, dtype: float64,
 longitude              -114.47
 latitude                 34.40
 housing_median_age       19.00
 total_rooms            7650.00
 total_bedrooms         1901.00
 population             1129.00
 households              463.00
 median_income             1.82
 median_house_value    80100.00
 Name: 1, dtype: float64,
 longitude              -114.5600
 latitude                 33.6900
 housing_median_age       17.0000
 total_rooms             720.0000
 total_bedrooms          174.0000
 population              333.0000
 households              117.0000
 median_income             1.6509
 median_house_value    85700.0000
 Name: 2, dtype: float64,
 longitude            

In [8]:
new_train = pd.DataFrame(new_data)
new_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [16]:
train = pd.read_csv('sample_data/california_housing_train.csv')
print(train.loc[58]) # 58번째 행 데이터 조회

longitude              -115.5200
latitude                 32.6700
housing_median_age        6.0000
total_rooms            2804.0000
total_bedrooms          581.0000
population             2807.0000
households              594.0000
median_income             2.0625
median_house_value    67700.0000
Name: 58, dtype: float64


In [18]:
# train.loc[행, 열]: 행과 열을 동시에 선택
# train.loc[58, 'housing_median_age']

# housing_median_age가 10미만인 행을 가져오는데 housing_median_age 컬럼값만 가져온다
train.loc[train['housing_median_age'] < 10, 'housing_median_age']

58       6
75       5
95       5
98       5
100      8
        ..
16679    7
16680    9
16692    8
16776    8
16854    9
Name: housing_median_age, Length: 1087, dtype: int64

In [19]:
train.loc[train['housing_median_age'] < 10, 'housing_median_age'] * 10

58       60
75       50
95       50
98       50
100      80
         ..
16679    70
16680    90
16692    80
16776    80
16854    90
Name: housing_median_age, Length: 1087, dtype: int64

In [20]:
# 10미만인 housing_median_age 값들에 10 곱하기
train.loc[train['housing_median_age'] < 10, 'housing_median_age'] = train.loc[train['housing_median_age'] < 10, 'housing_median_age'] * 10
print(train.loc[58]) # 58번행 값을 다시 조회

longitude              -115.5200
latitude                 32.6700
housing_median_age       60.0000
total_rooms            2804.0000
total_bedrooms          581.0000
population             2807.0000
households              594.0000
median_income             2.0625
median_house_value    67700.0000
Name: 58, dtype: float64


### 5. 데이터 생성

In [22]:
train = pd.read_csv('sample_data/california_housing_train.csv')
print(train.loc[58])

longitude              -115.5200
latitude                 32.6700
housing_median_age        6.0000
total_rooms            2804.0000
total_bedrooms          581.0000
population             2807.0000
households              594.0000
median_income             2.0625
median_house_value    67700.0000
Name: 58, dtype: float64


In [25]:
train.loc[train['housing_median_age'] < 10, 'housing_median_age']

58       6
75       5
95       5
98       5
100      8
        ..
16679    7
16680    9
16692    8
16776    8
16854    9
Name: housing_median_age, Length: 1087, dtype: int64

In [26]:
import numpy as np

# np.log() 자연로그 계산
np.log(60)

4.0943445622221

In [27]:
# housing_median_age가 10미만인 housing_median_age값에 10을 곱한 후 자연로그 값을 구함

np.log(train.loc[train['housing_median_age'] < 10, 'housing_median_age'] * 10)

58       4.094345
75       3.912023
95       3.912023
98       3.912023
100      4.382027
           ...   
16679    4.248495
16680    4.499810
16692    4.382027
16776    4.382027
16854    4.499810
Name: housing_median_age, Length: 1087, dtype: float64

In [28]:
train.loc[train['housing_median_age'] < 10, 'housing_median_age_log'] = np.log(train.loc[train['housing_median_age'] < 10, 'housing_median_age'] * 10)

In [29]:
train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,housing_median_age_log
0,-114.31,34.19,15,5612,1283,1015,472,1.4936,66900,
1,-114.47,34.40,19,7650,1901,1129,463,1.8200,80100,
2,-114.56,33.69,17,720,174,333,117,1.6509,85700,
3,-114.57,33.64,14,1501,337,515,226,3.1917,73400,
4,-114.57,33.57,20,1454,326,624,262,1.9250,65500,
...,...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52,2217,394,907,369,2.3571,111400,
16996,-124.27,40.69,36,2349,528,1194,465,2.5179,79000,
16997,-124.30,41.84,17,2677,531,1244,456,3.0313,103600,
16998,-124.30,41.80,19,2672,552,1298,478,1.9797,85800,


### 6. Pyplot을 사용한 그래프 생성

In [4]:
import matplotlib.pyplot as plt # 파이썬에서 데이터 시각화를 위해 사용하는 기본 라이브러리
import seaborn as sns # 그래프의 디자인을 다듬어주는 라이브러리

In [11]:
# train[col3].plot(kind='hist')