# 프로젝트 개요

데이터 출처
* 서울 열린데이터 광장 (data.seoul.go.kr)

수집 데이터 내용
* 서울시 주민등록 인구 (동별) 통계
* 서울시 자치구 년도별 CCTV 설치 현황 

문제 인식
* 서울시의 cctv개수와 서울시 범죄율 간의 상관관계
* cctv 배치 기준 모호

분석 목적
* 서울시 cctv 설치 현황과 운영 파악 및 자치구 수요 수치화를 통한 cctv 우선 배치 및 선정

# 데이터 파일 불러오기

## csv

In [1]:
import pandas as pd

# The CSV contains Korean text, so decode it explicitly as UTF-8.
# thousands=',' makes pandas treat ',' as a digit-group separator in numbers.
cctv_seoul = pd.read_csv(
    './data/cctv_in_seoul.csv',
    encoding='utf-8',
    thousands=',',
)
# Discard, in place, every row that has a missing value.
cctv_seoul.dropna(inplace=True)

In [2]:
cctv_seoul.head()

Unnamed: 0,기관명,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년
0,강 남 구,5221,2455,430,546,765,577,448
1,강 동 구,1879,824,59,144,194,273,385
2,강 북 구,1265,472,74,145,254,1,319
3,강 서 구,1617,492,230,187,190,264,254
4,관 악 구,3985,905,487,609,619,694,671


In [3]:
cctv_seoul.dtypes

기관명         object
소계           int64
2013년 이전     int64
2014년        int64
2015년        int64
2016년        int64
2017년        int64
2018년        int64
dtype: object

### 컬럼

In [4]:
cctv_seoul.columns

Index(['기관명', '소계', '2013년 이전', '2014년', '2015년', '2016년', '2017년', '2018년'], dtype='object')

In [5]:
cctv_seoul.columns[0]

'기관명'

In [6]:
cctv_seoul.columns[1:3]

Index(['소계', '2013년 이전'], dtype='object')

In [7]:
cctv_seoul.columns[:]

Index(['기관명', '소계', '2013년 이전', '2014년', '2015년', '2016년', '2017년', '2018년'], dtype='object')

In [8]:
cctv_seoul.columns[-1]

'2018년'

### 이름 바꾸기

* 칼럼 이름 바꾸기

: df.columns = ['a', 'b']

: df.rename(columns = {'old_nm' : 'new_nm'}, inplace = True)


* 인덱스 이름 바꾸기

: df.index = ['a', 'b']

: df.rename(index = {'old_nm': 'new_nm'}, inplace = True)

inplace=True는 데이터 변수의 내용을 갱신하라는 의미

In [9]:
cctv_seoul.rename(columns={'기관명':'지역구'}, inplace=True)
cctv_seoul

Unnamed: 0,지역구,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년
0,강 남 구,5221,2455,430,546,765,577,448
1,강 동 구,1879,824,59,144,194,273,385
2,강 북 구,1265,472,74,145,254,1,319
3,강 서 구,1617,492,230,187,190,264,254
4,관 악 구,3985,905,487,609,619,694,671
5,광 진 구,1581,595,87,64,21,468,346
6,구 로 구,3227,1420,187,268,326,540,486
7,금 천 구,1634,286,101,382,136,199,530
8,노 원 구,1906,801,80,461,298,110,156
9,도 봉 구,858,271,185,59,155,117,71


In [10]:
import pandas as pd

# Re-read the CSV from disk, replacing the DataFrame whose column was renamed
# above (presumably to restore the original column names).
# thousands=',' is passed here as well, matching the other loads of this file,
# so that comma-grouped numbers are parsed as integers rather than strings.
cctv_seoul = pd.read_csv('./data/cctv_in_seoul.csv', encoding='utf-8', thousands=',')
cctv_seoul.dropna(inplace=True)

## excel

In [11]:
# NOTE: read_excel has no `encoding` parameter, and newer pandas releases raise
# TypeError when it is passed, so it is omitted here.
pop_seoul = pd.read_excel('./data/population_in_seoul.xlsx', thousands=',')
pop_seoul.head()

Unnamed: 0,기간,자치구,세대,인구,인구.1,인구.2,인구.3,인구.4,인구.5,인구.6,인구.7,인구.8,세대당인구,65세이상고령자
0,기간,자치구,세대,합계,합계,합계,한국인,한국인,한국인,등록외국인,등록외국인,등록외국인,세대당인구,65세이상고령자
1,기간,자치구,세대,계,남자,여자,계,남자,여자,계,남자,여자,세대당인구,65세이상고령자
2,2019.1/4,합계,4290922,10054979,4909387,5145592,9770216,4772134,4998082,284763,137253,147510,2.28,1436125
3,2019.1/4,종로구,73914,162913,78963,83950,152778,74536,78242,10135,4427,5708,2.07,26981
4,2019.1/4,중구,61800,135836,66720,69116,125942,61992,63950,9894,4728,5166,2.04,22421


### 불필요한 헤더 제거

##### 불필요한 header를 제거

In [12]:
# header=2 takes the third spreadsheet row as the column names, skipping the
# two redundant header rows above it.
# `encoding` is not a read_excel parameter (newer pandas raises TypeError),
# so it is omitted.
pop_seoul = pd.read_excel('./data/population_in_seoul.xlsx',
                          header=2,
                          thousands=',')
pop_seoul.head()

Unnamed: 0,기간,자치구,세대,계,남자,여자,계.1,남자.1,여자.1,계.2,남자.2,여자.2,세대당인구,65세이상고령자
0,2019.1/4,합계,4290922,10054979,4909387,5145592,9770216,4772134,4998082,284763,137253,147510,2.28,1436125
1,2019.1/4,종로구,73914,162913,78963,83950,152778,74536,78242,10135,4427,5708,2.07,26981
2,2019.1/4,중구,61800,135836,66720,69116,125942,61992,63950,9894,4728,5166,2.04,22421
3,2019.1/4,용산구,109413,245139,119597,125542,229168,110626,118542,15971,8971,7000,2.09,38049
4,2019.1/4,성동구,137247,314608,154011,160597,306404,150287,156117,8204,3724,4480,2.23,43076


### 필요한 컬럼만 선택

##### 필요한 컬럼만 선택

In [13]:
# header=2 takes the third spreadsheet row as the column names.
# usecols selects the Excel columns to keep by letter; it replaces the removed
# `parse_cols` keyword. `encoding` is not a read_excel parameter and is omitted.
pop_seoul = pd.read_excel('./data/population_in_seoul.xlsx',
                          header=2,
                          usecols='B, D, G, J, N',
                          thousands=',')
pop_seoul.head()

Unnamed: 0,자치구,계,계.1,계.2,65세이상고령자
0,합계,10054979,9770216,284763,1436125
1,종로구,162913,152778,10135,26981
2,중구,135836,125942,9894,22421
3,용산구,245139,229168,15971,38049
4,성동구,314608,306404,8204,43076


### 컬럼 이름 변경

##### rename을 이용, 컬럼 이름 변경

In [14]:
# Rename the first five columns by position, in place.
pop_seoul.rename(
    columns={
        pop_seoul.columns[position]: label
        for position, label in enumerate(('지역별', '인구수', '한국인', '외국인', '고령자'))
    },
    inplace=True,
)
pop_seoul.head()

Unnamed: 0,지역별,인구수,한국인,외국인,고령자
0,합계,10054979,9770216,284763,1436125
1,종로구,162913,152778,10135,26981
2,중구,135836,125942,9894,22421
3,용산구,245139,229168,15971,38049
4,성동구,314608,306404,8204,43076


# pandas 기초

In [15]:
import pandas as pd
import numpy as np

## Series : pandas의 데이터 유형 중 기초

In [16]:
s = pd.Series([1, 3, 5, np.nan, 6, 8]) # NaN(Not A Number)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [17]:
s1 = pd.Series([10, 20, 30, 40, 50, 100])
s1

0     10
1     20
2     30
3     40
4     50
5    100
dtype: int64

## 날짜형 데이터

In [18]:
dates = pd.date_range('20200301', periods=6)
dates

DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04',
               '2020-03-05', '2020-03-06'],
              dtype='datetime64[ns]', freq='D')

In [19]:
dates_xmas = pd.date_range('20201224', periods=8)
dates_xmas

DatetimeIndex(['2020-12-24', '2020-12-25', '2020-12-26', '2020-12-27',
               '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', freq='D')

## DataFrame 만들어보기

In [20]:
# np.random.rand    -> uniform samples between 0 and 1
# np.random.randn   -> samples from the standard normal (Gaussian) distribution
# np.random.randint -> uniformly distributed random integers
df = pd.DataFrame(
    np.random.randn(8, 4),
    columns=['A', 'B', 'C', 'D'],
    index=dates_xmas,
)
df.head(3)

Unnamed: 0,A,B,C,D
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536


## 인덱스 확인

In [21]:
df.index

DatetimeIndex(['2020-12-24', '2020-12-25', '2020-12-26', '2020-12-27',
               '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', freq='D')

## 컬럼 확인

In [22]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

## 내용 확인

In [23]:
df.values

array([[-0.740713  , -0.69508564,  0.25577492, -1.29982729],
       [-0.23885178, -0.25893705, -0.84612734,  0.36535602],
       [-0.40319428, -0.61424187, -0.41804723,  0.98053633],
       [ 0.89484423, -2.78881061,  0.10295139,  0.67539441],
       [ 1.52549682, -1.05144196,  0.01683752,  1.73987082],
       [-0.0175691 ,  0.62286138, -0.8270699 , -0.0036266 ],
       [-0.46684094, -0.42770617,  1.80507327, -1.45540419],
       [ 1.20565203, -0.74311471,  0.04489536, -0.39025868]])

## 개요 확인

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8 entries, 2020-12-24 to 2020-12-31
Freq: D
Data columns (total 4 columns):
A    8 non-null float64
B    8 non-null float64
C    8 non-null float64
D    8 non-null float64
dtypes: float64(4)
memory usage: 320.0 bytes


## 통계적 개요 확인

In [25]:
df.describe()

Unnamed: 0,A,B,C,D
count,8.0,8.0,8.0,8.0
mean,0.219853,-0.74456,0.016786,0.076505
std,0.860342,0.963523,0.837436,1.101394
min,-0.740713,-2.788811,-0.846127,-1.455404
25%,-0.419106,-0.820197,-0.520303,-0.617651
50%,-0.12821,-0.654664,0.030866,0.180865
75%,0.972546,-0.385514,0.141157,0.75168
max,1.525497,0.622861,1.805073,1.739871


## 컬럼 기준 정렬

In [26]:
df_sort = df.sort_values(by='B', ascending=False)

In [27]:
df_sort

Unnamed: 0,A,B,C,D
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827
2020-12-31,1.205652,-0.743115,0.044895,-0.390259
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-27,0.894844,-2.788811,0.102951,0.675394


## 열 선택과 행의 범위 지정

In [28]:
df['A']

2020-12-24   -0.740713
2020-12-25   -0.238852
2020-12-26   -0.403194
2020-12-27    0.894844
2020-12-28    1.525497
2020-12-29   -0.017569
2020-12-30   -0.466841
2020-12-31    1.205652
Freq: D, Name: A, dtype: float64

In [29]:
df[2:6]

Unnamed: 0,A,B,C,D
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627


In [30]:
df['20201227':'20201231']

Unnamed: 0,A,B,C,D
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404
2020-12-31,1.205652,-0.743115,0.044895,-0.390259


In [31]:
df['20201227':]

Unnamed: 0,A,B,C,D
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404
2020-12-31,1.205652,-0.743115,0.044895,-0.390259


## 변수를 이용해서 특정 날짜의 데이터만 보기

In [32]:
df.loc[dates_xmas[0]]

A   -0.740713
B   -0.695086
C    0.255775
D   -1.299827
Name: 2020-12-24 00:00:00, dtype: float64

In [33]:
df.loc[dates_xmas[3:6]]

Unnamed: 0,A,B,C,D
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627


In [34]:
df.loc[dates_xmas[:]]

Unnamed: 0,A,B,C,D
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404
2020-12-31,1.205652,-0.743115,0.044895,-0.390259


In [35]:
df.loc['20201224':'20201227']

Unnamed: 0,A,B,C,D
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536
2020-12-27,0.894844,-2.788811,0.102951,0.675394


In [36]:
df.loc[dates_xmas[0],['A','B']]

A   -0.740713
B   -0.695086
Name: 2020-12-24 00:00:00, dtype: float64

In [37]:
df.loc[dates_xmas[3:6],['A','B']]

Unnamed: 0,A,B
2020-12-27,0.894844,-2.788811
2020-12-28,1.525497,-1.051442
2020-12-29,-0.017569,0.622861


In [38]:
df.loc[dates_xmas[:],['A','B']]

Unnamed: 0,A,B
2020-12-24,-0.740713,-0.695086
2020-12-25,-0.238852,-0.258937
2020-12-26,-0.403194,-0.614242
2020-12-27,0.894844,-2.788811
2020-12-28,1.525497,-1.051442
2020-12-29,-0.017569,0.622861
2020-12-30,-0.466841,-0.427706
2020-12-31,1.205652,-0.743115


In [39]:
df.loc['20201224':'20201227',['A','B']]

Unnamed: 0,A,B
2020-12-24,-0.740713,-0.695086
2020-12-25,-0.238852,-0.258937
2020-12-26,-0.403194,-0.614242
2020-12-27,0.894844,-2.788811


## 행과 열의 번호를 이용해서 데이터에 접근

In [40]:
df.iloc[3]

A    0.894844
B   -2.788811
C    0.102951
D    0.675394
Name: 2020-12-27 00:00:00, dtype: float64

In [41]:
df.iloc[1:4]

Unnamed: 0,A,B,C,D
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536
2020-12-27,0.894844,-2.788811,0.102951,0.675394


In [42]:
df.iloc[1:4,1:3]

Unnamed: 0,B,C
2020-12-25,-0.258937,-0.846127
2020-12-26,-0.614242,-0.418047
2020-12-27,-2.788811,0.102951


In [43]:
df.iloc[[1,3,7],[0,3]]

Unnamed: 0,A,D
2020-12-25,-0.238852,0.365356
2020-12-27,0.894844,0.675394
2020-12-31,1.205652,-0.390259


## 특정 조건 만족하는 데이터

In [44]:
df[df.A > 0.5]

Unnamed: 0,A,B,C,D
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-31,1.205652,-0.743115,0.044895,-0.390259


In [45]:
df[df['A'] > 0.5]

Unnamed: 0,A,B,C,D
2020-12-27,0.894844,-2.788811,0.102951,0.675394
2020-12-28,1.525497,-1.051442,0.016838,1.739871
2020-12-31,1.205652,-0.743115,0.044895,-0.390259


In [46]:
df[df > 0.5]

Unnamed: 0,A,B,C,D
2020-12-24,,,,
2020-12-25,,,,
2020-12-26,,,,0.980536
2020-12-27,0.894844,,,0.675394
2020-12-28,1.525497,,,1.739871
2020-12-29,,0.622861,,
2020-12-30,,,1.805073,
2020-12-31,1.205652,,,


## 데이터 복사

##### DataFrame을 복사할 때 '='기호를 이용해서 복사하면 실제 데이터의 내용이 복사되는 것이 아니라 데이터 위치만 복사되기 때문에 원본 데이터는 하나만 있게 됨

* 원본 df를 독립적으로 보존할 수 없음: df_copy를 수정하면 df의 내용도 같이 변함(데이터는 하나)

In [47]:
# df_copy = df

In [48]:
# df_copy['E'] = ['one','one','two','three','four','three','seven','eight']

In [49]:
# df_copy

##### 데이터의 내용까지 복사 : copy()

In [50]:
df_copy_1 = df.copy()

In [51]:
df_copy_1['E'] = ['one','one','two','three','four','three','seven','eight']

In [52]:
df_copy_1

Unnamed: 0,A,B,C,D,E
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827,one
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356,one
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536,two
2020-12-27,0.894844,-2.788811,0.102951,0.675394,three
2020-12-28,1.525497,-1.051442,0.016838,1.739871,four
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627,three
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404,seven
2020-12-31,1.205652,-0.743115,0.044895,-0.390259,eight


## isin()

In [53]:
df_copy_1['E'].isin(['one','four'])

2020-12-24     True
2020-12-25     True
2020-12-26    False
2020-12-27    False
2020-12-28     True
2020-12-29    False
2020-12-30    False
2020-12-31    False
Freq: D, Name: E, dtype: bool

In [54]:
df_copy_1[df_copy_1['E'].isin(['one','four'])]

Unnamed: 0,A,B,C,D,E
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827,one
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356,one
2020-12-28,1.525497,-1.051442,0.016838,1.739871,four


## 통계 느낌의 데이터 - apply

In [55]:
df.apply(np.cumsum) # 누적합

Unnamed: 0,A,B,C,D
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827
2020-12-25,-0.979565,-0.954023,-0.590352,-0.934471
2020-12-26,-1.382759,-1.568265,-1.0084,0.046065
2020-12-27,-0.487915,-4.357075,-0.905448,0.721459
2020-12-28,1.037582,-5.408517,-0.888611,2.46133
2020-12-29,1.020013,-4.785656,-1.715681,2.457704
2020-12-30,0.553172,-5.213362,0.089393,1.002299
2020-12-31,1.758824,-5.956477,0.134288,0.612041


In [56]:
df_copy_1

Unnamed: 0,A,B,C,D,E
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827,one
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356,one
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536,two
2020-12-27,0.894844,-2.788811,0.102951,0.675394,three
2020-12-28,1.525497,-1.051442,0.016838,1.739871,four
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627,three
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404,seven
2020-12-31,1.205652,-0.743115,0.044895,-0.390259,eight


In [57]:
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827
2020-12-25,-0.979565,-0.954023,-0.590352,-0.934471
2020-12-26,-1.382759,-1.568265,-1.0084,0.046065
2020-12-27,-0.487915,-4.357075,-0.905448,0.721459
2020-12-28,1.037582,-5.408517,-0.888611,2.46133
2020-12-29,1.020013,-4.785656,-1.715681,2.457704
2020-12-30,0.553172,-5.213362,0.089393,1.002299
2020-12-31,1.758824,-5.956477,0.134288,0.612041


In [58]:
df.apply(np.cumsum, axis=1)

Unnamed: 0,A,B,C,D
2020-12-24,-0.740713,-1.435799,-1.180024,-2.479851
2020-12-25,-0.238852,-0.497789,-1.343916,-0.97856
2020-12-26,-0.403194,-1.017436,-1.435483,-0.454947
2020-12-27,0.894844,-1.893966,-1.791015,-1.115621
2020-12-28,1.525497,0.474055,0.490892,2.230763
2020-12-29,-0.017569,0.605292,-0.221778,-0.225404
2020-12-30,-0.466841,-0.894547,0.910526,-0.544878
2020-12-31,1.205652,0.462537,0.507433,0.117174


In [59]:
df.apply(np.average, axis=0) # 열 기준

A    0.219853
B   -0.744560
C    0.016786
D    0.076505
dtype: float64

In [60]:
df.apply(np.average, axis=1) # 행 기준

2020-12-24   -0.619963
2020-12-25   -0.244640
2020-12-26   -0.113737
2020-12-27   -0.278905
2020-12-28    0.557691
2020-12-29   -0.056351
2020-12-30   -0.136220
2020-12-31    0.029294
Freq: D, dtype: float64

In [61]:
def get_divide(df):
    """Return the arithmetic mean of the 'A', 'B', 'C' and 'D' entries of ``df``.

    ``df`` is anything indexable by those four keys; the notebook passes one
    row at a time via ``DataFrame.apply(..., axis=1)``.
    """
    running_total = df['A'] + df['B']
    running_total = running_total + df['C']
    running_total = running_total + df['D']
    return running_total / 4

In [62]:
df.apply(get_divide, axis=1)

2020-12-24   -0.619963
2020-12-25   -0.244640
2020-12-26   -0.113737
2020-12-27   -0.278905
2020-12-28    0.557691
2020-12-29   -0.056351
2020-12-30   -0.136220
2020-12-31    0.029294
Freq: D, dtype: float64

In [63]:
df['average'] = df.apply(get_divide, axis=1)

In [64]:
df

Unnamed: 0,A,B,C,D,average
2020-12-24,-0.740713,-0.695086,0.255775,-1.299827,-0.619963
2020-12-25,-0.238852,-0.258937,-0.846127,0.365356,-0.24464
2020-12-26,-0.403194,-0.614242,-0.418047,0.980536,-0.113737
2020-12-27,0.894844,-2.788811,0.102951,0.675394,-0.278905
2020-12-28,1.525497,-1.051442,0.016838,1.739871,0.557691
2020-12-29,-0.017569,0.622861,-0.82707,-0.003627,-0.056351
2020-12-30,-0.466841,-0.427706,1.805073,-1.455404,-0.13622
2020-12-31,1.205652,-0.743115,0.044895,-0.390259,0.029294


##### 최대값과 최소값의 차이(혹은 거리) one-line 함수인 lambda 사용

In [65]:
df.apply(lambda x: x.max() - x.min())

A          2.266210
B          3.411672
C          2.651201
D          3.195275
average    1.177654
dtype: float64

In [66]:
df.apply(lambda x: x.max() - x.min(), axis=1)

2020-12-24    1.555602
2020-12-25    1.211483
2020-12-26    1.594778
2020-12-27    3.683655
2020-12-28    2.791313
2020-12-29    1.449931
2020-12-30    3.260477
2020-12-31    1.948767
Freq: D, dtype: float64

## 재색인(reindex)
* 새로운 색인에 맞도록 객체를 새로 생성하는 기능, row, column, index 모두 변경 가능

##### Series 객체의 reindex

In [67]:
obj = pd.Series([2.3, 4.5, -4.1, 3.5], index=["D","A","V","E"])

In [68]:
obj

D    2.3
A    4.5
V   -4.1
E    3.5
dtype: float64

In [69]:
obj = obj.reindex(["D","A","J","V","E","K"])

In [70]:
obj

D    2.3
A    4.5
J    NaN
V   -4.1
E    3.5
K    NaN
dtype: float64

In [71]:
# fill_value only fills labels that are new in this call ("G" and "U").
# "J" and "K" already hold NaN from the previous reindex, so they stay NaN.
obj = obj.reindex(["D","A","J","V","E","K","G","U"], fill_value=0.0)

In [72]:
obj

D    2.3
A    4.5
J    NaN
V   -4.1
E    3.5
K    NaN
G    0.0
U    0.0
dtype: float64

##### DataFrame reindex

In [73]:
df_re = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=["C","A","T"],
                     columns=["D","O","G"])

In [74]:
df_re

Unnamed: 0,D,O,G
C,0,1,2
A,3,4,5
T,6,7,8


In [75]:
df_re = df_re.reindex(["C","A","T","S"])

In [76]:
df_re

Unnamed: 0,D,O,G
C,0.0,1.0,2.0
A,3.0,4.0,5.0
T,6.0,7.0,8.0
S,,,


In [77]:
df_re = df_re.reindex(columns=["D","O","G","B"])

In [78]:
df_re

Unnamed: 0,D,O,G,B
C,0.0,1.0,2.0,
A,3.0,4.0,5.0,
T,6.0,7.0,8.0,
S,,,,


In [79]:
df_re2 = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=["a","b","c","d"],
                     columns=["x","y","z","w"])

In [80]:
df_re2

Unnamed: 0,x,y,z,w
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [81]:
df_re3 = df_re2.reindex(index=["a","b","c","e","d"], columns=["x","y","z","w","q"])
df_re3

Unnamed: 0,x,y,z,w,q
a,0.0,1.0,2.0,3.0,
b,4.0,5.0,6.0,7.0,
c,8.0,9.0,10.0,11.0,
e,,,,,
d,12.0,13.0,14.0,15.0,


## drop()
* index명을 사용해서 행을 삭제할 때는 ()안에 삭제할 행의 이름을 입력
* 주의할 점은 index명을 사용한다는 것이다. 

##### 'index명' 또는 'index순서'로 행 삭제

In [82]:
df9 = pd.DataFrame(np.arange(16).reshape(4,4),
                   index=['seoul','busan','daegu','incheon'],
                   columns=['one','two','three','four'])
df9

Unnamed: 0,one,two,three,four
seoul,0,1,2,3
busan,4,5,6,7
daegu,8,9,10,11
incheon,12,13,14,15


In [83]:
df_row = df9.drop(['seoul','busan'])
df_row

Unnamed: 0,one,two,three,four
daegu,8,9,10,11
incheon,12,13,14,15


##### column과 axis를 같이 줘야 column이 삭제됨

In [84]:
df_column = df9.drop(['one','two'], axis=1)
df_column

Unnamed: 0,three,four
seoul,2,3
busan,6,7
daegu,10,11
incheon,14,15


## 문자열 컬럼을 숫자형으로

* pd.to_numeric()
* astype()

##### pd.to_numeric()

In [85]:
s = pd.Series(['1.0', '2', '-3'])
s.dtypes

dtype('O')

In [86]:
s1 = pd.to_numeric(s)
s1.dtypes

dtype('float64')

In [87]:
pd.to_numeric(s, downcast='float')

0    1.0
1    2.0
2   -3.0
dtype: float32

In [88]:
pd.to_numeric(s, downcast='signed')

0    1
1    2
2   -3
dtype: int8

In [89]:
s = pd.Series(['apple', '1.0', '2', -3])

In [90]:
# With errors='ignore' the input comes back unchanged (dtype object) because
# 'apple' cannot be parsed as a number.
# NOTE(review): pandas 2.2 deprecates errors='ignore' for to_numeric — confirm
# against the pandas version this notebook targets.
pd.to_numeric(s, errors='ignore')

0    apple
1      1.0
2        2
3       -3
dtype: object

In [91]:
pd.to_numeric(s, errors='coerce')

0    NaN
1    1.0
2    2.0
3   -3.0
dtype: float64

In [92]:
df_str = pd.DataFrame({'col_str' : ['1','2','3','4','5']})

In [93]:
df_str

Unnamed: 0,col_str
0,1
1,2
2,3
3,4
4,5


In [94]:
df_str.dtypes

col_str    object
dtype: object

In [95]:
df_str['col_int'] = pd.to_numeric(df_str['col_str'])

In [96]:
df_str.dtypes

col_str    object
col_int     int64
dtype: object

In [97]:
df_str

Unnamed: 0,col_str,col_int
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5


##### astype() - Dataframe 내 모든 문자열 칼럼을 float로 한꺼번에 변환하기

In [98]:
df4 = pd.DataFrame({'col_str_1': ['1', '2', '3'], 
                   'col_str_2': ['4.1', '5.5', '6.0']})
df4

Unnamed: 0,col_str_1,col_str_2
0,1,4.1
1,2,5.5
2,3,6.0


In [99]:
df4.dtypes

col_str_1    object
col_str_2    object
dtype: object

In [100]:
df5 = df4.astype(float)
df5

Unnamed: 0,col_str_1,col_str_2
0,1.0,4.1
1,2.0,5.5
2,3.0,6.0


In [101]:
df5.dtypes

col_str_1    float64
col_str_2    float64
dtype: object

# 판다스 이용해서 데이터 파악

In [102]:
import pandas as pd

# Load the CCTV counts: UTF-8 text, with ',' treated as a thousands separator.
cctv_seoul = pd.read_csv(
    './data/cctv_in_seoul.csv',
    encoding='utf-8',
    thousands=',',
)
# Discard, in place, every row that has a missing value.
cctv_seoul.dropna(inplace=True)
cctv_seoul.head()

Unnamed: 0,기관명,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년
0,강 남 구,5221,2455,430,546,765,577,448
1,강 동 구,1879,824,59,144,194,273,385
2,강 북 구,1265,472,74,145,254,1,319
3,강 서 구,1617,492,230,187,190,264,254
4,관 악 구,3985,905,487,609,619,694,671


##### cctv의 전체 개수가 가장 적은 구 : 도봉구 < 강북구 < 종로구 < 중구 < 중랑구

In [103]:
cctv_seoul.sort_values(by='소계', ascending=True).head()

Unnamed: 0,기관명,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년
9,도 봉 구,858,271,185,59,155,117,71
2,강 북 구,1265,472,74,145,254,1,319
22,종 로 구,1471,614,132,195,148,281,101
23,중 구,1544,304,80,245,270,317,328
24,중 랑 구,1577,509,770,102,121,66,9


##### cctv의 개수가 가장 많은 구 : 강남구 > 관악구 > 구로구 > 동대문구 > 성북구

In [104]:
cctv_seoul.sort_values(by='소계', ascending=False).head()

Unnamed: 0,기관명,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년
0,강 남 구,5221,2455,430,546,765,577,448
4,관 악 구,3985,905,487,609,619,694,671
6,구 로 구,3227,1420,187,268,326,540,486
10,동대문구,3073,1070,1326,111,233,136,197
16,성 북 구,3003,1167,241,279,388,285,643


##### 2014년부터 2018년까지 최근 5년간 cctv 증가율

In [114]:
# Growth rate in percent: the sum of the 2014-2018 columns divided by the
# '2013년 이전' column, times 100.
cctv_seoul['최근증가율'] = (
    cctv_seoul['2018년']
    + cctv_seoul['2017년']
    + cctv_seoul['2016년']
    + cctv_seoul['2015년']
    + cctv_seoul['2014년']
) / cctv_seoul['2013년 이전'] * 100

In [113]:
cctv_seoul.head()

Unnamed: 0,기관명,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년,최근증가율
0,강 남 구,5221,2455,430,546,765,577,448,112.668024
1,강 동 구,1879,824,59,144,194,273,385,128.033981
2,강 북 구,1265,472,74,145,254,1,319,168.008475
3,강 서 구,1617,492,230,187,190,264,254,228.658537
4,관 악 구,3985,905,487,609,619,694,671,340.331492


##### 최근 5년간 cctv가 그 이전 대비 많이 증가한 구 : 금천구 > 영등포구 > 중구 > 동작구 > 관악구

In [112]:
cctv_seoul.sort_values(by='최근증가율', ascending=False).head()

Unnamed: 0,기관명,소계,2013년 이전,2014년,2015년,2016년,2017년,2018년,최근증가율
7,금 천 구,1634,286,101,382,136,199,530,471.328671
19,영등포구,2495,459,217,366,289,371,793,443.572985
23,중 구,1544,304,80,245,270,317,328,407.894737
11,동 작 구,1780,360,503,130,254,278,255,394.444444
4,관 악 구,3985,905,487,609,619,694,671,340.331492
