# Data Analysis: Pandas DataFrame 개요
API Reference: https://pandas.pydata.org/docs/reference/frame.html

## 1. 라이브러리 및 파일 연결

In [14]:
# 라이브러리 연결
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

 # 파일(csv, etc.) 연결
# Absolute Windows path to the source CSV. The backslashes are doubled because
# a bare "\U" inside a normal string literal starts a unicode escape and raises
# "(unicode error) 'unicodeescape' codec can't decode bytes in position 2-3".
# NOTE(review): this location exists only on the author's machine -- point it
# at your own copy of the file before running.
file_path = "C:\\Users\\juyou\\Desktop\\pandas_test\\data_csv\\경찰청_전국 범죄 발생 및 검거 현황_20211231.csv"

# Per the author's note, reading without an explicit encoding raised a decoding
# error, so the cp949 (Korean Windows) code page is passed explicitly.
df = pd.read_csv(file_path, encoding='cp949')

# See also (other ways to build a DataFrame):
#   pandas.DataFrame.from_records
#   pandas.DataFrame.from_dict
#   pandas.read_table
#   pandas.read_clipboard

## 2. Pandas로 불러온 데이터의 정보 확인

### 1) pandas.DataFrame.info([verbose, buf, max_cols, ...])
Print a concise summary of a DataFrame. <br/>
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html#pandas.DataFrame.info


In [15]:
# Print a summary of df: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   범죄대분류    473 non-null    object
 1   범죄중분류    473 non-null    object
 2   범죄소분류    473 non-null    object
 3   발생       473 non-null    int64 
 4   검거       473 non-null    int64 
 5   검거인원(남)  473 non-null    int64 
 6   검거인원(여)  473 non-null    int64 
 7   불상       473 non-null    int64 
 8   법인체      473 non-null    int64 
dtypes: int64(6), object(3)
memory usage: 33.4+ KB


### 2) pandas.DataFrame.columns
The column labels of the DataFrame.


In [16]:
# Show the column labels of df (returned as an Index object).
df.columns

Index(['범죄대분류', '범죄중분류', '범죄소분류', '발생', '검거', '검거인원(남)', '검거인원(여)', '불상',
       '법인체'],
      dtype='object')

### 3) pandas.DataFrame.head([n])
Return the first n rows.


In [17]:
# Preview the first three rows of the loaded frame.
df.head(n=3)

Unnamed: 0,범죄대분류,범죄중분류,범죄소분류,발생,검거,검거인원(남),검거인원(여),불상,법인체
0,강력범죄,살인기수,살인,204,196,187,54,4,1
1,강력범죄,살인기수,영아살해,5,5,0,5,0,0
2,강력범죄,살인기수,존속살해,26,27,24,3,0,0


### 4) pandas.DataFrame.shape
Return a tuple representing the dimensionality of the DataFrame.

In [18]:
# Dimensions of df as a (number of rows, number of columns) tuple.
df.shape

(473, 9)

### 5) pandas.DataFrame.describe
Generate descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns. <br/>
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html#pandas.DataFrame.describe

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,발생,검거,검거인원(남),검거인원(여),불상,법인체
count,473.0,473.0,473.0,473.0,473.0,473.0
mean,3022.887949,2403.097252,2119.196617,589.48203,167.44186,21.811839
std,17450.528095,13094.894755,10635.876358,3021.523538,2029.422668,117.143095
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,3.0,4.0,0.0,0.0,0.0
50%,70.0,57.0,66.0,7.0,0.0,0.0
75%,677.0,523.0,513.0,89.0,11.0,2.0
max,269825.0,170980.0,125221.0,38280.0,43082.0,1539.0


### 6) pandas.DataFrame.value_counts
Return a <u>Series</u> containing counts of unique rows in the DataFrame. <br/>
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.value_counts.html#pandas.DataFrame.value_counts <br/>
**What is a Series in pandas?** <br/>
A pandas Series is a one-dimensional labeled array. It can hold any data type supported in Python and uses its index labels to locate each data value for retrieval.


In [20]:
# Count how many rows carry each distinct value of the '범죄대분류' column.
# The result is a Series indexed by those values; pass any other label from
# df.columns as `subset` to count a different column.
df_val = df.value_counts(subset='범죄대분류')

# Display the counts -- the assignment alone produces no cell output.
df_val

### 7) Boolean Indexing
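ex) `pandas.DataFrame[DataFrame[column_name]==column_value]` <br/>
--> Used to filter a DataFrame by keeping only the rows where a boolean condition on a column is True <br/>
--> Comparison operators: `==`, `!=`, `>=`, `<=`, `>`, `<` <br/>
--> Logical operators: combine conditions with the element-wise `&`, `|`, `~` (wrap each condition in parentheses) rather than Python's `and`, `or`, `not`, which raise an error on a Series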

In [22]:
# Element-wise comparison: True for every row whose '범죄대분류' value is '폭력범죄'.
violence_mask = df['범죄대분류'] == '폭력범죄'

# Indexing with the boolean Series keeps only the rows marked True.
df_violence = df[violence_mask]
df_violence

Unnamed: 0,범죄대분류,범죄중분류,범죄소분류,발생,검거,검거인원(남),검거인원(여),불상,법인체
112,폭력범죄,상해,상해,27826,27064,29379,5849,109,0
113,폭력범죄,상해,상해(중),43,38,40,4,0,0
114,폭력범죄,상해,상해(상습),17,16,17,5,0,0
115,폭력범죄,상해,상해(상습중),0,0,0,0,0,0
116,폭력범죄,상해,상해치사,35,33,35,3,0,0
...,...,...,...,...,...,...,...,...,...
196,폭력범죄,공갈,특경법(공갈),9,10,34,10,0,0
197,폭력범죄,공갈,아동학대처벌법위반(상습공갈),0,0,0,0,0,0
198,폭력범죄,공갈,특수공갈,85,79,174,22,6,0
199,폭력범죄,손괴,손괴의죄,54188,33312,30406,5789,386,9


In [24]:
# Number of rows that passed the filter (first entry of the shape tuple).
df_violence.shape[0]

89

### 8) pandas.DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwargs)
Apply a function along an axis of the DataFrame.