## import modules

In [1]:
# 데이터분석 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import NaN as NA

# 시각화 
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
%matplotlib inline 
import seaborn as sns 
sns.set(font="Malgun Gothic", #Malgun Gothic 
        rc={"axes.unicode_minus":False},
        style='whitegrid')

# warning ignore 
import warnings
warnings.filterwarnings('ignore') 

## 데이터 불러오기 

In [2]:
# Load the full dataset from the CSV file in the working directory.
df = pd.read_csv('temp_data.csv')
# Bare expression: shows the frame as the cell output.
df

Unnamed: 0,police,year,범죄발생_절도,범죄발생_폭력,범죄발생_강도,범죄발생_살인,범죄발생_강간/강제추행,범죄검거_강간/강제추행,범죄검거_절도,범죄검거_폭력,...,1인가구_30대_여자,1인가구_40대_여자,1인가구_50대_여자,1인가구_60대이상_여자,1인가구합계_여자,rob_satety_q1,mur_safety_q2,ta_safety_q3,raw_odder_q4,overall_q5
0,마산동부경찰서,0,61,91,0,0,5,5,36,82,...,820,1156,2124,5507,10347,74.6,77.0,64.0,65.7,71.2
1,마산동부경찰서,1,72,102,0,0,6,5,36,82,...,791,1123,2070,5614,10338,71.7,73.5,64.9,63.6,70.3
2,마산동부경찰서,2,57,90,0,0,5,4,40,78,...,762,1091,2016,5721,10330,76.0,79.0,67.5,70.0,72.6
3,마산동부경찰서,3,72,107,0,0,6,6,33,93,...,745,1047,1976,5943,10486,72.4,75.6,67.9,70.7,71.8
4,마산동부경찰서,4,67,97,0,0,7,6,35,82,...,729,1003,1936,6166,10643,76.1,78.2,66.4,69.9,72.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,창원중부경찰서,2,79,112,1,0,8,6,56,107,...,2326,2414,3059,5247,15151,78.0,80.1,65.3,68.6,72.9
283,창원중부경찰서,3,82,120,1,0,6,6,52,103,...,2316,2425,3124,5470,15552,75.7,77.7,67.8,70.6,74.2
284,창원중부경찰서,4,93,111,0,0,8,7,67,99,...,2305,2437,3190,5695,15954,80.8,82.8,69.4,73.8,76.1
285,창원중부경찰서,5,49,88,0,0,7,11,66,109,...,2336,2463,3220,6051,16576,77.6,79.6,70.9,73.6,74.1


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 63 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   police           287 non-null    object 
 1   year             287 non-null    int64  
 2   범죄발생_절도          287 non-null    int64  
 3   범죄발생_폭력          287 non-null    int64  
 4   범죄발생_강도          287 non-null    int64  
 5   범죄발생_살인          287 non-null    int64  
 6   범죄발생_강간/강제추행     287 non-null    int64  
 7   범죄검거_강간/강제추행     287 non-null    int64  
 8   범죄검거_절도          287 non-null    int64  
 9   범죄검거_폭력          287 non-null    int64  
 10  범죄검거_강도          287 non-null    int64  
 11  범죄검거_살인          287 non-null    int64  
 12  acc_case         287 non-null    int64  
 13  경찰서 수            287 non-null    int64  
 14  경찰 수             287 non-null    int64  
 15  면적               287 non-null    float64
 16  경찰서 수/면적         287 non-null    float64
 17  경찰 수/면적         


## cctv/보안등수 자료 수정 
- 몇몇 지역 경찰서에 누락된 cctv수와 보안등수 자료 보충 

In [4]:
# CCTV counts: overwrite the 'cctv수' value of every row whose police-station
# name contains the given keyword.
cctv_by_station_keyword = {
    '강동': 2543,
    '송파': 2732,
    '도봉': 1090,
    '동대문': 2101,
}
for station_keyword, cctv_count in cctv_by_station_keyword.items():
    matching_rows = df[df['police'].str.contains(station_keyword)]
    df.loc[matching_rows.index, 'cctv수'] = cctv_count



In [5]:
# 보안등수 
security_lights_by_station_keyword = {
    '양천': 7367,
    '송파': 8816,
    '동대문': 10729,
}
# Overwrite '보안등수' for every row whose police-station name contains the keyword.
for station_keyword, light_count in security_lights_by_station_keyword.items():
    matching_rows = df[df['police'].str.contains(station_keyword)]
    df.loc[matching_rows.index, '보안등수'] = light_count




## 비율 변수 추가 
- raw data를 인구·면적 기준으로 정규화하여 지역 간 비교가 가능하도록 비율 변수들을 추가해준다. 
        - 예) 전체인구에서 남성비율, 인구천명당 경찰서수, 지역 면적당 cctv수 

In [6]:
# Add ratio features (columns are appended to `df` in place, in this order).

# Denominators shared by many of the ratios below.
population = df['popu_num']
area = df['면적']
solo_total = df['1인가구합계_계']
foreign_total = df['외국인인구_계']


def _per_1000(values):
    """Return `values` expressed per 1,000 residents (values / popu_num * 1000)."""
    return values / population * 1000


# --- Population features ---
df['남성비율'] = df['popu_male_num'] / population

df['1인_남성비율'] = df['1인가구합계_남자'] / solo_total
# NOTE: unlike the other ratios in this cell, this one is scaled by 100 (percent).
solo_women_20s_30s = df['1인가구_20대_여자'] + df['1인가구_30대_여자']
df['1인_230대여성_비율_총'] = solo_women_20s_30s / df['popu_female_num'] * 100
solo_teens_to_30s = df['1인가구_10대_계'] + df['1인가구_20대_계'] + df['1인가구_30대_계']
df['1인_1230비율'] = solo_teens_to_30s / solo_total
solo_40s_50s = df['1인가구_40대_계'] + df['1인가구_50대_계']
df['1인_450비율'] = solo_40s_50s / solo_total
df['1인_60이상비율'] = df['1인가구_60대이상_계'] / solo_total

df['외국인_남성비율'] = df['외국인인구_남자'] / foreign_total
df['외국인_전체비율'] = foreign_total / population
df['기초생활수급_비율'] = df['기초생활수급자'] / population


# --- Police features ---

# Combined theft + violence counts (occurrences, arrests, 112 reports).
df['범죄발생_절도폭력'] = df['범죄발생_절도'] + df['범죄발생_폭력']
df['범죄검거_절도폭력'] = df['범죄검거_절도'] + df['범죄검거_폭력']
df['112신고_절도폭력'] = df['112신고_폭력'] + df['112신고_절도']

# Station density times area — presumably recovers the station count; TODO confirm.
df['천명당_경찰서수'] = _per_1000(df['경찰서 수/면적'] * area)
df['천명당_경찰수'] = _per_1000(df['경찰 수'])
df['천명당_범죄발생_절도폭력'] = _per_1000(df['범죄발생_절도폭력'])
df['천명당_범죄발생_강간추행'] = _per_1000(df['범죄발생_강간/강제추행'])
df['천명당_범죄발생_강도살인'] = _per_1000(df['범죄발생_강도'] + df['범죄발생_살인'])
df['천명당_교통사고수'] = _per_1000(df['acc_case'])

# 112 report counts per 1,000 residents: new column -> source column.
report_rate_sources = {
    '천명당_신고_절도폭력': '112신고_절도폭력',
    '천명당_신고_교통불편': '112신고_교통불편',
    '천명당_신고_교통사고': '112신고_교통사고',
    '천명당_신고_교통위반': '112신고_교통위반',
    '천명당_신고_법질서': '112신고_법질서',
}
for rate_column, count_column in report_rate_sources.items():
    df[rate_column] = _per_1000(df[count_column])

# Arrests divided by occurrences.
# NOTE(review): rows with zero occurrences produce inf/NaN here — confirm
# that downstream steps handle them.
df['검거/발생_절도폭력'] = df['범죄검거_절도폭력'] / df['범죄발생_절도폭력']
df['검거/발생_강간추행'] = df['범죄검거_강간/강제추행'] / df['범죄발생_강간/강제추행']


# --- Geographic features ---
# Facility counts per 1,000 residents: new column -> source column.
facility_rate_sources = {
    '천명당_유흥업소수': '유흥업소수',
    '천명당_cctv수': 'cctv수',
    '천명당_보안등수': '보안등수',
    '천명당_공원수': '공원현황',
}
for rate_column, count_column in facility_rate_sources.items():
    df[rate_column] = _per_1000(df[count_column])

# Facility counts per unit of area: new column -> source column.
facility_density_sources = {
    '면적당_공원수': '공원현황',
    '면적당_유흥업소수': '유흥업소수',
    '면적당_보안등수': '보안등수',
    '면적당_cctv수': 'cctv수',
}
for density_column, count_column in facility_density_sources.items():
    df[density_column] = df[count_column] / area





In [7]:
# Remove the columns that are not used from here on.
# Re-running this cell on the same frame raises KeyError, because the
# columns are already gone after the first in-place drop.
unused_columns = [
    '112신고_절도',
    '112신고_폭력',
    '경찰서 수',
    '범죄검거_절도',
    '범죄검거_폭력',
    '범죄발생_절도',
    '범죄발생_폭력',
]
df.drop(columns=unused_columns, inplace=True)


In [8]:
# Tidy the column order: move the five *_q1..q5 columns to the end of the
# frame, keeping every other column in its current relative position.
ans = ['rob_satety_q1', 'mur_safety_q2',
       'ta_safety_q3', 'raw_odder_q4', 'overall_q5']
cl = [column for column in df.columns.to_list() if column not in ans]
df = df[cl + ans]
df

Unnamed: 0,police,year,범죄발생_강도,범죄발생_살인,범죄발생_강간/강제추행,범죄검거_강간/강제추행,범죄검거_강도,범죄검거_살인,acc_case,경찰 수,...,천명당_공원수,면적당_공원수,면적당_유흥업소수,면적당_보안등수,면적당_cctv수,rob_satety_q1,mur_safety_q2,ta_safety_q3,raw_odder_q4,overall_q5
0,마산동부경찰서,0,0,0,5,5,0,0,30,185,...,0.320412,0.703934,3.562987,61.155591,6.313743,74.6,77.0,64.0,65.7,71.2
1,마산동부경찰서,1,0,0,6,5,0,0,36,185,...,0.324557,0.703934,3.573817,61.155591,7.472527,71.7,73.5,64.9,63.6,70.3
2,마산동부경찰서,2,0,0,5,4,0,0,34,185,...,0.328809,0.703934,3.584647,61.155591,9.313584,76.0,79.0,67.5,70.0,72.6
3,마산동부경찰서,3,0,0,6,6,0,0,31,185,...,0.333224,0.703934,3.584647,61.155591,9.941709,72.4,75.6,67.9,70.7,71.8
4,마산동부경찰서,4,0,0,7,6,0,0,36,185,...,0.337757,0.703934,3.595477,61.155591,11.609491,76.1,78.2,66.4,69.9,72.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,창원중부경찰서,2,1,0,8,6,1,0,56,201,...,0.386518,1.416245,5.748288,30.147271,9.309727,78.0,80.1,65.3,68.6,72.9
283,창원중부경찰서,3,1,0,6,6,1,0,58,201,...,0.388677,1.416245,5.789942,30.147271,10.205294,75.7,77.7,67.8,70.6,74.2
284,창원중부경찰서,4,0,0,8,7,0,0,52,201,...,0.390861,1.416245,5.800356,30.147271,13.308536,80.8,82.8,69.4,73.8,76.1
285,창원중부경찰서,5,0,0,7,11,0,0,56,201,...,0.392087,1.416245,5.852424,30.147271,14.349893,77.6,79.6,70.9,73.6,74.1


## 변수 이름 바꾸기 
- 변수 이름의 오타를 수정해준다 

In [9]:
# Correct the misspelled column names (old name -> new name), in place.
column_typo_fixes = {
    'rob_satety_q1': 'rob_safety_q1',
    'raw_odder_q4': 'law_order_q4',
}
df.rename(columns=column_typo_fixes, inplace=True)

## 1719/20 나누기 (year < 6 → data, year == 6 → data20)

In [10]:
# Split on the `year` column: rows with year < 6 go to `data`,
# rows with year == 6 go to `data20`.
year_index = df['year']
data = df[year_index.lt(6)]
data20 = df[year_index.eq(6)]

# 저장 

In [11]:
# Export step, currently disabled. If uncommented, each line writes one CSV
# encoded as 'utf-8-sig' (UTF-8 with a BOM), with a header row and no index.
# NOTE(review): the last line exports the final five columns via iloc[:, -5:];
# presumably these are the five *_q1..q5 columns moved to the end in the
# column-ordering cell — confirm before re-enabling.
# df.to_csv('전체데이터_독립+종속_1720.csv', encoding='utf-8-sig', header=True, index=False)
# data20.to_csv('전체데이터_독립+종속_20.csv', encoding='utf-8-sig', header=True, index=False)
# data.to_csv('전체데이터_독립+종속_1719.csv', encoding='utf-8-sig', header=True, index=False)
# data.iloc[:, -5:].to_csv('전체데이터_종속_1719.csv', encoding='utf-8-sig', header=True, index=False)
