### Import Common Modules

In [1]:
import import_ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, roc_auc_score, \
                                precision_score, f1_score, recall_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import classification_report

import matplotlib as mpl
from matplotlib import font_manager, rc
# Plot font: use the TrueType file below when it is available (presumably so the
# Korean labels in this dataset render in figures -- confirm). The path only
# exists on Windows, so guard it instead of letting the whole import cell fail
# on other machines.
import os
import warnings

KOREAN_FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.isfile(KOREAN_FONT_PATH):
    font_name = font_manager.FontProperties(fname=KOREAN_FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    warnings.warn(
        "Font file not found: {}. Keeping matplotlib's default font; "
        "non-ASCII labels may not render correctly.".format(KOREAN_FONT_PATH)
    )
# Draw negative tick labels with an ASCII hyphen rather than the Unicode minus sign.
mpl.rcParams['axes.unicode_minus'] = False

# Full option names: abbreviated keys only work while they match exactly one option.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from traffic_common import get_category_age, get_category_season, get_category_time, drop_features, cleansing, \
                            encode_features, conv2XYarr, transform_dataframe, bar_chart, pie_chart

importing Jupyter notebook from traffic_common.ipynb


### Load Dataset and Pre-processing

In [2]:
# Read the accident CSV; the file is decoded as EUC-KR.
traffic_df = pd.read_csv(
    filepath_or_buffer='dataset/seoul_traffic.csv',
    encoding='euc-kr',
)

In [3]:
# Run the shared pre-processing helper imported from traffic_common.
# NOTE(review): this cell's output shows a pandas SettingWithCopyWarning, so the
# helper presumably assigns into a slice of the frame -- fix inside
# traffic_common (use .loc or an explicit .copy()) and confirm.
# NOTE(review): the result is bound to the same name as the input, so re-running
# this cell without re-running the load cell feeds already-transformed data
# back into the helper.
traffic_df = transform_dataframe(traffic_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
# Preview the first five rows of the transformed frame.
traffic_df.head(n=5)

Unnamed: 0,acc_details,occur_date,occur_time,occur_dayofweek,acc_type_b,offender_violation,road_condition_b,wheather_status,road_type_b,offender_vehicle,offender_sex,offender_age,victim_vehicle,victim_sex,victim_age
0,경상,Winter,Dawn,금,기타,기타,젖음/습기,맑음,기타단일로,승용차,남,중년,이륜차,남,미성년
1,중상,Winter,Dawn,금,추돌,안전거리 미확보,서리/결빙,맑음,기타단일로,승합차,남,중년,승용차,남,장년
2,경상,Winter,Dawn,금,추돌,기타,서리/결빙,흐림,기타단일로,승용차,남,중년,승용차,남,중년
3,경상,Winter,Dawn,금,기타,안전거리 미확보,건조,맑음,교차로부근,승용차,남,중년,승용차,남,중년
4,경상,Winter,Dawn,금,정면충돌,신호위반,건조,맑음,교차로내,승용차,남,청년,승용차,남,중년


### Hypothesis Test [Chi-Square]

In [5]:
import scipy.stats as stats

In [6]:
def chisquare(x, y):
    """Run a chi-square test of independence between two categorical Series.

    Builds the contingency table of ``x`` against ``y`` with ``pd.crosstab``,
    passes it to ``scipy.stats.chi2_contingency`` and prints the pair of
    Series names followed by the test statistic and p-value.

    Parameters
    ----------
    x, y : pandas.Series
        Equal-length categorical variables; their ``.name`` is used in the
        printed header.

    Returns
    -------
    tuple
        ``(statistic, p_value)`` as returned by ``chi2_contingency``, so the
        results can be collected or ranked instead of only being read from
        the printed text.
    """
    contingency = pd.crosstab(x, y)
    result = stats.chi2_contingency(observed=contingency)
    # Positional access works for both the plain tuple and the result object
    # that chi2_contingency returns, depending on the SciPy version.
    statistic, p_value = result[0], result[1]
    print(x.name, ' -> ', y.name)
    print('Chi-Square Statistic: {}, p-value: {}\n'.format(statistic, p_value))
    return statistic, p_value

In [7]:
# Test each remaining column against the `acc_details` target. The target is
# excluded by label rather than by assuming it sits in column position 0, so
# the loop stays correct if the column order ever changes.
target_col = traffic_df['acc_details']
for feature in traffic_df.columns.drop('acc_details'):
    chisquare(traffic_df[feature], target_col)

occur_date  ->  acc_details
Chi-Square Statistic: 27.96606642908225, p-value: 3.692092143863781e-06

occur_time  ->  acc_details
Chi-Square Statistic: 87.59522912538839, p-value: 9.526594391484761e-20

occur_dayofweek  ->  acc_details
Chi-Square Statistic: 30.3162809795288, p-value: 3.4222854791609286e-05

acc_type_b  ->  acc_details
Chi-Square Statistic: 12146.47749855585, p-value: 0.0

offender_violation  ->  acc_details
Chi-Square Statistic: 6018.040429066616, p-value: 0.0

road_condition_b  ->  acc_details
Chi-Square Statistic: 57.541566207608504, p-value: 1.9692392076530776e-12

wheather_status  ->  acc_details
Chi-Square Statistic: 77.94896573767522, p-value: 8.451045934525957e-17

road_type_b  ->  acc_details
Chi-Square Statistic: 2749.032679895525, p-value: 0.0

offender_vehicle  ->  acc_details
Chi-Square Statistic: 1418.3680913581936, p-value: 4.093878918069799e-302

offender_sex  ->  acc_details
Chi-Square Statistic: 8.009410809656202, p-value: 0.004653487579489117

offender