In [50]:
import xml.etree.ElementTree as ET

In [51]:
import pandas as pd

In [52]:
tree=ET.parse('bankrupt_cases.xml')

In [53]:
root=tree.getroot() # 최상위태그 파싱

In [54]:
# 단위를 입력받아서 무슨 단위가 있는지 확인

a_units=[] # asset 단위
l_units=[] # liabilities 단위
for x in root:
    a_units.append(x.find('assets').attrib['unit'])
    l_units.append(x.find('liabilities').attrib['unit'])

a_unitset=set(a_units)
l_unitset=set(l_units)
print(a_unitset)
print(l_unitset)
# 결과 : assets과 liaibilites 단위는 million밖에 없음

{'millions'}
{'millions'}


In [55]:
df_cols=["district","state","company_name","assets","assets unit","liabilities","liabilities unit"]

rows=[]

for node in root:
    try:
        s_assets=float(node.find('assets').text)
    except:
        s_assets=None
        
    try:
        s_liabilities=float(node.find('liabilities').text)
    except:
        s_liabilities=None
    
    s_district=node.find('district').text
    s_state=node.find('state').text
    s_company_name=node.find('company_name').text
    u_assets=node.find('assets').attrib['unit']
    u_liabilities=node.find('liabilities').attrib['unit']
    rows.append({'district':s_district,'state':s_state,'company_name':s_company_name,'assets':s_assets,'assets unit':u_assets,
                 'liabilities':s_liabilities,'liabilities unit':u_liabilities})
    
out_df=pd.DataFrame(rows,columns=df_cols)
out_df

Unnamed: 0,district,state,company_name,assets,assets unit,liabilities,liabilities unit
0,D,NV,"Ad Systems Communications, Inc.",404.586,millions,3.772,millions
1,D,NV,Alphatrade.com,685.694,millions,3.684,millions
2,SD,NY,AMBAC Financial Group,394.500,millions,1682.600,millions
3,D,DE,"Ambassdors International , Inc.",86.441,millions,87.315,millions
4,D,NV,American Pacific Financial Corporation,19.175,millions,161.084,millions
...,...,...,...,...,...,...,...
57,D,DE,"Trico Marine Services, Inc.",30.563,millions,353.606,millions
58,D,DE,Vaso Active Pharmaceuticals,645.000,millions,10.187,millions
59,D,NJ,Visual Management Systems,273.000,millions,12.249,millions
60,D,DE,"Waste2Energy Holdings, Inc.",6.000,millions,17.000,millions


In [56]:
# 요약정보 출력
out_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   district          62 non-null     object 
 1   state             62 non-null     object 
 2   company_name      62 non-null     object 
 3   assets            58 non-null     float64
 4   assets unit       62 non-null     object 
 5   liabilities       61 non-null     float64
 6   liabilities unit  62 non-null     object 
dtypes: float64(2), object(5)
memory usage: 3.5+ KB


In [57]:
# 범주형 데이터 개수 확인

for col in ['state','district']:
    print('*'*3,col,'*'*3)
    print(out_df[col].value_counts(dropna=False))
    print('\n')

*** state ***
DE     17
NY     11
NV      6
CA      6
TX      4
FL      3
MA      2
NJ      2
GA      2
NC      1
CA      1
VA      1
NE      1
WA      1
AZ      1
KS      1
DFL     1
KY      1
Name: state, dtype: int64


*** district ***
D     30
SD    16
ND     5
ED     5
CD     3
MD     2
WD     1
Name: district, dtype: int64




In [58]:
# 각 열마다 누락데이터 개수
out_df.isnull().sum(axis=0)

district            0
state               0
company_name        0
assets              4
assets unit         0
liabilities         1
liabilities unit    0
dtype: int64

In [61]:
# 누락데이터 처리
# asset 열은 누락데이터 개수 4개이므로 데이터 손실이 우려되 평균값으로 대체한다.
# liabilities 열은 누락데이터 개수 1개이므로 삭제한다.(행 삭제)

df_lia=out_df.dropna(subset=['liabilities'],how='any').copy()
len(df_lia)



61

In [62]:
# asset 열 누락 데이터 처리
mean_asset=df_lia['assets'].mean()
df_lia['assets'].fillna(mean_asset,inplace=True)

In [63]:
# 누락데이터 처리가 잘됐는지 검토
df_lia['liabilities'].value_counts(dropna=False)

2.345        1
39700.000    1
1.142        1
353.606      1
270.000      1
            ..
749.000      1
10.465       1
2.069        1
237.548      1
6.925        1
Name: liabilities, Length: 61, dtype: int64

In [64]:
df_lia.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 61
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   district          61 non-null     object 
 1   state             61 non-null     object 
 2   company_name      61 non-null     object 
 3   assets            61 non-null     float64
 4   assets unit       61 non-null     object 
 5   liabilities       61 non-null     float64
 6   liabilities unit  61 non-null     object 
dtypes: float64(2), object(5)
memory usage: 3.8+ KB


In [65]:
df_lia['assets'].value_counts(dropna=False)

1013.262172     3
14.660000       1
21.031000       1
377.383000      1
41.839000       1
37.392000       1
10.254000       1
133.033000      1
19.175000       1
5.856000        1
6.000000        1
500.000000      1
1.496000        1
326.008000      1
12.448000       1
1275.431000     1
310.773000      1
1017.036000     1
645.000000      1
424.470000      1
2.084000        1
86.441000       1
304.000000      1
16.374000       1
36.453000       1
1536.176000     1
1.771000        1
1.064000        1
2.080000        1
41000.000000    1
65.561000       1
650.760000      1
119.791000      1
2531.032000     1
321.263000      1
92.025000       1
19.836000       1
524.722000      1
9.800000        1
504.897000      1
859.000000      1
685.694000      1
404.586000      1
10.000000       1
418.000000      1
273.000000      1
9.757000        1
388.000000      1
527.433000      1
115.624000      1
13.600000       1
436.191000      1
873.252000      1
20.716000       1
394.500000      1
30.563000 