# 파일 불러오기

In [1]:
import pandas as pd

# Load the two source tables from the working directory.
df1, df2 = (pd.read_csv(path) for path in ("data1.csv", "data2.csv"))

# Inner-join on the shared 'ID' key so only rows present in both files remain.
df = df1.merge(df2, how='inner', on='ID')
# The join key and the two date columns are not used as features.
# NOTE(review): 'Date_x'/'Date_y' look like the suffixed copies of a 'Date'
# column that exists in both inputs -- confirm against the raw files.
df = df.drop(columns=['Date_x', 'Date_y', 'ID'])
df

Unnamed: 0,Gender,Age,Shift,Injury Location,Incident Type,Days Lost,Incident Cost
0,Male,25,Night,Trunk,Burn,0.0,"$5,000"
1,Female,18,Day,Abdomen,Cut,0.0,"$4,994"
2,Male,35,Day,Back,Lifting,5.0,"$4,969"
3,Female,50,Day,Legs,Lifting,2.5,4947
4,Male,25,Day,Abdomen,Lifting,3.0,"$4,940"
...,...,...,...,...,...,...,...
509,Female,31,Night,Back,Lifting,0.0,0
510,Male,31,Night,Back,Slip/trip,0.0,0
511,Male,22,Night,Abdomen,Crush & Pinch,0.0,0
512,Male,31,Night,Hands,Lifting,0.0,0


# 전처리

In [2]:
# Drop every row that has a missing value in any column.
df.dropna(inplace=True)
# Clean the cost column: keep only digits, the decimal point and the minus
# sign, so currency symbols and thousands separators ("$5,000") are removed.
# The earlier pattern r'[^\w]' also deleted '.', which would silently turn
# "1,234.50" into "123450".
# astype(str) keeps this working even if read_csv already parsed the column
# as numbers; pd.to_numeric then yields a real numeric column (and raises on
# text that still cannot be parsed) instead of leaving digit strings behind.
df['Incident Cost'] = pd.to_numeric(
    df['Incident Cost'].astype(str).str.replace(r'[^\d.\-]', '', regex=True)
)
df

Unnamed: 0,Gender,Age,Shift,Injury Location,Incident Type,Days Lost,Incident Cost
0,Male,25,Night,Trunk,Burn,0.0,5000
1,Female,18,Day,Abdomen,Cut,0.0,4994
2,Male,35,Day,Back,Lifting,5.0,4969
3,Female,50,Day,Legs,Lifting,2.5,4947
4,Male,25,Day,Abdomen,Lifting,3.0,4940
...,...,...,...,...,...,...,...
509,Female,31,Night,Back,Lifting,0.0,0
510,Male,31,Night,Back,Slip/trip,0.0,0
511,Male,22,Night,Abdomen,Crush & Pinch,0.0,0
512,Male,31,Night,Hands,Lifting,0.0,0


In [3]:
# Bucket ages into ordinal groups.
import numpy as np

# Left-inclusive bin edges: np.digitize returns i when bins[i-1] <= age < bins[i],
# i.e. 1 for [0, 30), 2 for [30, 40), 3 for [40, 50), 4 for 50 and above
# (and 0 for anything below the first edge).
bins = [0, 30, 40, 50]

age_values = df['Age'].to_numpy()
df['Age_grouping'] = np.digitize(age_values, bins=bins)

In [4]:
# List the distinct injury-location labels present in the data.
df['Injury Location'].unique()

array(['Trunk', 'Abdomen', 'Back', 'Legs', 'Neck', 'Head', 'Multiple',
       'Eye', 'Arms', 'Hands', 'Feet'], dtype=object)

In [5]:
# Count how many rows fall under each injury-location label.
df['Injury Location'].value_counts()

Back        51
Head        49
Feet        49
Legs        46
Multiple    45
Arms        44
Trunk       40
Eye         40
Abdomen     39
Hands       39
Neck        30
Name: Injury Location, dtype: int64

In [6]:
# Collapse the eleven injury-location labels into five integer codes.
# Each code lists the original labels that are merged into it.
injury_groups = {
    0: ('Trunk', 'Abdomen', 'Back'),
    1: ('Legs', 'Feet'),
    2: ('Neck', 'Head'),
    3: ('Multiple', 'Eye'),
    4: ('Arms', 'Hands'),
}
# Invert the grouping into the label -> code lookup used by Series.map.
map_Injury = {
    location: code
    for code, locations in injury_groups.items()
    for location in locations
}
# Labels missing from the lookup become NaN, so this cell must run only once
# on the original string labels.
df['Injury Location'] = df['Injury Location'].map(map_Injury)
df

Unnamed: 0,Gender,Age,Shift,Injury Location,Incident Type,Days Lost,Incident Cost,Age_grouping
0,Male,25,Night,0,Burn,0.0,5000,1
1,Female,18,Day,0,Cut,0.0,4994,1
2,Male,35,Day,0,Lifting,5.0,4969,2
3,Female,50,Day,1,Lifting,2.5,4947,4
4,Male,25,Day,0,Lifting,3.0,4940,1
...,...,...,...,...,...,...,...,...
509,Female,31,Night,0,Lifting,0.0,0,2
510,Male,31,Night,0,Slip/trip,0.0,0,2
511,Male,22,Night,0,Crush & Pinch,0.0,0,1
512,Male,31,Night,4,Lifting,0.0,0,2


In [7]:
# Rescale 'Days Lost' and 'Incident Cost' with min-max normalisation.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit on both raw columns at once; the result has one output column per
# input column, in the same order.
raw_columns = ['Days Lost', 'Incident Cost']
scaled_values = scaler.fit_transform(df[raw_columns])
df['scaled_DaysLost'] = scaled_values[:, 0]
df['scaled_IncidentCost'] = scaled_values[:, 1]

df

Unnamed: 0,Gender,Age,Shift,Injury Location,Incident Type,Days Lost,Incident Cost,Age_grouping,scaled_DaysLost,scaled_IncidentCost
0,Male,25,Night,0,Burn,0.0,5000,1,0.0,1.0000
1,Female,18,Day,0,Cut,0.0,4994,1,0.0,0.9988
2,Male,35,Day,0,Lifting,5.0,4969,2,1.0,0.9938
3,Female,50,Day,1,Lifting,2.5,4947,4,0.5,0.9894
4,Male,25,Day,0,Lifting,3.0,4940,1,0.6,0.9880
...,...,...,...,...,...,...,...,...,...,...
509,Female,31,Night,0,Lifting,0.0,0,2,0.0,0.0000
510,Male,31,Night,0,Slip/trip,0.0,0,2,0.0,0.0000
511,Male,22,Night,0,Crush & Pinch,0.0,0,1,0.0,0.0000
512,Male,31,Night,4,Lifting,0.0,0,2,0.0,0.0000


In [8]:
# Remove the raw columns that are no longer needed now that their derived
# counterparts (Age_grouping, scaled_DaysLost, scaled_IncidentCost) exist.
obsolete_columns = ['Age', 'Days Lost', 'Incident Cost']
df.drop(columns=obsolete_columns, inplace=True)
df

Unnamed: 0,Gender,Shift,Injury Location,Incident Type,Age_grouping,scaled_DaysLost,scaled_IncidentCost
0,Male,Night,0,Burn,1,0.0,1.0000
1,Female,Day,0,Cut,1,0.0,0.9988
2,Male,Day,0,Lifting,2,1.0,0.9938
3,Female,Day,1,Lifting,4,0.5,0.9894
4,Male,Day,0,Lifting,1,0.6,0.9880
...,...,...,...,...,...,...,...
509,Female,Night,0,Lifting,2,0.0,0.0000
510,Male,Night,0,Slip/trip,2,0.0,0.0000
511,Male,Night,0,Crush & Pinch,1,0.0,0.0000
512,Male,Night,4,Lifting,2,0.0,0.0000


In [9]:
#map_Injury = {'Trunk' : 0, 'Abdomen' : 1, 'Back' : 2, 'Legs' : 3, 'Neck':4, 'Head' : 5, 'Multiple':6,
      # 'Eye':7, 'Arms': 8, 'Hands' : 9, 'Feet' : 10}
#df['target'] = df['Injury Location'].map(map_Injury)
#df

In [10]:
# Encode the categorical features: integer labels for Gender and Shift,
# one-hot columns for Incident Type.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# The same encoder object is refit for each column, so after this cell `le`
# only remembers the classes of 'Shift'.
df['Gender'] = le.fit_transform(df['Gender'])
df['Shift'] = le.fit_transform(df['Shift'])
# Pin the dummy dtype explicitly: pandas >= 2.0 produces bool dummies by
# default, which would show up (and be saved to CSV) as True/False instead of
# the 0/1 indicator columns this notebook expects. 'uint8' is the dtype that
# older pandas versions used by default.
df = pd.get_dummies(df, columns=['Incident Type'], dtype='uint8')
# Disabled alternative: also one-hot encode 'Injury Location'.
#df=pd.get_dummies(df, columns = ['Injury Location'])
df

Unnamed: 0,Gender,Shift,Injury Location,Age_grouping,scaled_DaysLost,scaled_IncidentCost,Incident Type_Burn,Incident Type_Crush & Pinch,Incident Type_Cut,Incident Type_Equipment,Incident Type_Fall,Incident Type_Falling object,Incident Type_Lifting,Incident Type_Slip/trip,Incident Type_Vehicle
0,1,1,0,1,0.0,1.0000,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0.0,0.9988,0,0,1,0,0,0,0,0,0
2,1,0,0,2,1.0,0.9938,0,0,0,0,0,0,1,0,0
3,0,0,1,4,0.5,0.9894,0,0,0,0,0,0,1,0,0
4,1,0,0,1,0.6,0.9880,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,0,1,0,2,0.0,0.0000,0,0,0,0,0,0,1,0,0
510,1,1,0,2,0.0,0.0000,0,0,0,0,0,0,0,1,0
511,1,1,0,1,0.0,0.0000,0,1,0,0,0,0,0,0,0
512,1,1,4,2,0.0,0.0000,0,0,0,0,0,0,1,0,0


In [11]:
# Show the current column names of df.
df.columns

Index(['Gender', 'Shift', 'Injury Location', 'Age_grouping', 'scaled_DaysLost',
       'scaled_IncidentCost', 'Incident Type_Burn',
       'Incident Type_Crush & Pinch', 'Incident Type_Cut',
       'Incident Type_Equipment', 'Incident Type_Fall',
       'Incident Type_Falling object', 'Incident Type_Lifting',
       'Incident Type_Slip/trip', 'Incident Type_Vehicle'],
      dtype='object')

# csv저장

In [12]:
# Write the DataFrame to 'data.csv' in the working directory. With the pandas
# defaults used here the row index is written as the first, unnamed column.
output_path = 'data.csv'
df.to_csv(output_path)