In [40]:
import pandas
from numpy import nan, histogram

In [53]:
df = pandas.read_csv('auto-mpg.csv', header=None) 
df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
              'model year', 'origin', 'name']

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [54]:
df['horsepower'].unique() # 고유값을 보고 이상값을 찾아봄

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

## 자료형 구분할때 유용한 방법 ! (예외처리구문)

In [55]:
for x in df['horsepower'].unique():
    try:
        float(x)
    except:
        print(x)

?


## ?로 들어간값을 nan값으로 우선 바꿔놓고

In [56]:
df['horsepower'].replace('?',nan,inplace=True)

In [57]:
df['horsepower'].unique() # 잘 바뀌었는지 확인

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', nan, '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

## nan값을 지운다

In [58]:
df.dropna(subset=['horsepower'],axis=0,inplace=True)
df.info() # 날라갔는지 확인

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 30.6+ KB


## 자료형을 문자에서 실수형으로 바꿔준다

In [59]:
df['horsepower'] = df['horsepower'].astype('float')
df['horsepower'].dtypes

dtype('float64')

## 컬럼 origin의 데이터타입과 고유값 확인

In [60]:
df['origin'].dtypes
df['origin'].unique()

array([1, 3, 2], dtype=int64)

## 자료형을 한번에 바꿔줄때 dict 타입 사용 가능

In [61]:
df['origin'].replace({1:'USA',2:'EU',3:'JPN'},inplace=True)
df['origin'].unique()

array(['USA', 'JPN', 'EU'], dtype=object)

## 분석의 용이함을 위해 카테고리 타입으로 바꿔주자

In [62]:
df['origin'].astype('category')

0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: category
Categories (3, object): ['EU', 'JPN', 'USA']

## model year 컬럼을 19XX 형태로 만들고 범주형으로 바꾸기

In [63]:
df['model year'] = df['model year'] + 1900

In [64]:
df['model year'].sample(3)

73     1972
254    1978
4      1970
Name: model year, dtype: int64

In [65]:
df['model year'].astype('category')

0      1970
1      1970
2      1970
3      1970
4      1970
       ... 
393    1982
394    1982
395    1982
396    1982
397    1982
Name: model year, Length: 392, dtype: category
Categories (13, int64): [1970, 1971, 1972, 1973, ..., 1979, 1980, 1981, 1982]

## horsepower 범주형 컬럼을 구간 분할을 해보자

In [66]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,1975.979592
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737
min,9.0,3.0,68.0,46.0,1613.0,8.0,1970.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,1973.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,1976.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,1979.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,1982.0


## 범주를 나눌 구간을 알아보자

In [67]:
count,bin_dividers = histogram(df['horsepower'],bins=3) # 3개의 bin으로 구분하다.

## 3개의 범주로 나눈다

In [68]:
# 3개 이름 지정
bin_names = ['저출력','보통출력','고출력']

# pd.cut 함수로 각 데이터를 3개의 bin에 할당
hp = pandas.cut(x = df['horsepower'],  # 데이터배열
                bins = bin_dividers,    # 경계값 리스트
                labels = bin_names,     # bin 이름
                include_lowest = True) # 첫 경계값 포함

# df 4번째 칼럼으로 넣기!
df.insert(4, 'hp_bin', hp)

## 더미변수(0과 1로만 표현)
- 특성이 있고 없는지 판별
- 컴퓨터가 인식할 수 있는 0과1 로만 표현 , 원핫인코딩
- 특히, 범주형 데이터는 더미변수를 통해 머신러닝에서 이해가능한 자료형으로 바꿔줘야함

In [77]:
df1 = pandas.get_dummies(df['hp_bin']) # 더미변수 만들기

df2 = df[['hp_bin']] # 한개의 컬럼짜리 데이터프레임(2차원으로)

# 두개의 데이터 프레임 결합할때, concat !!
pandas.concat( [df[['hp_bin']], pandas.get_dummies(df['hp_bin'])], axis=1)

Unnamed: 0,hp_bin,저출력,보통출력,고출력
0,보통출력,0,1,0
1,보통출력,0,1,0
2,보통출력,0,1,0
3,보통출력,0,1,0
4,보통출력,0,1,0
...,...,...,...,...
393,저출력,1,0,0
394,저출력,1,0,0
395,저출력,1,0,0
396,저출력,1,0,0
