# 머신러닝 필수 라이브러리

1. Numpy : Nummerical Python의 줄임
- 파이썬 산술계싼의 대표적인 라이브러리
- 자료구조, 알고리즘 산술 데이터를 다루는 대부분의 과학 계산에 필수 라이브러리
- ndarray 객체

2. pandas : 구조화된 데이터나 표 형식의 데이터를 빠르고 쉽게 표현적으로 다루도록 설계된 고수준 자료구조
- 데이터 과학에서 데이터를 처리하는 대표적인 라이브러리
- 데이터 핸들링에 표준
- series 객체와 dataFrame 객체가 대표적인 자료구조

3. matplotlib : 시각화 라이브러리
- 그래프나 2차원 데이터를 시각화하는 파이썬 기반의 라이브러리

4. Seaborm : 시본, 다양한 시각화 종류를 제공하는 라이브러리
- matplotlib에 종속된 라이브러리

5. scipy  : 사이파이
- 과학 계산 컴퓨팅 영역의 여러 기본 문제를 다루는 패키지 모음
- scipy.stats : 가장 많이 사용되는 통계도구를 가지고 있는 라이브러리

6. scikit-learn : 머신러닝에 핵심 라이브러리
- 분류 : SVM, 최근접 이웃, 랜덤 포레스트, 로지스틱 회귀 등
- 회귀 : 라쏘, 릿지 회귀 등
- 클러스터링 : K-평균 등
- 차원축소 : PCA, 특징 선택, 행렬 인수분해 등
- 모델 선택 : 격자 탐색, 교차검증, 행렬
- 전처리 : 특징 추출, 정규화 등

7. statsmodels : R 언어용 회귀분석 모델을 구현한 통계분석 패키지
- 회귀모델 : 선형회귀
- 분산분석 : (ANOVA)
- 시계열분석 : AR, ARMA, ARIMA 등
- 통계 모델 결과의 시각화 제공

# 앙상블 학습

In [3]:
import numpy as np
import pandas as pd

# 보팅 분류기
from sklearn.ensemble import VotingClassifier
# 보팅용 학습 알고리즘
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# 테스팅 쪼개기
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [5]:
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [6]:
df = pd.DataFrame(cancer.data, columns = cancer.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
df.shape

(569, 30)

In [8]:
logistic_regression =  LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)

voting_model = VotingClassifier(estimators=[('LogisticRegression', logistic_regression),('KNN', knn)], voting='soft')

In [12]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,test_size=0.2,random_state=156)

In [13]:
voting_model.fit(X_train, y_train)
pred = voting_model.predict(X_test)

In [15]:
print('보팅 분류기 정확도 : {:.3f}'.format(accuracy_score(y_test, pred)))

보팅 분류기 정확도 : 0.947


In [17]:
# 개별 모델의 학습과 예측 그리고 평가
classifier = [logistic_regression, knn]

for model in classifier:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    model_name = model.__class__.__name__
    print('{}정확도 : {:.3f}'.format(model_name, accuracy_score(y_test, pred)))

LogisticRegression정확도 : 0.939
KNeighborsClassifier정확도 : 0.904


# 랜덤포레스트

In [18]:
wine = pd.read_csv('https://raw.githubusercontent.com/rickiepark/hg-mldl/master/wine.csv')

In [19]:
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [21]:
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()

In [22]:
train_input, test_input, train_target, test_target = train_test_split(
data, target, test_size = 0.2, random_state = 42)

In [25]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs = -1, random_state = 42)
scores = cross_validate(rf, train_input, train_target, return_train_score = True)

print(np.mean(scores['train_score']),np.mean(scores['test_score']))

0.9973541965122431 0.8905151032797809


In [26]:
rf.fit(train_input, train_target)

print(rf.feature_importances_)

[0.23167441 0.50039841 0.26792718]


In [27]:
# OOB 샘플  : 부트스트랩 샘플에 포함되지 않고 남는 샘플
rf = RandomForestClassifier(oob_score =True, n_jobs = -1, random_state =42)
rf.fit(train_input, train_target)

print(rf.oob_score_)

0.8934000384837406


# 실전예제 : 중고차 가격 예측

1. 알고리즘 : 랜덤포레스트
2. 데이터셋 : 해외 중고차 거래 데이터셋 이용
3. 데이터 셋의 소개 : 종속변수(selling-price ), 독립변수()
- 중고차의 판매이력을 수집한 데이터 세트
4. 문제유형 : 회귀 
5. 평가지표 : RMSE
6. 사용할 모델 : RandomForestRegressor
7. 사용할 라이브러리 : 

In [28]:
data = pd.read_csv('https://media.githubusercontent.com/media/musthave-ML10/data_source/main/car.csv')

In [29]:
data.head(3)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0


1. feature 탐색
- name : 차종
- year : 년식
- selling-price : 판매가
- km_driven : 주행거리(km)
- fuel : 연료
- seller type : 판매자 유형
- transmission : 변속기
- owner : 소유자 이력
- mileage : 연비(km)
- engine : 배기량
- max_power : 최대출력(제동마력)
- torque : 회전력(타이어를 회전시키는 힘)
- seats : 좌석수(인승)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [31]:
data.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


## 전처리 : 텍스트 데이터

- split() : 문자열 분리 

## engine

In [37]:
data[['engine', 'engine_unit']] = data['engine'].str.split(expand = True)

ValueError: Columns must be same length as key

In [38]:
data['engine'] = data['engine'].astype(float)

In [39]:
data['engine'].head()

0    1248.0
1    1498.0
2    1497.0
3    1396.0
4    1298.0
Name: engine, dtype: float64

In [42]:
data[['mileage', 'mileage_unit']] = data['mileage'].str.split(expand=True)

## mileage/

In [43]:
data['mileage'] = data['mileage'].astype(float)

In [44]:
data['mileage_unit'].unique()

array(['kmpl', 'km/kg', nan], dtype=object)

In [45]:
data['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

- 연료종류가 4종류
- 다른 종류의 연료로 주행거리를 비교하려면 같은 기준을 세워야 한다.
- 연료 가격을 활용하면 어떨까? 1달러달 몇 km를 주행할 수 있는지 알아보자
- 2022년 시점의 가격
- 'Diesel', 'Petrol', 'LPG', 'CNG'

In [46]:
def mile(x):
    if x['fuel'] =='Petrol':
        return x['mileage'] / 1.048
    elif['fuel'] =='Diesel':
        return x['mileage'] / 1.405
    elif['fuel'] =='LPG':
        return x['mileage'] / 3.54
    else:
         return x['mileage'] / 2.76

In [47]:
data['mileage'] = data.apply(mile, axis = 1)

In [48]:
data.drop('mileage_unit'), axis=1, inplace=True

SyntaxError: cannot assign to function call (Temp/ipykernel_4048/2096606693.py, line 1)

### torque
- 앞부분의 숫자만 추출해서 숫자형
- 단위 스케일(Nm)

In [49]:
data['torque'].unique()

array(['190Nm@ 2000rpm', '250Nm@ 1500-2500rpm', '12.7@ 2,700(kgm@ rpm)',
       '22.4 kgm at 1750-2750rpm', '11.5@ 4,500(kgm@ rpm)',
       '113.75nm@ 4000rpm', '7.8@ 4,500(kgm@ rpm)', '59Nm@ 2500rpm',
       '170Nm@ 1800-2400rpm', '160Nm@ 2000rpm', '248Nm@ 2250rpm',
       '78Nm@ 4500rpm', nan, '84Nm@ 3500rpm', '115Nm@ 3500-3600rpm',
       '200Nm@ 1750rpm', '62Nm@ 3000rpm', '219.7Nm@ 1500-2750rpm',
       '114Nm@ 3500rpm', '115Nm@ 4000rpm', '69Nm@ 3500rpm',
       '172.5Nm@ 1750rpm', '6.1kgm@ 3000rpm', '114.7Nm@ 4000rpm',
       '60Nm@ 3500rpm', '90Nm@ 3500rpm', '151Nm@ 4850rpm',
       '104Nm@ 4000rpm', '320Nm@ 1700-2700rpm', '250Nm@ 1750-2500rpm',
       '145Nm@ 4600rpm', '146Nm@ 4800rpm', '343Nm@ 1400-3400rpm',
       '200Nm@ 1400-3400rpm', '200Nm@ 1250-4000rpm',
       '400Nm@ 2000-2500rpm', '138Nm@ 4400rpm', '360Nm@ 1200-3400rpm',
       '200Nm@ 1200-3600rpm', '380Nm@ 1750-2500rpm', '173Nm@ 4000rpm',
       '400Nm@ 1750-3000rpm', '400Nm@ 1400-2800rpm',
       '200Nm@ 1750-3000rp

In [50]:
data['torque'] = data['torque'].str.upper()

In [51]:
data['torque'].head()

0              190NM@ 2000RPM
1         250NM@ 1500-2500RPM
2       12.7@ 2,700(KGM@ RPM)
3    22.4 KGM AT 1750-2750RPM
4       11.5@ 4,500(KGM@ RPM)
Name: torque, dtype: object

In [52]:
data[['torque','torque_unit' ]] = data['mileage'].str.aplit(expand=True)

AttributeError: Can only use .str accessor with string values!

In [53]:
def split_num(x):
    x = str(x)
    for i, j in enumerate(x):
        if j not in '0123456789.':
            cut = i
            break
    return x[:cut]

In [56]:
data['torque'] = data['torque'].apply(split_num)

In [57]:
data['torque'].head()

0     190
1     250
2    12.7
3    22.4
4    11.5
Name: torque, dtype: object

In [58]:
def torque_unit(x):
    if 'NM' in str(x):
        return 'NM'
    elif 'KGM' in str(x):
        return 'KGM'

In [59]:
data['torque_unit'] = data['torque'].apply(torque_unit)

In [60]:
data['torque_unit'].unique()

array([None], dtype=object)

In [62]:
data[data['torque_unit'].isna()]['torque'].unique()

array(['190', '250', '12.7', '22.4', '11.5', '113.75', '7.8', '59', '170',
       '160', '248', '78', '', '84', '115', '200', '62', '219.7', '114',
       '69', '172.5', '6.1', '114.7', '60', '90', '151', '104', '320',
       '145', '146', '343', '400', '138', '360', '380', '173', '111.7',
       '219.6', '112', '130', '205', '280', '99.04', '77', '110', '153',
       '113.7', '113', '101', '290', '120', '96', '135', '259.8', '259.9',
       '91', '96.1', '109', '202', '430', '347', '382', '620', '500',
       '550', '490', '177.5', '300', '260', '213', '224', '640', '95',
       '13.1', '71', '14.9', '117', '72', '11.4', '140', '134', '150',
       '340', '240', '330', '12.5', '111.8', '11.8', '135.4', '190.25',
       '20.4', '247', '223', '180', '195', '154.9', '114.73', '108',
       '190.24', '420', '100', '51', '132', '350', '218', '24', '13.5',
       '85', '74.5', '180.4', '230', '219.66', '245', '204', '14.3',
       '125', '172', '102', '8.5', '106.5', '108.5', '144.15', '99'