#### 머신러닝

### 라이브러리 선언 및 데이터 불러오기

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.tree import plot_tree
import pickle

In [23]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Raw-GitHub URL of the customer CSV that is loaded with pandas below.
dataUrl = "https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/customer.csv"

In [4]:
# Load the customer dataset from dataUrl and preview the first two rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV's
# saved row index is being read as data — consider index_col=0 if it is not needed.
featuresData = pd.read_csv(dataUrl)
featuresData.head(2)

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond


### 1. 데이터 통합 및 숫자컬럼 추가

In [None]:
# 데이터타입 통합 SKIP

In [7]:
# Convert every column name to upper case so later cells can refer to
# BALANCE / STOCK / LABEL consistently.
featuresData.columns = [columnName.upper() for columnName in featuresData.columns]

In [10]:
# BALANCE STOCK SCALING

In [12]:
# One independent scaler per raw column, so each keeps its own fitted min/max.
balanceScale, stockScale = MinMaxScaler(), MinMaxScaler()

In [16]:
# Fit each scaler on its own single-column frame, then store the scaled values
# alongside the raw columns.
scaledBalance = balanceScale.fit_transform(featuresData[["BALANCE"]])
scaledStock = stockScale.fit_transform(featuresData[["STOCK"]])
featuresData["SCALE_BALANCE"] = scaledBalance
featuresData["SCALE_STOCK"] = scaledStock

In [21]:
### 추가 포인트: 정답지가 카테고리라서 숫자로 변경해야 한다

In [26]:
# Encoder used below to turn the categorical LABEL column into integer codes.
labelEncoder = LabelEncoder()

In [35]:
# Hand-written label -> rank mapping (lowest tier first).
labelDict = dict(normal=0, diamond=1, vip=2)

In [37]:
# Encode LABEL with the hand-written mapping; labels missing from labelDict become NaN.
featuresData["LE2_LABEL"] = featuresData["LABEL"].map(labelDict)

In [28]:
# Alternative encoding via scikit-learn's LabelEncoder.
# NOTE(review): LabelEncoder numbers classes by their sorted order, so these codes
# need not match the hand-written ranking stored in LE2_LABEL — confirm before use.
featuresData["LE_LABEL"] = labelEncoder.fit_transform( featuresData.LABEL     )

In [42]:
# Derive the label ranking from the data instead of hard-coding it: multiply the
# two scaled features so that a single number reflects how large the combination is.
featuresData["NEW_FEATURES"] = featuresData["SCALE_BALANCE"] * featuresData["SCALE_STOCK"]

# Mean of the combined feature for each label.
groupKey = ["LABEL"]
groupData = (
    featuresData.groupby(groupKey)["NEW_FEATURES"]
    .mean()
    .reset_index(name="NEW_FEATURE_MEAN")
)

# Sort ascending and renumber the rows, so each row's position is that label's rank.
sortKey = ["NEW_FEATURE_MEAN"]
sortedValue = groupData.sort_values(sortKey, ignore_index=True)
sortedValue

In [70]:
# Display the row-position -> label mapping of the sorted table.
sortedValue["LABEL"].to_dict()

{0: 'normal', 1: 'diamond', 2: 'vip'}

In [75]:
# Invert the sorted table into label -> rank, using each row's index as the rank.
labelDict = {labelName: rank for rank, labelName in sortedValue["LABEL"].items()}

In [88]:
# Same label -> rank mapping, built from the plain list of sorted labels:
# a label's position in the list is its rank.
labelList = sortedValue["LABEL"].tolist()
labelDict = {labelName: rank for rank, labelName in enumerate(labelList)}
labelDict

In [92]:
# Display the data-derived label -> rank mapping.
labelDict

{'normal': 0, 'diamond': 1, 'vip': 2}

In [100]:
# Remove the two exploratory encodings and the helper product column.
featuresData = featuresData.drop(columns=["LE_LABEL","LE2_LABEL","NEW_FEATURES"] )

In [102]:
# Integer-encoded target: each label replaced by its value in labelDict.
featuresData["LE_LABEL"] = featuresData["LABEL"].map(labelDict)

### 2. 특성 선정 및 데이터 분리

In [30]:
# Target column(s) selected when separating labels from features after the split.
label = ["LABEL"]

In [104]:
# Minimum absolute correlation with LE_LABEL for a column to be a candidate feature.
corrStd = 0.5

In [105]:
# Pairwise correlation matrix, restricted to the numeric columns.
corrDf = featuresData.corr(numeric_only=True)

In [None]:
# The target stays LABEL. This cell used to set label = ["QTY"], but the dataset has
# no QTY column, so running the notebook top to bottom would fail with a KeyError
# when the label column is selected from the train/test split.
label = ["LABEL"]

In [110]:
# Automatically pick the columns whose absolute correlation with the encoded label
# exceeds corrStd. A correlation of exactly 1 is excluded, which removes LE_LABEL's
# correlation with itself.
labelCorr = corrDf["LE_LABEL"]
isStrongEnough = labelCorr.abs() > corrStd
isNotPerfect = labelCorr != 1
featuresTarget = corrDf.index[isStrongEnough & isNotPerfect].tolist()

In [113]:
# Candidate columns are kept as features only if their name contains this marker.
preFix = "SCALE"

In [125]:
# Holds the selected feature column names; populated by the filtering cell below.
features = []

In [126]:
# Keep only the correlated candidates whose name contains preFix.
# The list is rebuilt from scratch rather than appended to, so re-running this cell
# cannot add the same feature names a second time.
features = [columnName for columnName in featuresTarget if preFix in columnName]

In [129]:
# Seed passed as random_state (split and model) and the fraction of rows held out for testing.
randomValue, testSizeRatio = 30, 0.2

In [130]:
# Split the full frame into train and test partitions; the fixed seed makes the
# split repeatable.
trainData, testData = train_test_split(
    featuresData,
    test_size=testSizeRatio,
    random_state=randomValue,
)

In [131]:
# Separate model inputs (features) from the target (label) in each partition.
trainDataFeatures, trainDataLabel = trainData[features], trainData[label]
testDataFeatures, testDataLabel = testData[features], testData[label]

In [132]:
# Print the (rows, columns) of each piece: train features, train labels,
# test features, test labels.
for splitFrame in (trainDataFeatures, trainDataLabel, testDataFeatures, testDataLabel):
    print(splitFrame.shape)

(16000, 2)
(16000, 1)
(4000, 2)
(4000, 1)


### 3. 모델 정의 및 훈련

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [134]:
# Decision-tree classifier; random_state is fixed to the same seed used for the split.
dtModel = tree.DecisionTreeClassifier(random_state=randomValue)

In [135]:
# Train the tree on the training features against the training labels.
fittedModel = dtModel.fit(X=trainDataFeatures, y=trainDataLabel)

### 4. 예측

In [136]:
# Display the fitted estimator.
fittedModel

In [138]:
# Raw (unscaled) balance and stock values of the customer to classify.
inBalance, inStock = 280_000_000, 48_000_000

In [140]:
# Scale the raw inputs with the already-fitted scalers, so they go through exactly
# the same transformation as the training columns instead of a hand-written copy of
# the min-max formula. Each input is wrapped in a one-row frame whose column name
# matches the column the scaler was fitted on; transform returns a 1x1 array and
# [0, 0] extracts the scalar (the manual formula produced 1-element arrays).
inBalancePr = balanceScale.transform(pd.DataFrame({"BALANCE": [inBalance]}))[0, 0]
inStockPr = stockScale.transform(pd.DataFrame({"STOCK": [inStock]}))[0, 0]

In [142]:
# One-row frame for prediction. The columns are named after the training features so
# the input carries the same feature names the model was fitted with.
# NOTE(review): the values are given as (balance, stock); this assumes `features`
# lists the scaled balance column first and the scaled stock column second.
testDf = pd.DataFrame([[inBalancePr, inStockPr]], columns=features)

In [143]:
# Predict the label for the single prepared row and display the result.
predictValue = fittedModel.predict (testDf)
predictValue



array(['diamond'], dtype=object)

In [144]:
# Serialize the fitted tree to disk with pickle.
# NOTE(review): only the model is saved; balanceScale / stockScale are not, yet new
# inputs are scaled with them before predict — consider persisting them as well.
with open("./mlcorecs.dump", "wb") as fw:
    pickle.dump(fittedModel, fw)

In [None]:
# Display the fitted estimator.
fittedModel