# 보팅

In [24]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
Y = pd.DataFrame(iris_data.target, columns = ['class'])

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [26]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

c1f1 = LogisticRegression(multi_class = 'multinomial', random_state=1)
c1f2 = svm.SVC(kernel = 'linear', random_state=1)
c1f3 = GaussianNB()
clf_voting = VotingClassifier(
    estimators = [
        ('lr', c1f1),
        ('svm', c1f2),
        ('gnb', c1f3)
    ],
    voting = 'hard',
    weights = [1,1,1]
)
clf_voting.fit(X_train_std, Y_train['class'].ravel())

VotingClassifier(estimators=[('lr',
                              LogisticRegression(multi_class='multinomial',
                                                 random_state=1)),
                             ('svm', SVC(kernel='linear', random_state=1)),
                             ('gnb', GaussianNB())],
                 weights=[1, 1, 1])

In [34]:
#test(X_test)
pred_voting = clf_voting.predict(X_test_std) # 모델을 활용해서 문제를 풀었다.

#score(채점)
train_accyracy  = clf_voting.score(X_train_std, Y_train['class'].ravel()) # train score
test_accyracy = clf_voting.score(X_test_std, Y_test['class'].ravel()) # train score

print(train_accyracy)
print(test_accyracy)

0.9714285714285714
0.9777777777777777


In [38]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test, pred_voting)
print(conf_matrix)

[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]


# 배깅 

In [47]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

wine_data = load_wine()
X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
Y = pd.DataFrame(wine_data.target, columns = ['class'])

In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [49]:
scaler  = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# 일반 모델과 배깅 모델 비교 

## 일반모델

In [51]:
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(X_train_std, Y_train['class'].ravel())

none_bagging_score = gaussian.score(X_test_std, Y_test['class'].ravel())
print(f'배깅을 활용하지 않았을 때 정확도는 {round(none_bagging_score,2)} 입니다.')

배깅을 활용하지 않았을 때 정확도는 0.94 입니다.


## 배깅모델

In [71]:
# 11시 8분까지
from sklearn.ensemble import BaggingClassifier
clf_bagging = BaggingClassifier(base_estimator = GaussianNB(),
                                n_estimators = 10,
                                random_state =0)
clf_bagging.fit(X_train_std, Y_train['class'].ravel())

bagging_train_score = clf_bagging.score(X_train_std, Y_train['class'].ravel())
bagging_test_score = clf_bagging.score(X_test_std, Y_test['class'].ravel())

print(f'배깅을 활용했을 때 훈련 정확도는 {round(bagging_train_score,2)} 입니다.')
print(f'배깅을 활용했을 때 테스트 정확도는 {round(bagging_test_score,2)} 입니다.')

배깅을 활용했을 때 훈련 정확도는 0.98 입니다.
배깅을 활용했을 때 테스트 정확도는 0.98 입니다.


# 랜덤포레스트

In [72]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

wine_data = load_wine()
X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
Y = pd.DataFrame(wine_data.target, columns = ['class'])

In [75]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [80]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(max_depth = 2,
                                n_estimators = 100)
clf_rf.fit(X_train_std, Y_train['class'].ravel())

RandomForestClassifier(max_depth=2)

In [83]:
pred_rf = clf_rf.predict(X_test_std)

rf_train_score = clf_rf.score(X_train_std, Y_train['class'].ravel())
rf_test_score = clf_rf.score(X_test_std, Y_test['class'].ravel())
print(f'랜덤포레스트 훈련 정확도는 {round(rf_train_score,2)} 입니다.')
print(f'랜덤포레스트 테스트 정확도는 {round(rf_test_score,2)} 입니다.')

랜덤포레스트 훈련 정확도는 0.98 입니다.
랜덤포레스트 테스트 정확도는 0.96 입니다.


In [84]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test['class'].ravel(), pred_rf)
print(conf_matrix)

[[19  0  0]
 [ 1 20  1]
 [ 0  0 13]]


In [85]:
from sklearn.metrics import classification_report
class_report = classification_report(Y_test['class'].ravel(), pred_rf)
print(class_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.91      0.95        22
           2       0.93      1.00      0.96        13

    accuracy                           0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.97      0.96      0.96        54



# 그레디언트 부스팅

In [86]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

wine_data = load_wine()
X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
Y = pd.DataFrame(wine_data.target, columns = ['class'])

In [87]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [92]:
from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(max_depth = 3, learning_rate = 0.01)
gbrt.fit(X_train_std, Y_train['class'].ravel())

gbrt_train_score = gbrt.score(X_train_std, Y_train['class'].ravel())
gbrt_test_score = gbrt.score(X_test_std, Y_test['class'].ravel())

print(f'그레디언트부스팅 훈련 정확도는 {round(gbrt_train_score,2)} 입니다.')
print(f'그레디언트부스팅 테스트 정확도는 {round(gbrt_test_score,2)} 입니다.')

그레디언트부스팅 훈련 정확도는 1.0 입니다.
그레디언트부스팅 테스트 정확도는 0.94 입니다.


# 실습

In [93]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
Y = pd.DataFrame(iris_data.target, columns = ['class'])

## 랜덤포레스트

In [107]:
sclaer = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(max_depth = 2,
                                n_estimators = 100)
clf_rf.fit(X_train_std, Y_train['class'].ravel())

pred_rf = clf_rf.predict(X_test_std)
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test['class'].ravel(), pred_rf)
print(conf_matrix)

rf_train_score = clf_rf.score(X_train_std, Y_train['class'].ravel())
rf_test_score = clf_rf.score(X_test_std, Y_test['class'].ravel())
print(f'랜덤포레스트 훈련 정확도는 {round(rf_train_score,2)} 입니다.')
print(f'랜덤포레스트 테스트 정확도는 {round(rf_test_score,2)} 입니다.')

[[19  0  0]
 [ 1 20  1]
 [ 0  0 13]]
랜덤포레스트 훈련 정확도는 0.98 입니다.
랜덤포레스트 테스트 정확도는 0.96 입니다.


## 그레디언트 부스팅

In [106]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
Y = pd.DataFrame(iris_data.target, columns = ['class'])

sclaer = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
                        

from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(max_depth = 3, learning_rate = 0.01)
gbrt.fit(X_train_std, Y_train['class'].ravel())

pred_gbrt = gbrt.predict(X_test_std)
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test['class'].ravel(), pred_gbrt)
print(conf_matrix)

gbrt_train_score = gbrt.score(X_train_std, Y_train['class'].ravel())
gbrt_test_score = gbrt.score(X_test_std, Y_test['class'].ravel())

print(f'그레디언트부스팅 훈련 정확도는 {round(gbrt_train_score,2)} 입니다.')
print(f'그레디언트부스팅 테스트 정확도는 {round(gbrt_test_score,2)} 입니다.')

[[18  1  0]
 [ 1 20  1]
 [ 0  0 13]]
그레디언트부스팅 훈련 정확도는 1.0 입니다.
그레디언트부스팅 테스트 정확도는 0.94 입니다.


# 스포츠센터 데이터 분석

In [176]:
import pandas as pd
customer_master = pd.read_csv('data/스포츠센터데이터/customer_master.csv') #유저정보
uselog = pd.read_csv('data/스포츠센터데이터/use_log.csv') #이용 기록
class_master = pd.read_csv('data/스포츠센터데이터/class_master.csv') #클래스
campaign_master = pd.read_csv('data/스포츠센터데이터/campaign_master.csv') #캠페인 이름

In [177]:
customer_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4192 entries, 0 to 4191
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  4192 non-null   object
 1   name         4192 non-null   object
 2   class        4192 non-null   object
 3   gender       4192 non-null   object
 4   start_date   4192 non-null   object
 5   end_date     1350 non-null   object
 6   campaign_id  4192 non-null   object
 7   is_deleted   4192 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 262.1+ KB


In [178]:
uselog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197428 entries, 0 to 197427
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   log_id       197428 non-null  object
 1   customer_id  197428 non-null  object
 2   usedate      197428 non-null  object
dtypes: object(3)
memory usage: 4.5+ MB


In [179]:
class_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   class       3 non-null      object
 1   class_name  3 non-null      object
 2   price       3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [180]:
campaign_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   campaign_id    3 non-null      object
 1   campaign_name  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [181]:
# info
# 결합
# 결측지 확인



# describe
# 성별을 숫자로 바꾼다.
# 필요없는 데이터 삭제
# 형변환
# 사용할 데이터 확인

In [182]:
join_campaign = customer_master.merge(campaign_master, on = 'campaign_id')
join_class = join_campaign.merge(class_master, on = 'class')
customer_join = join_class.drop(['class','campaign_id','name'], axis = 1)

In [183]:
customer_join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4192 entries, 0 to 4191
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    4192 non-null   object
 1   gender         4192 non-null   object
 2   start_date     4192 non-null   object
 3   end_date       1350 non-null   object
 4   is_deleted     4192 non-null   int64 
 5   campaign_name  4192 non-null   object
 6   class_name     4192 non-null   object
 7   price          4192 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 294.8+ KB


In [184]:
customer_join.head(1)

Unnamed: 0,customer_id,gender,start_date,end_date,is_deleted,campaign_name,class_name,price
0,OA832399,F,2015-05-01 00:00:00,,0,2_일반,0_종일,10500


In [185]:
uselog.head(1)

Unnamed: 0,log_id,customer_id,usedate
0,L00000049012330,AS009373,2018-04-01


In [186]:
customer_join.isnull().sum()

customer_id         0
gender              0
start_date          0
end_date         2842
is_deleted          0
campaign_name       0
class_name          0
price               0
dtype: int64

In [187]:
#결축치 채우고 형변환
customer_join['calc_date'] = customer_join['end_date']
customer_join['calc_date'] = customer_join['calc_date'].fillna(pd.to_datetime("20190430")) # 3월까지 데이터라서 none값을 4월로 채운다.
customer_join['start_date'] = pd.to_datetime(customer_join['start_date'])
customer_join['calc_date'] = pd.to_datetime(customer_join['calc_date'])

In [188]:
customer_join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4192 entries, 0 to 4191
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   customer_id    4192 non-null   object        
 1   gender         4192 non-null   object        
 2   start_date     4192 non-null   datetime64[ns]
 3   end_date       1350 non-null   object        
 4   is_deleted     4192 non-null   int64         
 5   campaign_name  4192 non-null   object        
 6   class_name     4192 non-null   object        
 7   price          4192 non-null   int64         
 8   calc_date      4192 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(2), object(5)
memory usage: 327.5+ KB


In [189]:
# groupby
# customer_join['membership_period'] = 0
customer_join['membership_period'] = 0
from dateutil.relativedelta import relativedelta
for row_num in range(len(customer_join)):
    delta = relativedelta(customer_join['calc_date'].iloc[row_num], customer_join['start_date'].iloc[row_num])
    month_diff = delta.years *12 + delta.months
    customer_join['membership_period'].iloc[row_num] = month_diff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_join['membership_period'].iloc[row_num] = month_diff


In [190]:
uselog

Unnamed: 0,log_id,customer_id,usedate
0,L00000049012330,AS009373,2018-04-01
1,L00000049012331,AS015315,2018-04-01
2,L00000049012332,AS040841,2018-04-01
3,L00000049012333,AS046594,2018-04-01
4,L00000049012334,AS073285,2018-04-01
...,...,...,...
197423,L00000049209753,TS977703,2019-03-31
197424,L00000049209754,TS979550,2019-03-31
197425,L00000049209755,TS995299,2019-03-31
197426,L00000049209756,TS995853,2019-03-31


In [192]:
uselog['usedate'] = pd.to_datetime(uselog['usedate'])
uselog['연월'] = uselog['usedate'].dt.strftime('%Y%m')
uselog_months = uselog.groupby(['customer_id', '연월'], as_index = False).agg(count = ('log_id', 'count'))
uselog_months.head()

Unnamed: 0,customer_id,연월,count
0,AS002855,201804,4
1,AS002855,201805,5
2,AS002855,201806,5
3,AS002855,201807,5
4,AS002855,201808,3


In [198]:
uselog_customer = uselog_months.groupby('customer_id', as_index = False).agg(['mean','median','max','min'])['count']
uselog_customer = uselog_customer.reset_index()

In [204]:
uselog['weekday'] = uselog['usedate'].dt.weekday
uselog_weekday = uselog.groupby(['customer_id', '연월','weekday'], as_index = False).agg(count = ('log_id', 'count'))
uselog_weekday.head()

Unnamed: 0,customer_id,연월,weekday,count
0,AS002855,201804,5,4
1,AS002855,201805,2,1
2,AS002855,201805,5,4
3,AS002855,201806,5,5
4,AS002855,201807,1,1


In [205]:
uselog_weekday = uselog_weekday.groupby('customer_id', as_index = False).agg(count = ('count', 'max'))

In [206]:
uselog_weekday['routine_flg'] = 0
uselog_weekday['routine_flg'] = uselog_weekday['routine_flg'].where(uselog_weekday['count']<4, 1) # 조건에 해당되지 않는 경우에 1로 수정

In [208]:
uselog_weekday.head()

Unnamed: 0,customer_id,count,routine_flg
0,AS002855,5,1
1,AS008805,4,1
2,AS009013,2,0
3,AS009373,5,1
4,AS015233,5,1


In [211]:
uselog_customer.head()

Unnamed: 0,customer_id,mean,median,max,min
0,AS002855,4.5,5.0,7,2
1,AS008805,4.0,4.0,8,1
2,AS009013,2.0,2.0,2,2
3,AS009373,5.083333,5.0,7,3
4,AS015233,7.545455,7.0,11,4


In [215]:
merge_uselog_customer = customer_join.merge(uselog_customer, on = 'customer_id')
merge_uselog_weekday = merge_uselog_customer.merge(uselog_weekday.drop(['count'], axis = 1), on = 'customer_id')
merge_uselog_weekday.head()

Unnamed: 0,customer_id,gender,start_date,end_date,is_deleted,campaign_name,class_name,price,calc_date,membership_period,mean,median,max,min,routine_flg
0,OA832399,F,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.833333,5.0,8,2,1
1,PL270116,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,5.083333,5.0,7,3,1
2,OA974876,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.583333,5.0,6,3,1
3,HD024127,F,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.833333,4.5,7,2,1
4,IK271057,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,3.75,3.5,5,2,1
