# 보팅

In [1]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
Y = pd.DataFrame(iris_data.target, columns = ['class'])

In [2]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)

In [3]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

c1f1 = LogisticRegression(multi_class = 'multinomial', random_state=1)
c1f2 = svm.SVC(kernel = 'linear', random_state=1)
c1f3 = GaussianNB()
clf_voting = VotingClassifier(
    estimators = [
        ('lr', c1f1),
        ('svm', c1f2),
        ('gnb', c1f3)
    ],
    voting = 'hard',
    weights = [1,1,1]
)
clf_voting.fit(X_train_std, Y_train['class'].ravel())

VotingClassifier(estimators=[('lr',
                              LogisticRegression(multi_class='multinomial',
                                                 random_state=1)),
                             ('svm', SVC(kernel='linear', random_state=1)),
                             ('gnb', GaussianNB())],
                 weights=[1, 1, 1])

In [23]:
# test(테스트 실시)
pred_voting = clf_voting.predict(X_test_std)

# score(채점)
train_accyracy = clf_voting.score(X_train_std, Y_train['class'].ravel()) 
test_accyracy = clf_voting.score(X_test_std, Y_test['class'].ravel()) 

0.9777777777777777

In [30]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test['class'].ravel(), pred_voting)
print(conf_matrix)

[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]


# Bagging

In [31]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

wine_data = load_wine()
X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
Y = pd.DataFrame(wine_data.target, columns = ['class'])

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)

In [33]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## 일반 모델과 배깅 모델 비교

### 일반모델

In [39]:
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(X_train_std, Y_train['class'].ravel())

none_bagging_score = gaussian.score(X_test_std, Y_test['class'].ravel())
print(f'배깅을 활용하지 않았을 때 정확도는 {round(none_bagging_score,2)} 입니다.')

배깅을 활용하지 않았을 때 정확도는 0.94 입니다.


### 배깅 모델

In [44]:
# 11시 8분까지
from sklearn.ensemble import BaggingClassifier
clf_bagging = BaggingClassifier(base_estimator = GaussianNB(),
                                n_estimators = 10,
                                random_state =0)
clf_bagging.fit(X_train_std, Y_train['class'].ravel())

bagging_train_score = clf_bagging.score(X_train_std, Y_train['class'].ravel())
bagging_test_score = clf_bagging.score(X_test_std, Y_test['class'].ravel())
print(f'배깅을 활용 했을 때 훈련 정확도는 {round(bagging_train_score,2)} 입니다.')
print(f'배깅을 활용 했을 때 테스트 정확도는 {round(bagging_test_score,2)} 입니다.')

배깅을 활용하지 했을 때 훈련 정확도는 0.98 입니다.
배깅을 활용하지 했을 때 테스트 정확도는 0.98 입니다.


In [45]:
pred_bagging = clf_bagging.predict(X_test_std)

In [46]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test['class'].ravel(), pred_bagging)
print(conf_matrix)

[[19  0  0]
 [ 1 21  0]
 [ 0  0 13]]


In [47]:
from sklearn.metrics import classification_report
class_report = classification_report(Y_test['class'].ravel(), pred_bagging)
print(class_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.95      0.98        22
           2       1.00      1.00      1.00        13

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



# 랜덤포레스트

In [76]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

wine_data = load_wine()
X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
Y = pd.DataFrame(wine_data.target, columns = ['class'])

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)

In [77]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [78]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(max_depth = 6,
                                n_estimators=150)
clf_rf.fit(X_train_std, Y_train['class'].ravel())

RandomForestClassifier(max_depth=6, n_estimators=150)

In [79]:
pred_rf = clf_rf.predict(X_test_std)

rf_train_score = clf_rf.score(X_train_std, Y_train['class'].ravel())
rf_test_score = clf_rf.score(X_test_std, Y_test['class'].ravel())
print(f'랜덤포레스트 훈련 정확도는 {round(rf_train_score,2)} 입니다.')
print(f'랜덤포레스트 테스트 정확도는 {round(rf_test_score,2)} 입니다.')

랜덤포레스트 훈련 정확도는 1.0 입니다.
랜덤포레스트 테스트 정확도는 0.98 입니다.


In [59]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test['class'].ravel(), pred_rf)
print(conf_matrix)

[[19  0  0]
 [ 1 20  1]
 [ 0  0 13]]


In [60]:
from sklearn.metrics import classification_report
class_report = classification_report(Y_test['class'].ravel(), pred_rf)
print(class_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.91      0.95        22
           2       0.93      1.00      0.96        13

    accuracy                           0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.97      0.96      0.96        54



# 그레디언트 부스팅

In [80]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

wine_data = load_wine()
X = pd.DataFrame(wine_data.data, columns = wine_data.feature_names)
Y = pd.DataFrame(wine_data.target, columns = ['class'])

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)

In [81]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [93]:
# 11
from sklearn.ensemble import GradientBoostingClassifier

# default  max_depth = 3, learning_rate = 0.1
gbrt = GradientBoostingClassifier(max_depth = 2, learning_rate = 0.05)
gbrt.fit(X_train_std, Y_train['class'].ravel())

gbrt_train_score = gbrt.score(X_train_std, Y_train['class'].ravel())
gbrt_test_score = gbrt.score(X_test_std, Y_test['class'].ravel())
print(f'그레디언트부스팅 훈련 정확도는 {round(gbrt_train_score,2)} 입니다.')
print(f'그레디언트부스팅 테스트 정확도는 {round(gbrt_test_score,2)} 입니다.')

그레디언트부스팅 훈련 정확도는 1.0 입니다.
그레디언트부스팅 테스트 정확도는 0.98 입니다.


# 실습

In [94]:
from sklearn.datasets import load_iris
import pandas as pd

iris_data = load_iris()
X = pd.DataFrame(iris_data.data, columns = iris_data.feature_names)
Y = pd.DataFrame(iris_data.target, columns = ['class'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## 랜덤포레스트

In [95]:
clf_rf = RandomForestClassifier(max_depth = 4, n_estimators = 100)
clf_rf.fit(X_train_std, Y_train['class'].ravel())
pred_rf = clf_rf.predict(X_test_std)

rf_train_score = clf_rf.score(X_train_std, Y_train['class'].ravel())
rf_test_score = clf_rf.score(X_test_std, Y_test['class'].ravel())
print(f'랜덤포레스트 훈련 정확도는 {round(rf_train_score,2)} 입니다.')
print(f'랜덤포레스트 테스트 정확도는 {round(rf_test_score,2)} 입니다.')

랜덤포레스트 훈련 정확도는 1.0 입니다.
랜덤포레스트 테스트 정확도는 0.98 입니다.


In [96]:
pred_rf = clf_rf.predict(X_test_std)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

rf_conf = confusion_matrix(Y_test['class'].ravel(), pred_rf)
print(conf_matrix)

rf_report = classification_report(Y_test['class'].ravel(), pred_rf)
print(rf_report)

[[19  0  0]
 [ 1 20  1]
 [ 0  0 13]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



## 그레디언트 부스팅

In [102]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth=4, learning_rate=0.1)
gb.fit(X_train_std,Y_train['class'].ravel())


gbrt_train_score = gb.score(X_train_std, Y_train['class'].ravel())
gbrt_test_score = gb.score(X_test_std, Y_test['class'].ravel())
print(f'그레디언트부스팅 훈련 정확도는 {round(gbrt_train_score,2)} 입니다.')
print(f'그레디언트부스팅 테스트 정확도는 {round(gbrt_test_score,2)} 입니다.')

그레디언트부스팅 훈련 정확도는 1.0 입니다.
그레디언트부스팅 테스트 정확도는 0.98 입니다.


In [105]:
pred_gb = gb.predict(X_test_std)

gb_conf = confusion_matrix(Y_test['class'].ravel(), pred_gb)
print(gb_conf)

gb_report = classification_report(Y_test['class'].ravel(), pred_gb)
print(gb_report)

[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [111]:
print(Y_test['class'].ravel())
print(pred_gb)
print(pred_rf)

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 1 1 1 2 0 2 0 0]
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2 1 1 2 0 2 0 0]
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2 1 1 2 0 2 0 0]


# 스포츠센터 데이터분석

In [2]:
import pandas as pd
customer_master = pd.read_csv('data/스포츠센터데이터/customer_master.csv') # 유저 정보
uselog = pd.read_csv('data/스포츠센터데이터/use_log.csv') # 이용 기록
class_master = pd.read_csv('data/스포츠센터데이터/class_master.csv') # 클래스
campaign_master = pd.read_csv('data/스포츠센터데이터/campaign_master.csv') # 캠페인 이름

In [3]:
join_campaign = customer_master.merge(campaign_master, on = 'campaign_id') # 데이터 결합
join_class = join_campaign.merge(class_master, on = 'class') # 데이터 결합
customer_join = join_class.drop(['class', 'campaign_id', 'name'], axis = 1) # 필요없는 칼럼 삭제

In [4]:
customer_join.head(1)

Unnamed: 0,customer_id,gender,start_date,end_date,is_deleted,campaign_name,class_name,price
0,OA832399,F,2015-05-01 00:00:00,,0,2_일반,0_종일,10500


In [5]:
uselog.head(1)

Unnamed: 0,log_id,customer_id,usedate
0,L00000049012330,AS009373,2018-04-01


In [6]:
customer_join.isnull().sum() # 결측치확인

customer_id         0
gender              0
start_date          0
end_date         2842
is_deleted          0
campaign_name       0
class_name          0
price               0
dtype: int64

In [7]:
# 결측치 채우고 형변환
customer_join['calc_date'] = customer_join['end_date']
customer_join['calc_date'] = customer_join['calc_date'].fillna(pd.to_datetime("20190430")) # 3월 까지 데이터라서 None을 4월 30일로 채운다
customer_join['start_date'] = pd.to_datetime(customer_join['start_date'])
customer_join['calc_date'] = pd.to_datetime(customer_join['calc_date'])

In [8]:
# customer_join['membership_period']  = ((customer_join['calc_date'] - customer_join['start_date'])/30).dt.days
customer_join['membership_period'] = None
from dateutil.relativedelta import relativedelta
for row_num in range(len(customer_join)):
  delta = relativedelta(customer_join['calc_date'].iloc[row_num], customer_join['start_date'].iloc[row_num])
  month_diff = delta.years * 12 + delta.months
  customer_join['membership_period'].iloc[row_num] = month_diff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_join['membership_period'].iloc[row_num] = month_diff


In [9]:
# membership_period_list = []
# from dateutil.relativedelta import relativedelta
# for row_num in range(len(customer_join)):
#   delta = relativedelta(customer_join['calc_date'].iloc[row_num], customer_join['start_date'].iloc[row_num])
#   month_diff = delta.years * 12 + delta.months
#   membership_period_list.append(month_diff)
# customer_join['membership_period'] = membership_period_list

In [10]:
# customer_join.apply(lambda x: relativedelta(x['calc_date'], x['start_date']))

In [11]:
# delta = customer_join.apply(lambda x: relativedelta(x.calc_date, x.start_date), axis=1)
# customer_join['membership_period']  = delta.apply(lambda x: x.years * 12 + x.months)

In [12]:
uselog['usedate'] = pd.to_datetime(uselog['usedate'])
uselog['연월'] = uselog['usedate'].dt.strftime('%Y%m')
userlog_months = uselog.groupby(['customer_id', '연월'], as_index = False).agg(count = ('log_id', 'count'))
userlog_months.head()

Unnamed: 0,customer_id,연월,count
0,AS002855,201804,4
1,AS002855,201805,5
2,AS002855,201806,5
3,AS002855,201807,5
4,AS002855,201808,3


In [13]:
uselog_customer = userlog_months.groupby(
    'customer_id', as_index = False).agg(
    ['mean', 'median', 'max', 'min'])['count']
uselog_customer = uselog_customer.reset_index()

In [14]:
uselog['weekday'] = uselog['usedate'].dt.day_name()
uselog_weekday = uselog.groupby(['customer_id','연월', 'weekday'], as_index = False).agg(count = ('log_id', 'count'))
uselog_weekday.head()

Unnamed: 0,customer_id,연월,weekday,count
0,AS002855,201804,Saturday,4
1,AS002855,201805,Saturday,4
2,AS002855,201805,Wednesday,1
3,AS002855,201806,Saturday,5
4,AS002855,201807,Saturday,4


In [15]:
uselog_weekday = uselog_weekday.groupby('customer_id', as_index = False).agg(count = ('count', 'max'))

In [16]:
uselog_weekday['routine_flg'] = 0
uselog_weekday['routine_flg'] = uselog_weekday['routine_flg'].where(uselog_weekday['count']<4, 1) # 조건에 해당되지 않는 경우에 1로 수정

In [17]:
merge_uselog_customer = customer_join.merge(uselog_customer, on = 'customer_id')
merge_uselog_weekday = merge_uselog_customer.merge(uselog_weekday.drop(['count'], axis = 1), on = 'customer_id')
merge_uselog_weekday.head()

Unnamed: 0,customer_id,gender,start_date,end_date,is_deleted,campaign_name,class_name,price,calc_date,membership_period,mean,median,max,min,routine_flg
0,OA832399,F,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.833333,5.0,8,2,1
1,PL270116,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,5.083333,5.0,7,3,1
2,OA974876,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.583333,5.0,6,3,1
3,HD024127,F,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.833333,4.5,7,2,1
4,IK271057,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,3.75,3.5,5,2,1


In [18]:
merge_uselog_weekday.isnull().sum()

customer_id             0
gender                  0
start_date              0
end_date             2842
is_deleted              0
campaign_name           0
class_name              0
price                   0
calc_date               0
membership_period       0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
dtype: int64

In [19]:
merge_uselog_weekday.columns

Index(['customer_id', 'gender', 'start_date', 'end_date', 'is_deleted',
       'campaign_name', 'class_name', 'price', 'calc_date',
       'membership_period', 'mean', 'median', 'max', 'min', 'routine_flg'],
      dtype='object')

In [20]:
merge_uselog_weekday.head()

Unnamed: 0,customer_id,gender,start_date,end_date,is_deleted,campaign_name,class_name,price,calc_date,membership_period,mean,median,max,min,routine_flg
0,OA832399,F,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.833333,5.0,8,2,1
1,PL270116,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,5.083333,5.0,7,3,1
2,OA974876,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.583333,5.0,6,3,1
3,HD024127,F,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,4.833333,4.5,7,2,1
4,IK271057,M,2015-05-01,,0,2_일반,0_종일,10500,2019-04-30,47,3.75,3.5,5,2,1


In [22]:
customer_stay = merge_uselog_weekday[merge_uselog_weekday['is_deleted'] == 0] # stay
customer_end = merge_uselog_weekday[merge_uselog_weekday['is_deleted'] == 1] # end
customer_stay.to_csv('data/스포츠센터데이터/customer_stay.csv')
customer_end.to_csv('data/스포츠센터데이터/customer_end.csv')
merge_uselog_weekday.to_csv('pre_data.csv')

In [None]:
# 클래스, 캠페인 여부 별 회원유지기간
# 요일별 방문 수 
# 여자 남자 중에 누가 더 많이 오는지? 
# 남여별 정기적 횟수
# 클래스별 그만둔회원수
# end_date와 routineflg의 관계 
# 이용이 가정 적은, 많은 달과 요일
# 등록한 계절 별 회원기간
# 클래스 별 routine flg