# 실습 내용

- Mobile 데이터로 모델링합니다.
- Logistic Regression 알고리즘으로 모델링합니다.

![image.png](attachment:57e957b4-a9e0-451e-a5a9-e9f6994f0498.png)

# 1.환경 준비

- 기본 라이브러리와 대상 데이터를 가져와 이후 과정을 준비합니다.

In [1]:
# Load the libraries used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter would also hide any scikit-learn
# ConvergenceWarning raised when the model is fitted later on — confirm
# that hiding it is intended.
warnings.filterwarnings(action='ignore')
# IPython magic: render inline figures in 'retina' (high-resolution) format
%config InlineBackend.figure_format='retina'

In [2]:
# Read the mobile customer churn CSV (hosted on GitHub) into a DataFrame
path = 'https://raw.githubusercontent.com/jangrae/csv/master/mobile_cust_churn.csv'
data = pd.read_csv(filepath_or_buffer=path)

# 2.데이터 이해

- 분석할 데이터를 충분히 이해할 수 있도록 다양한 탐색 과정을 수행합니다.

In [3]:
# Preview the first five rows of the raw data
data.head(n=5)

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
0,1,0,31953,0,6,313378,161,0,4,unsat,little,no,STAY
1,2,1,36147,0,13,800586,244,0,6,unsat,little,considering,STAY
2,3,1,27273,230,0,305049,201,16,15,unsat,very_little,perhaps,STAY
3,4,0,120070,38,33,788235,780,3,2,unsat,very_high,considering,LEAVE
4,5,1,29215,208,85,224784,241,21,1,very_unsat,little,never_thought,STAY


**데이터 설명**

- COLLEGE: 대학 졸업여부
- INCOME: 연수입
- OVERAGE: 월평균 초과사용 시간(분)
- LEFTOVER: 월평균 잔여시간비율(%)
- HOUSE: 집값
- HANDSET_PRICE: 스마트폰 가격
- OVER_15MINS_CALLS_PER_MONTH: 월평균 장기통화(15분이상) 횟수
- AVERAGE_CALL_DURATION: 평균 통화 시간
- REPORTED_SATISFACTION: 만족도 설문조사 결과
- REPORTED_USAGE_LEVEL: 사용도 자가진단 결과
- CONSIDERING_CHANGE_OF_PLAN: 향후 변경계획 설문조사 결과
- CHURN: 이탈(번호이동) 여부 (Target 변수)

In [4]:
# Summarize the columns: names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           20000 non-null  int64 
 1   COLLEGE                      20000 non-null  int64 
 2   INCOME                       20000 non-null  int64 
 3   OVERAGE                      20000 non-null  int64 
 4   LEFTOVER                     20000 non-null  int64 
 5   HOUSE                        20000 non-null  int64 
 6   HANDSET_PRICE                20000 non-null  int64 
 7   OVER_15MINS_CALLS_PER_MONTH  20000 non-null  int64 
 8   AVERAGE_CALL_DURATION        20000 non-null  int64 
 9   REPORTED_SATISFACTION        20000 non-null  object
 10  REPORTED_USAGE_LEVEL         20000 non-null  object
 11  CONSIDERING_CHANGE_OF_PLAN   20000 non-null  object
 12  CHURN                        20000 non-null  object
dtypes: int64(9), object(4)
memory usage: 2.0+ MB

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,10000.5,0.5024,80281.44775,85.97955,23.89865,493155.26425,389.6151,8.0007,6.00225
std,5773.647028,0.500007,41680.586319,85.992324,26.816645,252407.884692,213.820682,8.925418,4.402558
min,1.0,0.0,20007.0,-2.0,0.0,150002.0,130.0,0.0,1.0
25%,5000.75,0.0,42217.0,0.0,0.0,263714.25,219.0,1.0,2.0
50%,10000.5,1.0,75366.5,59.0,14.0,452259.5,326.0,4.0,5.0
75%,15000.25,1.0,115881.75,179.0,41.0,702378.0,533.25,15.0,10.0
max,20000.0,1.0,159983.0,335.0,89.0,999996.0,899.0,29.0,15.0


In [6]:
# Class balance of the target: how many customers stayed vs. left
data.loc[:, 'CHURN'].value_counts()

CHURN
STAY     10148
LEAVE     9852
Name: count, dtype: int64

In [7]:
# Pairwise correlation between the numeric columns (non-numeric columns are skipped)
data.corr(numeric_only=True)

Unnamed: 0,id,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION
id,1.0,-0.005557,0.003686,-0.00605,0.006069,0.011347,-0.007838,0.001254,-0.00583
COLLEGE,-0.005557,1.0,0.011122,-0.003091,-0.003925,-0.000217,0.00995,-0.007205,-0.00149
INCOME,0.003686,0.011122,1.0,0.000458,0.006515,-0.010964,0.7272,0.002136,-0.007219
OVERAGE,-0.00605,-0.003091,0.000458,1.0,-0.003123,0.002412,0.000324,0.770557,0.000653
LEFTOVER,0.006069,-0.003925,0.006515,-0.003123,1.0,0.00653,0.004004,-0.010411,-0.660285
HOUSE,0.011347,-0.000217,-0.010964,0.002412,0.00653,1.0,-0.007756,0.00741,-0.009359
HANDSET_PRICE,-0.007838,0.00995,0.7272,0.000324,0.004004,-0.007756,1.0,0.00268,-0.00519
OVER_15MINS_CALLS_PER_MONTH,0.001254,-0.007205,0.002136,0.770557,-0.010411,0.00741,0.00268,1.0,0.007769
AVERAGE_CALL_DURATION,-0.00583,-0.00149,-0.007219,0.000653,-0.660285,-0.009359,-0.00519,0.007769,1.0


# 3.데이터 준비

- 전처리 과정을 통해 머신러닝 알고리즘에 사용할 수 있는 형태의 데이터를 준비합니다.

**1) 변수 제거**

In [8]:
# Columns to discard: 'id' is only a row identifier
drop_cols = ['id']

# Remove them from the DataFrame in place
data.drop(columns=drop_cols, inplace=True)

# Check the result
data.head(n=5)

Unnamed: 0,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION,REPORTED_USAGE_LEVEL,CONSIDERING_CHANGE_OF_PLAN,CHURN
0,0,31953,0,6,313378,161,0,4,unsat,little,no,STAY
1,1,36147,0,13,800586,244,0,6,unsat,little,considering,STAY
2,1,27273,230,0,305049,201,16,15,unsat,very_little,perhaps,STAY
3,0,120070,38,33,788235,780,3,2,unsat,very_high,considering,LEAVE
4,1,29215,208,85,224784,241,21,1,very_unsat,little,never_thought,STAY


**2) x, y 분리**

In [9]:
# Name of the target column
target = 'CHURN'

# Features: every column except the target; label: the target column itself
x = data.drop(columns=[target])
y = data[target]

**3) 가변수화**

In [10]:
# Categorical survey columns to turn into dummy (indicator) variables
dumm_cols = [
    'REPORTED_SATISFACTION',
    'REPORTED_USAGE_LEVEL',
    'CONSIDERING_CHANGE_OF_PLAN',
]

# One indicator column per category, dropping the first level of each
# variable and storing the indicators as integers (0/1)
x = pd.get_dummies(
    x,
    columns=dumm_cols,
    drop_first=True,
    dtype=int,
)

# Check the result
x.head(n=5)

Unnamed: 0,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION_sat,REPORTED_SATISFACTION_unsat,REPORTED_SATISFACTION_very_sat,REPORTED_SATISFACTION_very_unsat,REPORTED_USAGE_LEVEL_high,REPORTED_USAGE_LEVEL_little,REPORTED_USAGE_LEVEL_very_high,REPORTED_USAGE_LEVEL_very_little,CONSIDERING_CHANGE_OF_PLAN_considering,CONSIDERING_CHANGE_OF_PLAN_never_thought,CONSIDERING_CHANGE_OF_PLAN_no,CONSIDERING_CHANGE_OF_PLAN_perhaps
0,0,31953,0,6,313378,161,0,4,0,1,0,0,0,1,0,0,0,0,1,0
1,1,36147,0,13,800586,244,0,6,0,1,0,0,0,1,0,0,1,0,0,0
2,1,27273,230,0,305049,201,16,15,0,1,0,0,0,0,0,1,0,0,0,1
3,0,120070,38,33,788235,780,3,2,0,1,0,0,0,0,1,0,1,0,0,0
4,1,29215,208,85,224784,241,21,1,0,0,0,1,0,1,0,0,0,1,0,0


**4) 학습용, 평가용 데이터 분리**

In [11]:
# Import the splitting helper
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

# 4.모델링

- 본격적으로 모델을 선언하고 학습하고 평가하는 과정을 진행합니다.

In [12]:
# Step 1: import the estimator and the evaluation metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [13]:
# Step 2: declare the model with scikit-learn's default hyperparameters
# NOTE(review): the features are on very different scales (e.g. HOUSE vs.
# COLLEGE in the describe() output) and warnings are silenced at the top of
# the notebook, so a non-converged fit would go unnoticed — consider scaling
# the features or raising max_iter, and verify convergence.
model = LogisticRegression()

In [14]:
# Step 3: fit the model on the training split
model.fit(x_train, y_train)

In [15]:
# Step 4: predict class labels for the held-out test split
y_pred = model.predict(x_test)

In [16]:
# Step 5: evaluate — confusion matrix first, then per-class precision/recall/F1
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[1795 1140]
 [1104 1961]]
              precision    recall  f1-score   support

       LEAVE       0.62      0.61      0.62      2935
        STAY       0.63      0.64      0.64      3065

    accuracy                           0.63      6000
   macro avg       0.63      0.63      0.63      6000
weighted avg       0.63      0.63      0.63      6000



In [19]:
# Predicted class probabilities; columns are ordered like model.classes_
p = model.predict_proba(x_test)

# Probability of the 'STAY' class, located by label instead of assuming it is
# column 1. Kept as a 2-D column vector, as before.
stay_idx = list(model.classes_).index('STAY')
p1 = p[:, [stay_idx]]

# Custom decision threshold: predict 'STAY' when P(STAY) exceeds it
# (the default predict() behaves like a 0.5 cut-off for two classes).
threshold = 0.45
y_pred2 = np.where(p1.ravel() > threshold, 'STAY', 'LEAVE')

# Compare the default predictions with the custom-threshold predictions
print(classification_report(y_test, y_pred))
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

       LEAVE       0.62      0.61      0.62      2935
        STAY       0.63      0.64      0.64      3065

    accuracy                           0.63      6000
   macro avg       0.63      0.63      0.63      6000
weighted avg       0.63      0.63      0.63      6000

              precision    recall  f1-score   support

       LEAVE       0.67      0.51      0.58      2935
        STAY       0.62      0.76      0.68      3065

    accuracy                           0.64      6000
   macro avg       0.64      0.64      0.63      6000
weighted avg       0.64      0.64      0.63      6000

