# 다중선형회귀분석

In [1]:
# 데이터 구성:Series, DataFrame
import pandas as pd
# 행렬 연산
import numpy as np
# 데이터 시각화
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# 회귀분석
from statsmodels.formula.api import ols
# qqplot, 회귀분석의 상수항 추가
from statsmodels.api import qqplot, add_constant
# 선형 모델 formula(y ~ X1 + X2 + ...)
import statsmodels.formula.api as smf
# 다중공선성 확인
from statsmodels.stats.outliers_influence import variance_inflation_factor
# 학습용/평가용 데이터 분리:train, test
from sklearn.model_selection import train_test_split
# 선형회귀모형
from sklearn.linear_model import LinearRegression
# 변수 선택법(후진제거법)
from sklearn.feature_selection import RFE
# Scale 변환->표준화 회귀계수 산출
from sklearn.preprocessing import StandardScaler
# 평가함수
from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import r2_score

In [2]:
df_raw = pd.read_csv('add_variable_customer.csv', encoding='euc-kr')

In [3]:
df_raw.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df_raw

Unnamed: 0,CustomerId,Gender,Age,AgeGroup,Married,Dependents,noDependents,Referrals,noReferrals,PaperlessBilling,...,SatisScore,TotalExtraDataCharge,AvgRoamCharge,TotalRoamCharge,TenureMonths,AvgCharge,TotalCharge,TotalRevenue,MonthRoamCharge,AvgExtraDataCharge
0,C-10001,여성,37,30,Yes,No,0,Yes,2,Yes,...,3,0,47910,431100,21,83590.476190,1755400,2186500,20528,0
1,C-10002,남성,46,40,No,No,0,No,0,No,...,5,11300,12090,108710,21,68928.571429,1447500,1567510,5176,538
2,C-10003,남성,50,50,No,No,0,No,0,Yes,...,1,0,38080,152100,4,98180.000000,490900,643000,38025,0
3,C-10004,남성,78,70,Yes,No,0,Yes,1,Yes,...,1,0,31410,408720,13,112742.857143,1578400,1987120,31440,0
4,C-10005,여성,75,70,Yes,No,0,Yes,3,Yes,...,1,0,8360,24970,3,94500.000000,378000,402970,8323,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6858,C-17039,여성,20,20,No,No,0,No,0,No,...,4,0,52770,685680,25,64428.000000,1610700,2296380,27427,0
6859,C-17040,남성,40,40,Yes,No,0,Yes,1,Yes,...,1,0,18310,402730,22,101891.304348,2343500,2746230,18305,0
6860,C-17041,남성,22,20,No,No,0,No,0,Yes,...,5,0,21020,42040,14,56600.000000,792400,834440,3002,0
6861,C-17042,남성,21,20,No,No,0,Yes,5,No,...,3,0,2370,160460,79,80107.594937,6328500,6488960,2031,0


In [7]:
##### @Gender : 여성(1), 남성(0)

df_raw['Gender'] = df_raw['Gender'].replace(to_replace='여성', value=1)
df_raw['Gender'] = df_raw['Gender'].replace(to_replace='남성', value=0)

##### @Married : Yes(1), No(0)

df_raw['Married'] = df_raw['Married'].replace(to_replace='Yes', value=1)
df_raw['Married'] = df_raw['Married'].replace(to_replace='No', value=0)

##### @Referrals : Yes(1), No(0)

df_raw['Referrals'] = df_raw['Referrals'].replace(to_replace='Yes', value=1)
df_raw['Referrals'] = df_raw['Referrals'].replace(to_replace='No', value=0)

##### @Dependents: Yes(1), No(0)

df_raw['Dependents'] = df_raw['Dependents'].replace(to_replace='Yes', value=1)
df_raw['Dependents'] = df_raw['Dependents'].replace(to_replace='No', value=0)

##### @PaperlessBilling  : Yes(1), No(0)

df_raw['PaperlessBilling'] = df_raw['PaperlessBilling'].replace(to_replace='Yes', value=1)
df_raw['PaperlessBilling'] = df_raw['PaperlessBilling'].replace(to_replace='No', value=0)

##### @PaymentMethod

df_raw['PaymentMethod'] = df_raw['PaymentMethod'].replace(to_replace='신용카드', value=1)
df_raw['PaymentMethod'] = df_raw['PaymentMethod'].replace(to_replace='계좌이체', value=0)
df_raw['PaymentMethod'] = df_raw['PaymentMethod'].replace(to_replace='이체/메일확인', value=2)

##### @OnlineSecurity

df_raw['OnlineSecurity'] = df_raw['OnlineSecurity'].replace(to_replace='Yes', value=1)
df_raw['OnlineSecurity'] = df_raw['OnlineSecurity'].replace(to_replace='No', value=0)

##### @OnlineBackup  : Yes(1), No(0)

df_raw['OnlineBackup'] = df_raw['OnlineBackup'].replace(to_replace='Yes', value=1)
df_raw['OnlineBackup'] = df_raw['OnlineBackup'].replace(to_replace='No', value=0)

##### @TechSupport  : Yes(1), No(0)

df_raw['TechSupport'] = df_raw['TechSupport'].replace(to_replace='Yes', value=1)
df_raw['TechSupport'] = df_raw['TechSupport'].replace(to_replace='No', value=0)

##### @UnlimitedData : Yes(1), No(0)

df_raw['UnlimitedData'] = df_raw['UnlimitedData'].replace(to_replace='Yes', value=1)
df_raw['UnlimitedData'] = df_raw['UnlimitedData'].replace(to_replace='No', value=0)

In [8]:
df_raw

Unnamed: 0,CustomerId,Gender,Age,AgeGroup,Married,Dependents,noDependents,Referrals,noReferrals,PaperlessBilling,...,SatisScore,TotalExtraDataCharge,AvgRoamCharge,TotalRoamCharge,TenureMonths,AvgCharge,TotalCharge,TotalRevenue,MonthRoamCharge,AvgExtraDataCharge
0,C-10001,1,37,30,1,0,0,1,2,1,...,3,0,47910,431100,21,83590.476190,1755400,2186500,20528,0
1,C-10002,0,46,40,0,0,0,0,0,0,...,5,11300,12090,108710,21,68928.571429,1447500,1567510,5176,538
2,C-10003,0,50,50,0,0,0,0,0,1,...,1,0,38080,152100,4,98180.000000,490900,643000,38025,0
3,C-10004,0,78,70,1,0,0,1,1,1,...,1,0,31410,408720,13,112742.857143,1578400,1987120,31440,0
4,C-10005,1,75,70,1,0,0,1,3,1,...,1,0,8360,24970,3,94500.000000,378000,402970,8323,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6858,C-17039,1,20,20,0,0,0,0,0,0,...,4,0,52770,685680,25,64428.000000,1610700,2296380,27427,0
6859,C-17040,0,40,40,1,0,0,1,1,1,...,1,0,18310,402730,22,101891.304348,2343500,2746230,18305,0
6860,C-17041,0,22,20,0,0,0,0,0,1,...,5,0,21020,42040,14,56600.000000,792400,834440,3002,0
6861,C-17042,0,21,20,0,0,0,1,5,0,...,3,0,2370,160460,79,80107.594937,6328500,6488960,2031,0


In [9]:
df_raw.drop(['CustomerId', 'StartDate', 'EndDate',
             'EndDateTmp', 'ChurnCategory', 'ChurnReason'], axis=1, inplace=True)

## 데이터 분할

In [10]:
# 설명변수, 목표변수 데이터 구분
df_raw_x = df_raw.drop('ChurnScore', axis=1, inplace=False)
df_raw_y = df_raw['ChurnScore']

In [11]:
# 변수명 저장
v_feature_names = df_raw.columns
# StandardScaler 적용
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_raw)
df_scaled = pd.DataFrame(df_scaled, columns=v_feature_names)
df_scaled.head()

Unnamed: 0,Gender,Age,AgeGroup,Married,Dependents,noDependents,Referrals,noReferrals,PaperlessBilling,PaymentMethod,...,SatisScore,TotalExtraDataCharge,AvgRoamCharge,TotalRoamCharge,TenureMonths,AvgCharge,TotalCharge,TotalRevenue,MonthRoamCharge,AvgExtraDataCharge
0,1.011873,-0.579407,-0.744482,1.145911,-0.518244,-0.467801,1.075871,0.003084,0.831651,0.845272,...,-0.234137,-0.276,1.255637,-0.45453,-0.801476,0.200064,-0.554625,-0.577361,0.07595,-0.149867
1,-0.988266,-0.036602,-0.137883,-0.872668,-0.518244,-0.467801,-0.92948,-0.658333,-1.202427,0.845272,...,1.448463,0.117966,-0.794015,-0.790336,-0.801476,-0.212659,-0.656975,-0.746513,-0.937191,0.076937
2,-0.988266,0.204645,0.468715,-0.872668,-0.518244,-0.467801,-0.92948,-0.658333,0.831651,-0.838158,...,-1.916737,-0.276,0.693156,-0.745141,-1.444109,0.610749,-0.974959,-0.999155,1.230649,-0.149867
3,-0.988266,1.893371,1.681912,1.145911,-0.518244,-0.467801,1.075871,-0.327624,0.831651,-0.838158,...,-1.916737,-0.276,0.311492,-0.477841,-1.103891,1.020684,-0.613462,-0.631846,0.796078,-0.149867
4,1.011873,1.712436,1.681912,1.145911,-0.518244,-0.467801,1.075871,0.333792,0.831651,0.845272,...,-1.916737,-0.276,-1.007449,-0.877561,-1.481911,0.50716,-1.012488,-1.064748,-0.729508,-0.149867


In [12]:
ss_df =pd.concat([df_raw_y, df_scaled], axis=1)

In [13]:
ss_df

Unnamed: 0,ChurnScore,Gender,Age,AgeGroup,Married,Dependents,noDependents,Referrals,noReferrals,PaperlessBilling,...,SatisScore,TotalExtraDataCharge,AvgRoamCharge,TotalRoamCharge,TenureMonths,AvgCharge,TotalCharge,TotalRevenue,MonthRoamCharge,AvgExtraDataCharge
0,65,1.011873,-0.579407,-0.744482,1.145911,-0.518244,-0.467801,1.075871,0.003084,0.831651,...,-0.234137,-0.276000,1.255637,-0.454530,-0.801476,0.200064,-0.554625,-0.577361,0.075950,-0.149867
1,66,-0.988266,-0.036602,-0.137883,-0.872668,-0.518244,-0.467801,-0.929480,-0.658333,-1.202427,...,1.448463,0.117966,-0.794015,-0.790336,-0.801476,-0.212659,-0.656975,-0.746513,-0.937191,0.076937
2,71,-0.988266,0.204645,0.468715,-0.872668,-0.518244,-0.467801,-0.929480,-0.658333,0.831651,...,-1.916737,-0.276000,0.693156,-0.745141,-1.444109,0.610749,-0.974959,-0.999155,1.230649,-0.149867
3,99,-0.988266,1.893371,1.681912,1.145911,-0.518244,-0.467801,1.075871,-0.327624,0.831651,...,-1.916737,-0.276000,0.311492,-0.477841,-1.103891,1.020684,-0.613462,-0.631846,0.796078,-0.149867
4,68,1.011873,1.712436,1.681912,1.145911,-0.518244,-0.467801,1.075871,0.333792,0.831651,...,-1.916737,-0.276000,-1.007449,-0.877561,-1.481911,0.507160,-1.012488,-1.064748,-0.729508,-0.149867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6858,59,1.011873,-1.604705,-1.351080,-0.872668,-0.518244,-0.467801,-0.929480,-0.658333,-1.202427,...,0.607163,-0.276000,1.533731,-0.189355,-0.650268,-0.339347,-0.602725,-0.547334,0.531244,-0.149867
6859,68,-0.988266,-0.398472,-0.137883,1.145911,-0.518244,-0.467801,1.075871,-0.327624,0.831651,...,-1.916737,-0.276000,-0.438101,-0.484080,-0.763674,0.715220,-0.359134,-0.424403,-0.070755,-0.149867
6860,33,-0.988266,-1.484082,-1.351080,-0.872668,-0.518244,-0.467801,-0.929480,-0.658333,0.831651,...,1.448463,-0.276000,-0.283033,-0.859781,-1.066089,-0.559700,-0.874737,-0.946840,-1.080662,-0.149867
6861,59,-0.988266,-1.544393,-1.351080,-0.872668,-0.518244,-0.467801,1.075871,0.995209,-1.202427,...,-0.234137,-0.276000,-1.350203,-0.736433,1.391039,0.102023,0.965524,0.598376,-1.144743,-0.149867


In [14]:
# train/test 분리
ss_df_train, ss_df_test = train_test_split(ss_df, test_size = 0.3, random_state = 42) 
print("train data size : {}".format(ss_df_train.shape))
print("test data size : {}".format(ss_df_test.shape))

train data size : (4804, 29)
test data size : (2059, 29)


In [18]:
ss_df_train.columns

Index(['ChurnScore', 'Gender', 'Age', 'AgeGroup', 'Married', 'Dependents',
       'noDependents', 'Referrals', 'noReferrals', 'PaperlessBilling',
       'PaymentMethod', 'OnlineSecurity', 'OnlineBackup', 'TechSupport',
       'UnlimitedData', 'AvgDownloadGB', 'ChurnLabel', 'ChurnScore',
       'CustomerLTV', 'SatisScore', 'TotalExtraDataCharge', 'AvgRoamCharge',
       'TotalRoamCharge', 'TenureMonths', 'AvgCharge', 'TotalCharge',
       'TotalRevenue', 'MonthRoamCharge', 'AvgExtraDataCharge'],
      dtype='object')

In [21]:
# 선형 회귀 분석 : formula(y ~ X1 + X2 + ...) 이용. 상수항(Intercept) 자동 추가. 코드 중간에서 줄 바꿈 “\” 추가
reg_ss_model = smf.ols(formula = "ChurnScore ~ Gender + Age + AgeGroup + Married + Dependents + noDependents + Referrals  \
                                + noReferrals + PaperlessBilling + PaymentMethod + OnlineSecurity + OnlineBackup + TechSupport  \
                                + UnlimitedData + AvgDownloadGB + ChurnLabel + CustomerLTV + SatisScore + TotalExtraDataCharge  \
                                + AvgRoamCharge + TotalRoamCharge + TenureMonths + AvgCharge + TotalCharge + TotalRevenue + MonthRoamCharge  \
                                + AvgExtraDataCharge" , data = ss_df_train)
# 적합
reg_ss_result = reg_ss_model.fit()
print(reg_ss_result.summary())

ValueError: endog has evaluated to an array with multiple columns that has shape (4804, 2). This occurs when the variable converted to endog is non-numeric (e.g., bool or str).