In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv("galaxy_users.csv")
df.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


### Q1.

In [5]:
df_q1 = df.loc[:, "OnlineSecurity":"StreamingMovies"].copy()
df_q1 = df_q1.replace({"Yes": 1, "No": 0})
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,0,0,0


In [6]:
df_q1["OnlineSecurity"].unique()

array([0, 1, 'No internet service'], dtype=object)

In [None]:
for n in range(df_q1.shape[1]):
    print(df_q1.columns[n], ": ", df_q1.iloc[:, n].unique())

In [12]:
# df_q1.unique() # AttributeError: 'DataFrame' object has no attribute 'unique'
# df_q1.drop_duplicates()
# df_q1.apply(lambda x: [x.unique()]).explode() # 시험버전의 Pandas에서는 이 코드를 권장
df_q1.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [15]:
df_q1_sub = df_q1.loc[df_q1["OnlineSecurity"] != "No internet service", ]
len(df_q1_sub)

5512

In [16]:
df_q1_sub.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,1,1,1


In [18]:
# df_q1.loc[df_q1["OnlineSecurity"] == "No internet service", ]

In [19]:
df_q1_sub = df_q1.replace("No internet service", np.nan).dropna()
len(df_q1_sub)

5512

In [21]:
df_q1_sub["cnt"] = df_q1_sub.sum(axis = 1)

In [22]:
df_q1_sub.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,cnt
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,2.0


In [25]:
ser_cnt = df_q1_sub["cnt"].value_counts()
ser_cnt

cnt
3.0    1117
2.0    1033
1.0     966
4.0     850
0.0     693
5.0     569
6.0     284
Name: count, dtype: int64

In [27]:
round(ser_cnt[1] / ser_cnt[6], 1)

3.4

### Q2.

In [29]:
df_q2 = df[["tenure", "MonthlyCharges", "TotalCharges"]].copy()
df_q2.head(2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5


In [32]:
21 // 5

4

In [None]:
df_q2["month"] = df_q2["TotalCharges"] // df_q2["MonthlyCharges"]
df_q2.head(2)

In [37]:
df_q2.iloc[:, [0, 1, 3]].corr().round(3) # 0.999

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.247,0.999
MonthlyCharges,0.247,1.0,0.246
month,0.999,0.246,1.0


### Q3.

In [None]:
col1 = ["SeniorCitizen", "Partner", "Dependents", "tenure", "MonthlyCharges", "TotalCharges"]
col2 = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingMovies", "PaperlessBilling"]

df_q3 = df[["Churn"] + col1 + col2].copy()

In [39]:
df_q3 = df_q3.replace({"Yes": 1, "No": 0})

In [44]:
# df_q3_obj = df_q3.select_dtypes(exclude = "number") # 시험버전에서는 구현되어있으나 버그로 동작 ❌
df_q3_obj = df_q3.loc[:, df_q3.dtypes == "object"]
df_q3_obj.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [45]:
df_q3_obj.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service


In [46]:
df_q3 = df_q3.replace({"No internet service": -1})

In [49]:
df_train, df_test = train_test_split(df_q3, train_size = 0.7, random_state = 123)
len(df_train), len(df_test)

(4922, 2110)

In [None]:
model_nor = MinMaxScaler().fit(df_train)
arr_train_nor = model_nor.transform(df_train)
arr_test_nor  = model_nor.transform(df_test)

In [52]:
arr_train_nor[:1, ]

array([[1.        , 0.        , 0.        , 0.        , 0.08450704,
        0.81116094, 0.07551927, 0.5       , 1.        , 0.5       ,
        0.5       , 1.        , 1.        ]])

In [54]:
pd.DataFrame([model_nor.data_max_,
              model_nor.data_min_],
             columns = df_train.columns)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,1.0,1.0,1.0,1.0,72.0,118.75,8684.8,1.0,1.0,1.0,1.0,1.0,1.0
1,0.0,0.0,0.0,0.0,1.0,18.4,18.8,-1.0,-1.0,-1.0,-1.0,-1.0,0.0


In [55]:
df_train_nor = pd.DataFrame(arr_train_nor, columns = df_train.columns)
df_train_nor.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,1.0,0.0,0.0,0.0,0.084507,0.811161,0.075519,0.5,1.0,0.5,0.5,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.607374,0.006987,0.5,0.5,0.5,0.5,0.5,1.0


In [58]:
model_lr = LogisticRegression(random_state = 123)
model_lr.fit(X = arr_train_nor[:, 1:],
             y = arr_train_nor[:, 0])
pred = model_lr.predict(arr_test_nor[:, 1:])
pred[:4]

array([0., 0., 0., 0.])

In [60]:
round(f1_score(y_true = arr_test_nor[:, 0], y_pred = pred), 2)

0.55

### Q. 특정 범주를 제외한 나머지 모든 범주를 지정한 값으로 치환하려면?
※ 색상이 "D" 또는 "H"가 아닌 나머지 모든 범주형 변수의 원소를 -1로 치환

In [79]:
df_dia = pd.read_csv("../diamonds.csv")
df_dia.tail(1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64


In [67]:
# 시험 버전은 리스트를 하나 더 씀.
# df_dia.iloc[:, 1:4].apply(lambda x: [x.unique()]).explode().explode()

ser_u = df_dia.iloc[:, 1:4].apply(lambda x: x.unique()).explode()

In [71]:
ser_repl = pd.Series(np.where(ser_u.isin(["D", "H"]), ser_u, -1),
                     index = ser_u)
ser_repl.to_dict()

{'Ideal': -1,
 'Premium': -1,
 'Good': -1,
 'Very Good': -1,
 'Fair': -1,
 'E': -1,
 'I': -1,
 'J': -1,
 'H': 'H',
 'F': -1,
 'G': -1,
 'D': 'D',
 'SI2': -1,
 'SI1': -1,
 'VS1': -1,
 'VS2': -1,
 'VVS2': -1,
 'VVS1': -1,
 'I1': -1,
 'IF': -1}

In [73]:
df_dia_repl = df_dia.replace(ser_repl)
df_dia_repl.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,-1,-1,-1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,-1,-1,-1,59.8,61.0,326,3.89,3.84,2.31


In [75]:
df_dia_repl.iloc[:, 1:4].apply(lambda x: x.unique())

cut              [-1]
color      [-1, H, D]
clarity          [-1]
dtype: object

In [83]:
def replace_values(x):
    if (x != "D") & (x != "H"):
        return -1

In [84]:
df_dia_repl = df_dia.iloc[:, 1:4].map(replace_values)
df_dia_repl.head(2)

Unnamed: 0,cut,color,clarity
0,-1,-1.0,-1
1,-1,-1.0,-1


In [85]:
df_dia_repl.apply(lambda x: x.unique())

cut               [-1]
color      [-1.0, nan]
clarity           [-1]
dtype: object