In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv("galaxy_users.csv")
df.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


### Q1.

In [3]:
df_q1 = df.loc[:, "OnlineSecurity":"StreamingMovies"].copy()
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [4]:
df_q1["OnlineSecurity"].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [5]:
# df_q1.unique()
# df_q1.drop_duplicates()
for n_col in df_q1.columns:
    print(n_col, ": ", df_q1[n_col].unique())

OnlineSecurity :  ['No' 'Yes' 'No internet service']
OnlineBackup :  ['Yes' 'No' 'No internet service']
DeviceProtection :  ['No' 'Yes' 'No internet service']
TechSupport :  ['No' 'Yes' 'No internet service']
StreamingTV :  ['No' 'Yes' 'No internet service']
StreamingMovies :  ['No' 'Yes' 'No internet service']


In [6]:
df_q1.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [7]:
df_q1_sub = df_q1.loc[df_q1["OnlineSecurity"] != "No internet service"]
len(df_q1), len(df_q1_sub)

(7032, 5512)

In [8]:
df_q1_sub.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes


In [9]:
df_q1.loc[df_q1["OnlineSecurity"] == "No internet service"]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
11,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
16,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
21,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
22,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
33,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
...,...,...,...,...,...,...
7006,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7008,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7009,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7019,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [10]:
df_q1_sub = df_q1_sub.replace({"Yes": 1, "No": 0})
df_q1_sub.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,0,0,0


In [11]:
df_q1_sub["cnt"] = df_q1_sub.sum(axis = 1)
df_q1_sub.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,cnt
0,0,1,0,0,0,0,1
1,1,0,1,0,0,0,2


In [12]:
df_q1_sub = df_q1.loc[df_q1["OnlineSecurity"] != "No internet service"].copy()
df_q1_sub["cnt"] = df_q1_sub.apply(lambda x: sum(x == "Yes"), axis = 1)
df_q1_sub.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,cnt
0,No,Yes,No,No,No,No,1
1,Yes,No,Yes,No,No,No,2


In [13]:
df_q1_sub["cnt"].value_counts()

3    1117
2    1033
1     966
4     850
0     693
5     569
6     284
Name: cnt, dtype: int64

In [14]:
966 / 284

3.4014084507042255

In [15]:
round(df_q1_sub["cnt"].value_counts()[1] / df_q1_sub["cnt"].value_counts()[6], 1)

3.4

1. No internet service가 한 row에 들어있지 않은 경우
2. No internet service이외에 다른 기타 원소 또한 있는 경우.

In [16]:
df_q1 = df.loc[:, "OnlineSecurity":"StreamingMovies"].copy()
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [17]:
for n_col in df_q1.columns:
    df_q1 = df_q1.loc[df_q1[n_col] != "No internet services", ]

In [18]:
df_q1_u = df_q1.apply(lambda x: x.unique()).reset_index().melt(id_vars = "index")
df_q1_u["value"].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [19]:
dia = pd.read_csv("../diamonds.csv")
dia = dia[["cut", "color", "clarity"]].copy()
dia.apply(lambda x: x.unique())

cut         [Ideal, Premium, Good, Very Good, Fair]
color                         [E, I, J, H, F, G, D]
clarity    [SI2, SI1, VS1, VS2, VVS2, VVS1, I1, IF]
dtype: object

In [20]:
dia.apply(lambda x: x.unique()).explode().unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair', 'E', 'I', 'J',
       'H', 'F', 'G', 'D', 'SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1',
       'I1', 'IF'], dtype=object)

In [21]:
df_q1 = df.loc[:, "OnlineSecurity":"StreamingMovies"].copy()
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [22]:
df_q1_sub = df_q1.replace({"No internet service": np.nan, 
                           "No": np.nan})
df_q1_sub.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,,Yes,,,,
1,Yes,,Yes,,,


In [23]:
df_q1_sub.dropna()

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
15,Yes,Yes,Yes,Yes,Yes,Yes
28,Yes,Yes,Yes,Yes,Yes,Yes
140,Yes,Yes,Yes,Yes,Yes,Yes
153,Yes,Yes,Yes,Yes,Yes,Yes
167,Yes,Yes,Yes,Yes,Yes,Yes
...,...,...,...,...,...,...
6848,Yes,Yes,Yes,Yes,Yes,Yes
6862,Yes,Yes,Yes,Yes,Yes,Yes
6899,Yes,Yes,Yes,Yes,Yes,Yes
6984,Yes,Yes,Yes,Yes,Yes,Yes


### Q2.

In [24]:
df_q2 = df[["tenure", "MonthlyCharges", "TotalCharges"]].copy()
df_q2.head(2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5


In [25]:
21 // 5

4

In [30]:
round(23 / 5)

5

In [26]:
df_q2["month"] = df_q2["TotalCharges"] // df_q2["MonthlyCharges"]
df_q2.head(2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,month
0,1,29.85,29.85,1.0
1,34,56.95,1889.5,33.0


In [32]:
df_q2.drop("TotalCharges", axis = 1).corr().round(3)

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.247,0.999
MonthlyCharges,0.247,1.0,0.246
month,0.999,0.246,1.0


### Q3.

In [3]:
col1 = ["SeniorCitizen", "Partner", "Dependents", "tenure", "MonthlyCharges", "TotalCharges"]
col2 = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingMovies", "PaperlessBilling"]
df_q3 = df[["Churn"] + col1 + col2].copy()
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,No,0,Yes,No,1,29.85,29.85,No,Yes,No,No,No,Yes
1,No,0,No,No,34,56.95,1889.5,Yes,No,Yes,No,No,No


In [4]:
df_q3.dtypes

Churn                object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
MonthlyCharges      float64
TotalCharges        float64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingMovies      object
PaperlessBilling     object
dtype: object

In [5]:
df_q3 = df_q3.replace({"Yes": 1, "No": 0})
df_q3.dtypes

Churn                 int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
MonthlyCharges      float64
TotalCharges        float64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingMovies      object
PaperlessBilling      int64
dtype: object

In [6]:
df_q3_obj = df_q3[df_q3.dtypes[df_q3.dtypes == "object"].index]
df_q3_obj.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [7]:
df_q3_obj = df_q3.select_dtypes(exclude = "number")
df_q3_obj.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [8]:
df_q3_obj.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service


In [18]:
df_q3 = df_q3.replace({"No internet service": -1})
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [47]:
df_train, df_test = train_test_split(df_q3, train_size = 0.7, random_state = 123)
len(df_train), len(df_test)

(4922, 2110)

In [48]:
model_nor = MinMaxScaler().fit(df_train)
arr_train_nor = model_nor.transform(df_train)
arr_test_nor  = model_nor.transform(df_test)

In [50]:
model_lr = LogisticRegression(random_state = 123)
model_lr.fit(X = arr_train_nor[:, 1:],
             y = arr_train_nor[:, 0 ])
pred = model_lr.predict(arr_test_nor[:, 1:])
pred[:5]

array([0., 0., 0., 0., 0.])

In [51]:
round(f1_score(y_true = arr_test_nor[:, 0],
               y_pred = pred), 2)

0.55

특정 원소를 제외한 나머지 모든 원소를 일괄로 치환하는 방법

In [9]:
dia = pd.read_csv("../diamonds.csv")
dia.head(1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43


In [12]:
arr_u = dia[["cut", "color", "clarity"]].apply(lambda x: x.unique()).explode().nunique(*)
arr_u

cut            Ideal
cut          Premium
cut             Good
cut        Very Good
cut             Fair
color              E
color              I
color              J
color              H
color              F
color              G
color              D
clarity          SI2
clarity          SI1
clarity          VS1
clarity          VS2
clarity         VVS2
clarity         VVS1
clarity           I1
clarity           IF
dtype: object

In [13]:
ser_repl = pd.Series(np.where(pd.Series(arr_u).isin(["Ideal", "Premium"]), arr_u, -1), 
                     index = arr_u)
ser_repl.to_dict()

{'Ideal': 'Ideal',
 'Premium': 'Premium',
 'Good': -1,
 'Very Good': -1,
 'Fair': -1,
 'E': -1,
 'I': -1,
 'J': -1,
 'H': -1,
 'F': -1,
 'G': -1,
 'D': -1,
 'SI2': -1,
 'SI1': -1,
 'VS1': -1,
 'VS2': -1,
 'VVS2': -1,
 'VVS1': -1,
 'I1': -1,
 'IF': -1}

In [58]:
dia2 = dia.replace(ser_repl)
dia2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,-1,-1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,-1,-1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,-1,-1,-1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,-1,-1,62.4,58.0,334,4.2,4.23,2.63
4,0.31,-1,-1,-1,63.3,58.0,335,4.34,4.35,2.75


In [22]:
df_q3["Partner"] = df[""]

KeyError: ''

In [20]:
df_q3["Partner"] = df_q3["Partner"].apply(lambda x: np.where(x == "Yes", 1, np.where(x == "No", 0 -1)))

ValueError: either both or neither of x and y should be given

In [14]:
xx.apply(lambda x: 1 if x == "Yes" else (0 if x == "No" else -1))

NameError: name 'xx' is not defined

In [None]:
xx.replace(",", regex = True)