In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
X_train = pd.read_csv("data/X_train.csv", encoding='cp949')
y_train = pd.read_csv("data/y_train.csv", encoding='cp949')

In [3]:
X_train

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.000000,0.000000,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,,보석,본 점,2,1.500000,0.000000,85
...,...,...,...,...,...,...,...,...,...,...
3495,3495,3175200,3042900,,골프,본 점,1,2.000000,1.000000,0
3496,3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40
3497,3497,75000,75000,,주방용품,창원점,1,1.000000,0.000000,0
3498,3498,1875000,1000000,,화장품,본 점,2,1.000000,0.000000,39


In [4]:
X_total = X_train.shape[0]

# Start

In [5]:
def add_columns():
    for idx in range(0, X_total):
        if int(X_train.loc[idx,['환불금액']]>0) :
            X_train.loc[idx, ['환불여부']] = 1
        else:
            X_train.loc[idx, ['환불여부']] = 0

    X_train['gender'] = y_train['gender']

    return X_train.astype({'환불여부':'int'})

add_columns()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기,환불여부,gender
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17,1,0
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1,1,0
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.000000,0.000000,1,0,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16,0,1
4,4,29050000,24000000,,보석,본 점,2,1.500000,0.000000,85,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3495,3175200,3042900,,골프,본 점,1,2.000000,1.000000,0,0,1
3496,3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40,1,1
3497,3497,75000,75000,,주방용품,창원점,1,1.000000,0.000000,0,0,0
3498,3498,1875000,1000000,,화장품,본 점,2,1.000000,0.000000,39,0,0


In [6]:
def pre_processing():
    global X_train

    # 1. pd.get_dummies
    print("---Get Dummies--")
    df_tmp = pd.get_dummies(X_train['주구매상품'])
    df_tmp['cust_id'] = np.arange(0,3500,1)
    print(df_tmp)
    X_train = pd.merge(X_train, df_tmp, on='cust_id')

    #2. drop columns
    X_train = X_train.drop(['환불금액', '주구매상품', '주구매지점'], axis=1)

    # 2. Scaler
    mms = MinMaxScaler()
    X_train = pd.DataFrame(mms.fit_transform(X_train), columns=X_train.columns)
    
pre_processing()

---Get Dummies--
      가공식품  가구  건강식품  골프  구두  기타  남성 캐주얼  남성 트랜디  남성정장  농산물  ...  차/커피  축산가공  \
0        0   0     0   0   0   1       0       0     0    0  ...     0     0   
1        0   0     0   0   0   0       0       0     0    0  ...     0     0   
2        0   0     0   0   0   0       1       0     0    0  ...     0     0   
3        0   0     0   0   0   1       0       0     0    0  ...     0     0   
4        0   0     0   0   0   0       0       0     0    0  ...     0     0   
...    ...  ..   ...  ..  ..  ..     ...     ...   ...  ...  ...   ...   ...   
3495     0   0     0   1   0   0       0       0     0    0  ...     0     0   
3496     0   0     0   0   0   0       0       0     0    0  ...     0     0   
3497     0   0     0   0   0   0       0       0     0    0  ...     0     0   
3498     0   0     0   0   0   0       0       0     0    0  ...     0     0   
3499     0   0     0   0   0   1       0       0     0    0  ...     0     0   

      침구/수예  캐주얼  커리어 

In [8]:
def make_model():
    global X_train, y_train

    X_train = X_train.to_numpy()
    y_train = y_train.gender
    y_train = y_train.to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

    lg_model = LogisticRegression()
    lg_model.fit(X_train, y_train)

    print("훈련 점수 : {}", lg_model.score(X_train, y_train))
    print("평가 점수 : {}", lg_model.score(X_test, y_test))
    
    rst = lg_model.predict(X_test)
    print(rst)

In [9]:
make_model()

훈련 점수 : {} 1.0
평가 점수 : {} 1.0
[1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1
 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0 1
 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0
 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1
 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 1
 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0
 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 1 0
 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1
 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 1 0