# 데이터 파악 후 결측치 제거

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

data = pd.read_csv("data/train.csv", encoding="cp949")

int_columns = data.select_dtypes(include='float64').columns
imputer = SimpleImputer(strategy='mean')
data[int_columns] = imputer.fit_transform(data[int_columns])

print(data.isnull().sum())

custid       0
gender       0
총구매액         0
구매건수         0
평균구매가격       0
평균할부개월수      0
구매브랜드종류      0
내점일수         0
수입상품_구매비율    0
주말방문비율       0
가을_구매건수      0
겨울_구매건수      0
봄_구매건수       0
여름_구매건수      0
아침_구매건수      0
저녁_구매건수      0
점심_구매건수      0
주구매코너        0
dtype: int64


# 문자형데이터를 수치형데이터로 변환

In [3]:
from sklearn.preprocessing import LabelEncoder as LE
categorical_columns = ["주구매코너"]

lebel = LE()
for column in categorical_columns:
    data[column] = lebel.fit_transform(data[column])

data.head(5)
data["주말방문비율"].value_counts()

주말방문비율
0.0      440
50.0     345
33.3     264
100.0    234
25.0     190
        ... 
52.1       1
29.7       1
40.5       1
54.2       1
78.3       1
Name: count, Length: 459, dtype: int64

# 남자데이터 업 샘플링

In [5]:
from sklearn.utils import resample
men_data = data[data['gender'] == 1]

men_data_upsampled = resample(men_data, 
                               replace=True,    
                               n_samples=3479,   
                               random_state=42)  

women_data = data[data['gender'] == 0]

balanced_data = pd.concat([men_data_upsampled, women_data], axis=0)

balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_data['gender'].value_counts())


gender
0    3479
1    3479
Name: count, dtype: int64


In [7]:
correlation_matrix = balanced_data.corr()
gender_correlation = correlation_matrix["gender"].sort_values(ascending=False)

# 결과 출력
print(gender_correlation)


gender       1.000000
custid       0.298257
아침_구매건수      0.110708
총구매액         0.107826
점심_구매건수      0.107252
구매건수         0.106457
여름_구매건수      0.093124
내점일수         0.092170
구매브랜드종류      0.091867
가을_구매건수      0.091127
봄_구매건수       0.077215
평균구매가격       0.077212
겨울_구매건수      0.071596
주말방문비율       0.021780
저녁_구매건수     -0.013856
평균할부개월수     -0.014801
수입상품_구매비율   -0.022997
주구매코너       -0.037081
Name: gender, dtype: float64


# train_test_split

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = balanced_data.drop(["custid", "gender", "주말방문비율"], axis=1)

Y = balanced_data["gender"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



# 랜덤포레스트

In [11]:
from sklearn.ensemble import RandomForestClassifier as rfc

rf_model = rfc(n_estimators = 100 , random_state = 42)
rf_model = rf_model.fit(X_train , Y_train)
Y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(Y_test , Y_pred_rf)
accuracy_rf

0.8879310344827587

# SVM

In [13]:
from sklearn.svm import SVC

svm_model = SVC(random_state=42)
svm_model.fit(X_train, Y_train)

Y_pred_svm = svm_model.predict(X_test)

accuracy_svm = accuracy_score(Y_test, Y_pred_svm)
accuracy_svm

0.5272988505747126

# K 근접 이웃

In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)  # k값을 3으로 설정
knn_model.fit(X_train, Y_train)

Y_pred_knn = knn_model.predict(X_test)

# 정확도 계산
accuracy_knn = accuracy_score(Y_test, Y_pred_knn)

accuracy_knn


0.6408045977011494

# 크로스 벨리데이션 / 랜덤포레스트

In [17]:
from sklearn.model_selection import cross_val_score

model = rfc()

scores = cross_val_score(model, X_train, Y_train ,  cv=5)  
scores

array([0.82944345, 0.80233603, 0.82749326, 0.84456424, 0.8230009 ])

# 크로스 벨리데이션 / k근접 이웃

In [18]:
model = KNeighborsClassifier()

scores = cross_val_score(model, X_train, Y_train ,  cv=5)  
scores

array([0.62387792, 0.63432165, 0.60197664, 0.62713387, 0.60017969])

# ShuffleSplit(무작위 분할)

In [21]:
from sklearn.model_selection import ShuffleSplit
model = rfc()
sscv = ShuffleSplit(test_size=.2, train_size=.8, n_splits=10)
scores = cross_val_score(model, X_train, Y_train, cv=sscv)
scores.mean()

0.832854578096948

In [22]:
from sklearn.model_selection import ShuffleSplit
model = SVC()
sscv = ShuffleSplit(test_size=.2, train_size=.8, n_splits=10)
scores = cross_val_score(model, X_train, Y_train, cv=sscv)
scores.mean()

0.5649012567324955

# 정밀도 재현율도 봐보기

In [23]:
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import classification_report
rf_model = rfc(n_estimators = 100 , random_state = 42)
rf_model = rf_model.fit(X_train , Y_train)
Y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(Y_test , Y_pred_rf)

# 정밀도, 재현율, F1-score 출력
print(classification_report(Y_test, Y_pred_rf))


              precision    recall  f1-score   support

           0       0.89      0.88      0.89       695
           1       0.88      0.90      0.89       697

    accuracy                           0.89      1392
   macro avg       0.89      0.89      0.89      1392
weighted avg       0.89      0.89      0.89      1392



### 1. 데이터 파악
### 2. 결측치 확인 후 제거 or 변환 
### 3. 머신러닝을 하기위해선 문자형을 수치형으로 전환
### 4. 예측하고자하는 것에 비율이 맞지않다면 업샘플링 혹은 다른 방법을 생각 (ex 성별)
### 5. 머신러닝방법을 선택 (ex train_test , cross Validation )
### 6. 하이퍼파라미터 조절
### 7. 정확도 정밀도 재현율 f1 - score 보고 최적의 알고리즘 찾기