# *Data Load*

In [1]:
from pshmodule.utils import filemanager as fm

In [2]:
# Load the user table through the project's filemanager helper.
# NOTE(review): the source is a pickle file; unpickling can execute arbitrary code,
# so this should only ever point at a trusted, locally produced file.
df = fm.load('../../data/df_users_whole_info.pickle')

extension : .pickle
Loaded 396808 records from ../../data/df_users_whole_info.pickle


In [3]:
# Preview the first 10 rows to check the available columns and their encodings.
df.head(10)

Unnamed: 0,user_uuid,marketing_channel,sex,age_group,date_joined,os,visits,revenue
0,05b0058df377da90c21c585649cbf415,0,0,1,4,0,23,304.0
1,28b7062943065c84bb902a0c1d018398,1,0,0,4,0,5,195.0
2,f8c30b791fbe2d6c80585b20fb6fdbe0,0,0,1,4,0,1,78.0
3,91cd8505d196bea35d48f4f1eacbc106,0,1,1,4,1,18,202.0
4,4b34dfb277abb6ad04dcf6968fe416d0,0,0,1,4,0,9,197.0
5,cca426a2b08ffba17a0bb3a6d5e704b2,0,1,2,3,1,1,906.0
6,feb4fa108eaf1ce3eeebf551deb2fb40,0,0,1,4,0,2,64.0
7,654451a4952dc807938fba26c40f0513,0,1,0,4,1,6,49.0
8,c07f1228891d03c5ea029366a9886be1,1,1,1,4,1,18,149.0
9,ce49bb23b31252c914364fcd7c48111e,1,1,3,4,1,4,825.0


# 

# *Data Split*

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Keep only the modelling columns: the six feature columns first, the target
# (marketing_channel) last. Identifier columns are dropped here.
model_columns = [
    'sex',
    'age_group',
    'date_joined',
    'os',
    'visits',
    'revenue',
    'marketing_channel',
]
df = df[model_columns]

In [6]:
# Class distribution of the target before any rebalancing.
df['marketing_channel'].value_counts()

0    321485
1     75323
Name: marketing_channel, dtype: int64

#### marketing_channel 불균형 수정

In [7]:
# Shuffle all rows and renumber the index from 0.
# A fixed seed makes the shuffle -- and therefore which rows survive any later
# positional slicing -- identical on every run, so results can be reproduced.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Separate the rows by target value. The boolean mask is built once with a
# vectorized comparison instead of calling a Python lambda on every row twice.
is_zero = df.marketing_channel == 0
df_zero = df[is_zero]   # rows where marketing_channel == 0
df = df[~is_zero]       # all remaining rows; `df` is rebound to this subset

In [9]:
# Row counts of the two subsets, one per line: class-0 rows first, then the rest.
print(len(df_zero), len(df), sep='\n')

321485
75323


In [10]:
# Undersample class 0 to its first 80000 rows, then recombine with the other rows.
# Filtering class 0 out of `df` changes nothing the first time this cell runs
# (`df` holds no class-0 rows at that point), but it keeps the cell idempotent:
# re-executing it no longer appends the class-0 rows a second time.
df_zero = df_zero[:80000]
df = pd.concat([df_zero, df[df.marketing_channel != 0]])

In [11]:
# Class distribution of the target after undersampling.
df['marketing_channel'].value_counts()

0    80000
1    75323
Name: marketing_channel, dtype: int64

# 

In [12]:
# Hold out 10% of the rows for evaluation; random_state pins the split for a given df.
train_data, test_data = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [13]:
# Row counts, one per line: full frame, training part, test part.
print(len(df), len(train_data), len(test_data), sep='\n')

155323
139790
15533


In [14]:
# Class distribution of the target inside the training split.
train_data['marketing_channel'].value_counts()

0    72054
1    67736
Name: marketing_channel, dtype: int64

In [15]:
# Separate features from the target by column name rather than by position, so
# the split stays correct even if the selected columns are reordered or extended.
# With the current column selection this yields exactly the same frames as
# slicing the first six columns / the last column.
X_train = train_data.drop(columns='marketing_channel')
y_train = train_data['marketing_channel']

X_test = test_data.drop(columns='marketing_channel')
y_test = test_data['marketing_channel']

In [16]:
# Sanity check: feature column names and the first three target values, per split.
print(f"X_train : {X_train.columns}")
print(f"y_train : {y_train.iloc[:3]}")

print(f"X_test : {X_test.columns}")
print(f"y_test : {y_test.iloc[:3]}")

X_train : Index(['sex', 'age_group', 'date_joined', 'os', 'visits', 'revenue'], dtype='object')
y_train : 315163    1
46153     0
20981     0
Name: marketing_channel, dtype: int64
X_test : Index(['sex', 'age_group', 'date_joined', 'os', 'visits', 'revenue'], dtype='object')
y_test : 311259    1
3387      0
216051    1
Name: marketing_channel, dtype: int64


# 

# *SVM*

### train

In [17]:
from sklearn.svm import SVC
from joblib import dump

In [18]:
# Train a support vector classifier with default hyper-parameters.
# fit() hands back the estimator itself, so creation and training are chained.
svm_clf = SVC().fit(X_train, y_train)

In [19]:
# Persist the fitted classifier to disk with joblib.
# NOTE(review): the file name says 100000, but the undersampling cell keeps 80000
# class-0 rows and the load section reads svm_80000.joblib -- confirm which
# artifact this run is meant to write before re-executing.
dump(svm_clf, '../result/svm_100000.joblib') 

['../result/svm_100000.joblib']

### load

In [17]:
from joblib import load
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [18]:
# Restore a previously saved classifier instead of retraining; this rebinds svm_clf.
# NOTE(review): this reads svm_80000.joblib while the training section writes
# svm_100000.joblib, so the model evaluated below may not be the one just trained -- confirm.
# joblib artifacts are pickle-based: only load files from a trusted source.
svm_clf = load('../result/svm_80000.joblib')

In [19]:
# Smoke test on one hand-written sample, with values in the order
# sex, age_group, date_joined, os, visits, revenue.
sample = np.array([[0, 0, 4, 0, 5, 195.0]])
pred = svm_clf.predict(sample)
print(pred)

[1]




In [20]:
# Predict a class label for every row of the held-out test features.
y_pred = svm_clf.predict(X_test)

In [21]:
# Compare how often each class occurs in the true labels and in the predictions.
# Vectorized comparisons replace the hand-written counting loops; int() keeps the
# counters plain Python integers, as they were before.
origin_zero = int((y_test == 0).sum())
origin_one = int((y_test == 1).sum())

pred_zero = int((y_pred == 0).sum())
pred_one = int((y_pred == 1).sum())

print(f"origin zero : {origin_zero}")
print(f"predict zero : {pred_zero}")
print(f"origin one : {origin_one}")
print(f"predict one : {pred_one}")

origin zero : 7946
predict zero : 6190
origin one : 7587
predict one : 9343


In [22]:
# Show the first 40 true labels next to the first 40 predictions for a quick visual check.
print(f"y_test : {y_test.iloc[:40].tolist()}")
print(f"y_pred : {y_pred[:40].tolist()}")

y_test : [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0]
y_pred : [1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0]


In [23]:
# Overall accuracy and macro-averaged F1 on the test split, printed to five decimals.
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print('Accuacy : {: .5f}'.format(accuracy))
print('F1 Score : {: .5f}'.format(macro_f1))

Accuacy :  0.61231
F1 Score :  0.60915


# 

### 210000
Accuracy :  0.73774<br>
F1 Score :  0.42454<br><br>
### 100000
Accuracy :  0.61592<br>
F1 Score :  0.60391<br><br>
### 80000
Accuracy :  0.62570<br>
F1 Score :  0.62267

# 