In [1]:
import sys
sys.path.append('/home/lipeng/projects/tensorflow')
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
from data_parser import DataParser
from models.deep_models import DeepFM
from IPython.core.display import display, HTML
# Raise the pandas display limits: show up to 500 rows and 200 columns.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 200
# Inject CSS so the notebook container uses 80% of the page width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
def load_data(data_dir='../data/titanic'):
    """Read the Titanic train/test CSV files and fill selected missing values.

    The same fill values are applied to both frames: ``Age`` gets the mean
    ``Age`` of the training file, ``Cabin`` gets the string ``'NULL'`` and
    ``Embarked`` gets ``'Q'``.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        ``'../data/titanic'``, the location that used to be hard-coded, so
        calling ``load_data()`` without arguments behaves as before.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as pandas DataFrames with the fills
        described above applied.

    Raises
    ------
    KeyError
        If either file lacks one of the ``Age``/``Cabin``/``Embarked`` columns.
    """
    import os  # function-scope import so the cell stays self-contained

    train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    # The Age default is derived from the training file only and then reused
    # for the test file, so no statistic is computed from the test data.
    default_values = {'Age': train_data.Age.mean(), 'Cabin': 'NULL', 'Embarked': 'Q'}
    for column, fill_value in default_values.items():
        train_data[column] = train_data[column].fillna(fill_value)
        test_data[column] = test_data[column].fillna(fill_value)
    return train_data, test_data

In [3]:
# Column groups handed to DataParser in a later cell.
numeric_vars = ['Age', 'Fare']
category_vars = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
ignore_vars = ['PassengerId', 'Name', 'Ticket']

train_raw, test_raw = load_data()

# Keep only the modelled columns; the training frame additionally keeps the
# 'Survived' label, which is not selected for the test frame.
test_raw = test_raw[[*numeric_vars, *category_vars]]
train_raw = train_raw[[*numeric_vars, *category_vars, 'Survived']]

In [4]:
# Build the parser from both raw frames and the column groups.
dp = DataParser(
    train_raw,
    test_raw,
    numeric_vars=numeric_vars,
    category_vars=category_vars,
    ignore_vars=ignore_vars,
)

# Hold out 30% of the labelled rows for validation; fixed seed for a repeatable split.
train, valid = train_test_split(train_raw, random_state=0, test_size=0.3)

# Parse the training split first, then the validation split; each call yields
# a (Xv, Xi, y) triple with 'Survived' as the label column.
(Xv_train, Xi_train, y_train), (Xv_valid, Xi_valid, y_valid) = (
    dp.parse(split, label_col='Survived') for split in (train, valid)
)

In [6]:
# Model dimensions are taken from the fitted parser.
feature_size = dp.feature_size
field_size = dp.field_size

# FM-only configuration: the FM part is switched on and the deep part is off.
fm = DeepFM(
    feature_size=feature_size,
    field_size=field_size,
    use_fm=True,
    use_deep=False,
    embedding_size=5,
    optimizer_type="adam",
    loss_type="logloss",
    epoch=1000,
)

In [7]:
# Fit on the training split and pass the held-out split for validation.
# NOTE(review): early_stopping_rounds / greater_is_better / verbose are
# interpreted by the project's DeepFM.fit — confirm their exact meaning there.
fm.fit(
    Xi_train=Xi_train,
    Xv_train=Xv_train,
    y_train=y_train,
    Xi_valid=Xi_valid,
    Xv_valid=Xv_valid,
    y_valid=y_valid,
    early_stopping_rounds=50,
    greater_is_better=True,
    verbose=5,
)

[5] [2021-07-10 10:04:45] train-result=0.5503, valid-result=0.5391 
[10] [2021-07-10 10:04:45] train-result=0.5000, valid-result=0.5250 
[15] [2021-07-10 10:04:45] train-result=0.3501, valid-result=0.3045 
[20] [2021-07-10 10:04:45] train-result=0.3975, valid-result=0.3563 
[25] [2021-07-10 10:04:45] train-result=0.4096, valid-result=0.3786 
[30] [2021-07-10 10:04:45] train-result=0.4201, valid-result=0.3836 
[35] [2021-07-10 10:04:45] train-result=0.4502, valid-result=0.3948 
[40] [2021-07-10 10:04:45] train-result=0.4957, valid-result=0.4677 
[45] [2021-07-10 10:04:45] train-result=0.6044, valid-result=0.6151 
[50] [2021-07-10 10:04:45] train-result=0.7004, valid-result=0.7374 
[55] [2021-07-10 10:04:45] train-result=0.7359, valid-result=0.7718 
[60] [2021-07-10 10:04:45] train-result=0.7606, valid-result=0.7939 
[65] [2021-07-10 10:04:45] train-result=0.7729, valid-result=0.7950 
[70] [2021-07-10 10:04:45] train-result=0.7859, valid-result=0.7977 
[75] [2021-07-10 10:04:45] train-re

In [8]:
# Fetch the current values of the model's 'W', 'b' and 'embeddings' weights
# through the model's session, one run per weight, in that order.
order1_W, order1_b, order2_W = (
    fm.sess.run(fm.weights[key]) for key in ('W', 'b', 'embeddings')
)

# The file holds four consecutive pickle records; a reader must call
# pickle.load four times in this same order:
#   1. order1_W  2. order1_b  3. order2_W
#   4. (numeric_vars, category_vars, ignore_vars)
with open('../stores/fm.model', 'wb') as f:
    for record in (order1_W, order1_b, order2_W, (numeric_vars, category_vars, ignore_vars)):
        pickle.dump(record, f)