# 特征工程

In [16]:
import pandas as pd
# Load the sampled train / test splits.
train, test = (
    pd.read_csv(path)
    for path in ('preprocess/train_sample.csv', 'preprocess/test_sample.csv')
)
# Ground-truth labels for the test sample, kept as a plain array.
truth_test = pd.read_csv('preprocess/test_truth_sample.csv')['label'].values

# Columns that are later vectorized as text (CountVectorizer / TF-IDF).
text_features = [
    'appIdAction', 'appIdInstall', 'ct',
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'marriageStatus', 'os',
    'topic1', 'topic2', 'topic3',
]
# Columns that are treated as categorical (counted, label-encoded, one-hot encoded).
cat_features = [
    'LBS', 'adCategoryId', 'advertiserId', 'age', 'campaignId',
    'carrier', 'consumptionAbility', 'creativeId', 'creativeSize', 'education',
    'gender', 'house', 'productId', 'productType',
]

# Dense engineered features are accumulated column by column in these frames.
train_df, test_df = pd.DataFrame(), pd.DataFrame()

# 1.通用特征

In [17]:
# exposure
## One-dimensional exposure: how often each category value occurs in train + test combined.
for feature in cat_features:
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    se = pd.concat([train[feature], test[feature]]).value_counts()
    # Values with no count (value_counts skips NaN by default) map to NaN and are stored as 0.
    train_df['exposure_' + feature] = train[feature].map(se).fillna(0).astype(int).values
    test_df['exposure_' + feature] = test[feature].map(se).fillna(0).astype(int).values
    
## Two-dimensional exposure: how often each pair of category values co-occurs in train + test.
n = len(cat_features)
# Build the combined frame once instead of on every iteration (it does not change inside the
# loop). DataFrame.append was removed in pandas 2.0, so pd.concat is used.
exposure_source = pd.concat([train[cat_features], test[cat_features]])
for i in range(n-1):
    for j in range(i+1, n):
        col_name = "exposure_"+cat_features[i]+"_and_"+cat_features[j]
        cols = [cat_features[i],cat_features[j]]
        # One row per observed (value_i, value_j) pair with its row count.
        stat = exposure_source.groupby(cols).size().reset_index()
        stat.columns = cols + [col_name]

        # Left merge keeps the row order of the left frame; pairs without a count become 0.
        train_df[col_name] = pd.merge(train[cols],stat,how='left',on=cols)[col_name].fillna(0).astype(int).values
        test_df[col_name] = pd.merge(test[cols],stat,how='left',on=cols)[col_name].fillna(0).astype(int).values
# nunique: for every ordered pair (i, j), the number of distinct values of feature j that
# occur together with each value of feature i in train + test.
# The combined frame is loop-invariant, so it is built once. DataFrame.append and
# Series.sum(level=...) were both removed in pandas 2.0; groupby(...).nunique() yields the
# same per-group distinct counts directly.
nunique_source = pd.concat([train[cat_features], test[cat_features]])
for i in range(n):
    for j in range(n):
        if i!=j:
            col_name = "nunique_"+cat_features[j]+"_in_"+cat_features[i]
            se = nunique_source.groupby(cat_features[i])[cat_features[j]].nunique()

            # Values of feature i without a group entry map to NaN and are stored as 0.
            train_df[col_name] = (train[cat_features[i]].map(se)).fillna(value=0).values
            test_df[col_name] = (test[cat_features[i]].map(se)).fillna(value=0).values

# ratio: joint exposure of a feature pair divided by the single-feature exposure of each
# member, giving two columns per unordered pair.
for i in range(n - 1):
    for j in range(i + 1, n):
        feat_i, feat_j = cat_features[i], cat_features[j]
        joint_col = "exposure_" + feat_i + "_and_" + feat_j

        # First the share of i within j, then the share of j within i.
        for inner, outer in ((feat_i, feat_j), (feat_j, feat_i)):
            denom_col = "exposure_" + outer
            ratio_col = "ratio_exposure_" + inner + "_in_" + outer
            train_df[ratio_col] = train_df[joint_col] / train_df[denom_col]
            test_df[ratio_col] = test_df[joint_col] / test_df[denom_col]

# 2.业务特征

In [18]:
# ctr (a more advanced version could add Bayesian smoothing)
n_parts = 5
# Fold id of each training row: its index label modulo n_parts.
train['part'] = train.index.to_numpy() % n_parts

## First-order CTR: mean label per category value, computed out-of-fold for the training rows
## and averaged over the folds for the test rows.
for co in cat_features:
    col_name = 'ctr_of_'+co
    fold_ctrs = []  # one Series per fold, indexed by the original train index
    ctr_test = pd.Series(0.0, index=test.index.tolist())
    for i in range(n_parts):
        # CTR table estimated on every fold except i ...
        se = train[train['part']!=i].groupby(co)['label'].mean()
        # ... applied to the rows of fold i, so a row never sees its own label.
        fold_ctrs.append(train[train['part']==i][co].map(se))
        # A value missing from any single fold table makes the running sum NaN (-1 below).
        ctr_test += test[co].map(se)

    # Series.append was removed in pandas 2.0; concatenate the folds once instead.
    ctr_train = pd.concat(fold_ctrs)
    train_df[col_name] = ctr_train.sort_index().fillna(-1).values
    # Average over the actual number of folds rather than a hard-coded 5.
    test_df[col_name] = (ctr_test/n_parts).fillna(-1).values

## Second-order CTR: same out-of-fold scheme, keyed on every unordered pair of categorical
## features.
for i in range(n-1):
    for j in range(i+1, n):
        col_name = 'ctr_of_'+cat_features[i]+"_and_"+cat_features[j]
        cols = [cat_features[i], cat_features[j]]
        fold_ctrs = []  # one Series per fold, indexed by the original train index
        ctr_test = pd.Series(0.0, index=test.index.tolist())

        for k in range(n_parts):
            in_fold = train['part']==k
            # CTR table estimated on every fold except k.
            stat = train[train['part']!=k].groupby(cols)['label'].mean().reset_index()
            stat.columns = cols + [col_name]

            held_out = train.loc[in_fold, cols]
            # The left merge keeps held_out's row order but resets the index, so the
            # original index is re-attached before the folds are combined.
            se = pd.merge(held_out,stat,how='left',on=cols)[col_name]
            fold_ctrs.append(pd.Series(se.values, index=held_out.index.tolist()))
            # Add positionally: the merge result carries a fresh 0..m-1 index, which only
            # lines up with test.index by label when test has a default index.
            # A pair missing from any single fold table makes the running sum NaN (-1 below).
            ctr_test += pd.merge(test[cols],stat,how='left',on=cols)[col_name].values

        # Series.append was removed in pandas 2.0; concatenate the folds once instead.
        ctr_train = pd.concat(fold_ctrs)
        train_df[col_name] = ctr_train.sort_index().fillna(-1).values
        # Average over the actual number of folds rather than a hard-coded 5.
        test_df[col_name] = (ctr_test/n_parts).fillna(-1).values

In [19]:
# Inspect (rows, columns) of the dense training feature frame built so far.
train_df.shape

(87988, 574)

In [20]:
# Inspect (rows, columns) of the dense test feature frame built so far.
test_df.shape

(22660, 574)

# 3.文本特征

In [27]:
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
# Placeholders for the train / test sparse feature matrices that the encoders in this cell fill in.
train_sp, test_sp = pd.DataFrame(), pd.DataFrame()

def change_stop_words(se):
    """Return a new Series in which every element of `se` is prefixed with ``'stop '``.

    The index and name of `se` are carried over and `se` itself is not modified.
    Elements are combined with ``'stop ' + element``, so each one has to be a string.
    """
    def _with_prefix(text):
        return 'stop ' + text

    return se.map(_with_prefix)


# stop words: prefix every text column of both splits with the 'stop' token.
# This overwrites the columns in place, so re-running the cell prefixes them again.
for feature in text_features:
    for frame in (train, test):
        frame[feature] = change_stop_words(frame[feature])

# labelencoder: integer-encode each categorical column with one encoder fitted on the
# values of train and test together, so both splits share the same code for a value.
lab = LabelEncoder()
for feature in cat_features:
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    lab.fit(pd.concat([train[feature], test[feature]]))

    train_df['labelencoder_'+feature] = lab.transform(train[feature])
    test_df['labelencoder_'+feature] = lab.transform(test[feature])

# onehot: one sparse indicator block per categorical column, fitted on train + test values.
ohe = OneHotEncoder()
train_blocks, test_blocks = [], []
for feature in cat_features:
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    ohe.fit(pd.concat([train[feature], test[feature]]).values.reshape(-1, 1))

    train_blocks.append(ohe.transform(train[feature].values.reshape(-1, 1)))
    test_blocks.append(ohe.transform(test[feature].values.reshape(-1, 1)))

# Stack once instead of re-copying the growing matrix on every iteration. This (re)builds
# train_sp / test_sp from the one-hot blocks alone, so the result does not depend on their
# previous contents.
train_sp = sparse.hstack(train_blocks)
test_sp = sparse.hstack(test_blocks)

# countvectorizer: token counts per text column, vocabulary fitted on train + test.
cntv=CountVectorizer()
train_blocks, test_blocks = [train_sp], [test_sp]  # keep the columns stacked so far
for feature in text_features:
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    cntv.fit(pd.concat([train[feature], test[feature]]))

    train_blocks.append(cntv.transform(train[feature]))
    test_blocks.append(cntv.transform(test[feature]))

# Stack once instead of re-copying the growing matrix on every iteration.
train_sp = sparse.hstack(train_blocks)
test_sp = sparse.hstack(test_blocks)

# tf-idf: TF-IDF weights per text column, vocabulary and IDF fitted on train + test.
tfd = TfidfVectorizer()
train_blocks, test_blocks = [train_sp], [test_sp]  # keep the columns stacked so far
for feature in text_features:
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    tfd.fit(pd.concat([train[feature], test[feature]]))

    train_blocks.append(tfd.transform(train[feature]))
    test_blocks.append(tfd.transform(test[feature]))

# Stack once instead of re-copying the growing matrix on every iteration.
train_sp = sparse.hstack(train_blocks)
test_sp = sparse.hstack(test_blocks)

In [28]:
# Inspect (rows, columns) of the stacked sparse training matrix.
train_sp.shape

(87988, 250053)

In [29]:
# Inspect (rows, columns) of the stacked sparse test matrix.
test_sp.shape

(22660, 250053)

# 4.特征降维

In [30]:
# Dimensionality reduction of the sparse features with TruncatedSVD

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix
# NOTE(review): sparse_random and sparse_random_matrix are not used in this cell; newer
# scikit-learn releases appear to no longer export sparse_random_matrix — confirm this
# import still resolves in the target environment.

n_svd_components = 100
svd = TruncatedSVD(n_components=n_svd_components, n_iter=50, random_state=2020)

# Fit on the train and test rows together so both splits are projected onto the same components.
stacked_sp = sparse.vstack((train_sp, test_sp))
svd.fit(stacked_sp)

cols = ['svd_' + str(k) for k in range(n_svd_components)]

train_svd = pd.DataFrame(svd.transform(train_sp), columns=cols)
test_svd = pd.DataFrame(svd.transform(test_sp), columns=cols)

# 5.存储特征

In [31]:
# Persist the stacked sparse matrices as .npz files.
for path, matrix in (
    ("preprocess/train_sample_sparse.npz", train_sp),
    ("preprocess/test_sample_sparse.npz", test_sp),
):
    sparse.save_npz(path, matrix)

# Persist the dense feature frames and the SVD projections as CSV, without the index column.
for path, frame in (
    ("preprocess/train_sample_feature.csv", train_df),
    ("preprocess/test_sample_feature.csv", test_df),
    ("preprocess/train_sample_svd.csv", train_svd),
    ("preprocess/test_sample_svd.csv", test_svd),
):
    frame.to_csv(path, index=False)