# 利用sprase矩阵进行文本特征提取
## 从原始数据输入到特征文件生成

# 0.引入三方库

In [1]:
import gc
import numpy as np 
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from scipy import sparse
from sklearn.model_selection import KFold

# 1.读取所需数据

In [2]:
train = pd.read_csv('../../data/train.csv')
test =  pd.read_csv('../../data/test.csv')
merchant = pd.read_csv('../../data/merchants.csv')
new_transaction = pd.read_csv('../../data/new_merchant_transactions.csv')
history_transaction = pd.read_csv('../../data/historical_transactions.csv')
transaction = pd.concat([new_transaction, history_transaction], axis=0, ignore_index=True)
del new_transaction
del history_transaction
gc.collect()

0

# 2.做数据预处理

In [3]:
nlp_features = ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']

for co in nlp_features:
    print(co)
    transaction[co] = transaction[co].astype(str)
    temp = transaction[transaction['month_lag']>=0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_new']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction[transaction['month_lag']<0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_hist']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction.groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_all']
    train = pd.merge(train, temp, how='left', on='card_id').fillna("-1")
    test = pd.merge(test, temp, how='left', on='card_id').fillna("-1")

merchant_id
merchant_category_id
state_id
subsector_id
city_id


# 3.进行特征提取

In [5]:
train_x = pd.DataFrame()
test_x = pd.DataFrame()

cntv = CountVectorizer()

tfv = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
    
    
vector_feature =[]
for co in ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']:
    vector_feature.extend([co+'_new', co+'_hist', co+'_all'])
for feature in vector_feature:
    print(feature)
    cntv.fit(train[feature].append(test[feature]))
    train_x = sparse.hstack((train_x, cntv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, cntv.transform(test[feature]))).tocsr()
    
    tfv.fit(train[feature].append(test[feature]))
    train_x = sparse.hstack((train_x, cntv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, cntv.transform(test[feature]))).tocsr()
sparse.save_npz("../../preprocess/train_nlp.npz", train_x)
sparse.save_npz("../../preprocess/test_nlp.npz", test_x)

merchant_id_new
  (0, 671)	2.0
  (0, 7341)	1.0
  (0, 8891)	2.0
  (0, 12360)	1.0
  (0, 15244)	2.0
  (0, 17983)	1.0
  (0, 19095)	1.0
  (0, 31049)	1.0
  (0, 31873)	1.0
  (0, 33640)	1.0
  (0, 34077)	1.0
  (0, 39625)	1.0
  (0, 42059)	1.0
  (0, 69822)	1.0
  (0, 82960)	1.0
  (0, 83701)	1.0
  (0, 88937)	1.0
  (0, 90005)	1.0
  (0, 93293)	1.0
  (0, 93652)	1.0
  (0, 94751)	2.0
  (0, 106160)	1.0
  (0, 134569)	1.0
  (0, 158194)	1.0
  (0, 186111)	3.0
  :	:
  (201913, 251032)	1.0
  (201914, 27761)	1.0
  (201914, 50688)	1.0
  (201914, 56059)	1.0
  (201914, 85468)	1.0
  (201914, 117317)	1.0
  (201915, 15293)	1.0
  (201915, 43807)	1.0
  (201915, 96109)	1.0
  (201915, 116311)	2.0
  (201915, 145158)	1.0
  (201915, 157615)	1.0
  (201915, 206614)	1.0
  (201915, 232894)	1.0
  (201915, 236548)	1.0
  (201915, 253453)	1.0
  (201916, 671)	1.0
  (201916, 1552)	1.0
  (201916, 26773)	1.0
  (201916, 39951)	1.0
  (201916, 67478)	1.0
  (201916, 101685)	1.0
  (201916, 114184)	1.0
  (201916, 140417)	1.0
  (201916, 25611

In [7]:
train_x.shape

(201917, 1846286)