In [1]:
# 导入相关库
import pandas as pd
import numpy as np
import scipy.sparse as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer


# 数据读取转换
train = pd.read_csv('../应用类型识别挑战赛公开数据/train.csv')
lb = LabelEncoder()
train['label'] = lb.fit_transform(train['label'])

# 避免出现类别3只存在于验证集的情况
tmp = pd.DataFrame(np.repeat(train[train['label']==3].values, 1, axis=0))
tmp.columns = ['id', 'name', 'description', 'label']
train = pd.concat([train, tmp]).reset_index(drop=True)
train['label'] = train['label'].astype('int')

test = pd.read_csv('../应用类型识别挑战赛公开数据/test.csv')
test['id'] += 10000
data = pd.concat([train, test]).reset_index(drop=True)

# 构造name + description
data['text'] = data['name'] + data['description']
data.head()



Unnamed: 0,id,name,description,label,name_len,description_len,text
0,0,14717598 14854817 15697796 15706258 47 1478313...,14782903 14925723 14782910 14851258 14853009 1...,2.0,16,77,14717598 14854817 15697796 15706258 47 1478313...
1,1,99 126 108,100 117 106,9.0,3,3,99 126 108100 117 106
2,2,14858120 14720698 88 14790566 14859144 1570624...,14854577 14859690 14847406 14718849 14850444 1...,4.0,13,8,14858120 14720698 88 14790566 14859144 1570624...
3,3,14858412 14720387 14782100 15710881 88 110 104...,14858412 14720387 14782100 15710881 110 104 11...,6.0,10,19,14858412 14720387 14782100 15710881 88 110 104...
4,4,14783417 14719931 70 105 99 117 104 110 99,14783417 14719931 14720179 14783914,16.0,9,4,14783417 14719931 70 105 99 117 104 110 991478...


In [2]:
# tfidf
title_tfidf_vector = TfidfVectorizer().fit(
    data['name'].tolist())
desc_tfidf_vector = TfidfVectorizer().fit(
    data['description'].tolist())
total_tfidf_vector = TfidfVectorizer().fit(
    data['text'].tolist())

def create_csr_mat_input(title_list, desc_list, total_list):
    return sp.hstack((title_tfidf_vector.transform(title_list),
                      desc_tfidf_vector.transform(desc_list),
                      total_tfidf_vector.transform(total_list),
                      ),
                     format='csr')

tfidf_input = create_csr_mat_input(data['name'], data['description'], data['text'])

In [3]:
# 模型训练与预测
def train_model(X_train, X_test, features, y, seed=2021, save_model=False):

    KF = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    oof_lgb = np.zeros((X_train.shape[0], 19))
    predictions_lgb = np.zeros((X_test.shape[0], 19))

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train, y.values)):

        clf = SGDClassifier(random_state=1017, loss='log')
        clf.fit(X_train[trn_idx], y.iloc[trn_idx])
        oof_lgb[val_idx] = clf._predict_proba_lr(X_train[val_idx])
        predictions_lgb += clf._predict_proba_lr(X_test) / 5

    print("F1 score micro: {}".format(f1_score(y, np.argmax(oof_lgb, axis=1), average='micro')))
    print("F1 score macro: {}".format(f1_score(y, np.argmax(oof_lgb, axis=1), average='macro')))
    return oof_lgb, predictions_lgb

train = data[~data['label'].isna()].reset_index(drop=True)
test = data[data['label'].isna()].reset_index(drop=True)
y = train['label']

train_len = train.shape[0]
test_len = test.shape[0]
features = [i for i in train.columns if i not in ['id', 'name', 'description', 'label', 'text']]
seeds = [2021]
pred = []
for seed in seeds:
    oof_lgb, predictions_lgb = train_model(tfidf_input[:train_len], tfidf_input[train_len:], features, y, seed)
    pred.append(predictions_lgb)



F1 score micro: 0.7123809523809524
F1 score macro: 0.5445154709562947


In [4]:
# 生成提交文件
test['label'] = np.argmax(np.mean(pred, axis=0), axis=1)
test['label'] = lb.inverse_transform(test['label'])
test['id'] -= 10000
test[['id', 'label']].to_csv('sub_base.csv', index=False)
test[['id', 'label']].head()

Unnamed: 0,id,label
0,0,14786237 15697082 14722731 14924977
1,1,15630486 15702410 14718849 15709093
2,2,14847385 14844587 14848641 14847398
3,3,14924216 14781104 14717848 14791612
4,4,14794687 14782344
