In [1]:
# 导入相关库
import pandas as pd
import numpy as np
import scipy.sparse as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer


# Read the training set and encode the raw labels as integer class ids.
# `lb` is kept at module level so the fitted encoder stays available afterwards.
lb = LabelEncoder()
train = pd.read_csv('./data/train.csv')
train = train.assign(label=lb.fit_transform(train['label']))

# Avoid the situation where class 3 appears only in the validation set:
# append one extra copy of every row whose encoded label is 3.
# The rows are selected directly from the frame (rather than rebuilt from
# `.values`) so that the column names and dtypes carry over unchanged and the
# `id` column does not become object-typed after the concat.
tmp = train[train['label'] == 3].copy()
train = pd.concat([train, tmp]).reset_index(drop=True)
# Keep the label column as a plain integer dtype for the downstream estimators.
train['label'] = train['label'].astype('int')

# Read the test set, offset its ids by 10000, and stack it beneath the training
# rows under a fresh 0..n-1 index.
# NOTE(review): the offset presumably keeps test ids distinct from train ids - confirm.
test = pd.read_csv('./data/test.csv')
test['id'] = test['id'] + 10000
data = pd.concat([train, test], ignore_index=True)

# Build the combined "name + description" text field.
# A single space is inserted between the two parts so that the last token of
# the name and the first token of the description remain separate tokens;
# plain concatenation fuses them into one token that occurs in neither field.
# Missing values are treated as empty strings so that one absent field does
# not turn the whole text into NaN.
data['text'] = data['name'].fillna('') + ' ' + data['description'].fillna('')

# Fit a TF-IDF vectorizer on the `name` column of the combined train + test frame.
# NOTE(review): the default token pattern only keeps tokens of two or more
# characters, so any single-character token in `name` is ignored - confirm this is intended.
title_tfidf_vector = TfidfVectorizer()
title_tfidf_vector.fit(data['name'].tolist())

In [2]:
# Display the fitted vectorizer's vocabulary: a dict mapping each token string
# to its integer column index. The whole mapping is shown, so the output is long.
title_tfidf_vector.vocabulary_

{'14717598': 85,
 '14854817': 941,
 '15697796': 1420,
 '15706258': 1536,
 '47': 1668,
 '14783134': 374,
 '14860175': 1060,
 '46': 1667,
 '88': 1703,
 '102': 2,
 '105': 4,
 '99': 1710,
 '117': 16,
 '104': 3,
 '110': 9,
 '126': 20,
 '108': 7,
 '14858120': 985,
 '14720698': 145,
 '14790566': 518,
 '14859144': 1022,
 '15706241': 1529,
 '14847916': 750,
 '75': 1690,
 '114': 13,
 '127': 21,
 '14858412': 1001,
 '14720387': 139,
 '14782100': 337,
 '15710881': 1650,
 '116': 15,
 '14859956': 1049,
 '14846135': 685,
 '14783417': 380,
 '14719931': 124,
 '70': 1686,
 '14859664': 1036,
 '14724236': 190,
 '14782910': 370,
 '14848641': 766,
 '14782384': 351,
 '14716559': 66,
 '14925240': 1143,
 '15642297': 1357,
 '100': 0,
 '115': 14,
 '107': 6,
 '96': 1707,
 '112': 11,
 '14727830': 227,
 '14728347': 241,
 '14779522': 271,
 '14716854': 74,
 '15706291': 1544,
 '14780821': 301,
 '15705739': 1525,
 '14786971': 467,
 '14859960': 1051,
 '14847900': 746,
 '77': 1692,
 '80': 1695,
 '53': 1675,
 '55': 1677,
 