# 读取数据集

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training split and the test split from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_set.csv', './data/test_set.csv')
)

In [3]:
# Preview the first rows of the training frame (columns: id, article, word_seg, class).
train.head()

Unnamed: 0,id,article,word_seg,class
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...,14
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...,3
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...,12
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...,13
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...,12


# TFIDF构建文本特征

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectorizer applied to the whitespace-separated token ids in `word_seg`.
word_vec = TfidfVectorizer(
    analyzer='word',      # tokenize at word level
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=3,             # drop terms that occur in fewer than 3 documents
    max_df=0.9,           # drop terms that occur in more than 90% of documents
    use_idf=True,         # weight term frequencies by inverse document frequency
    smooth_idf=True,      # add-one smoothing of document frequencies
    sublinear_tf=True,    # use 1 + log(tf) instead of raw tf
)

In [6]:
# Fit the vectorizer on the training text and build the training TF-IDF matrix.
train_term_doc = word_vec.fit_transform(train['word_seg'])

In [7]:
# Vectorize the test text with the vectorizer already fitted on train (no refit).
test_term_doc = word_vec.transform(test['word_seg'])

In [8]:
# Number of rows (training documents) in the TF-IDF matrix.
train_term_doc.shape[0]

102277

# 构建交叉验证模型

## label转化

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# Encode the values of train['class'] as integer labels stored in train['label'].
# `lb` is kept so predictions can be mapped back with lb.inverse_transform later.
lb = LabelEncoder()
train['label'] = lb.fit_transform(train['class'].tolist())

In [None]:
# Scratch cell (shown as not executed, `In [None]`): echoes the three inputs
# used by the cross-validation below.
train_term_doc,test_term_doc,train['label']

## 定义交叉验证函数

In [12]:
from sklearn.model_selection import KFold

In [13]:
#10折cv

In [14]:
# 10-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=666)

In [15]:
#[102277,19]*10
#

In [16]:
# Derive the buffer dimensions from the objects that define them instead of
# repeating literal 19 / 10, so the buffers stay in sync if the label set or
# the number of folds changes.
n_folds = kf.get_n_splits()     # number of CV folds configured on `kf`
n_classes = len(lb.classes_)    # number of distinct classes seen by the encoder

# Out-of-fold class probabilities for every training row; each fold fills in
# the rows of its own validation split.
train_matrix = np.zeros((train.shape[0], n_classes))

# Test-set class probabilities predicted by each fold's model, stored per fold
# so they can be averaged afterwards.
test_pre_matrix = np.zeros((n_folds, test.shape[0], n_classes))

# Offline validation score of each fold.
cv_scores = []

In [17]:
# Sanity check of the two result buffers' shapes.
test_pre_matrix.shape,train_matrix.shape

((10, 102277, 19), (102277, 19))

In [18]:
from sklearn.metrics import f1_score
def cal_macro_f1(y_true,y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`.

    Both arguments are forwarded unchanged to `f1_score` with
    ``average='macro'``.
    """
    return f1_score(y_true, y_pred, average='macro')

In [19]:
from sklearn.linear_model import LogisticRegression


# K-fold cross-validation: for every fold, fit a logistic regression on the
# training split, store out-of-fold probabilities for the validation split,
# record the fold's macro-F1, and store that model's test-set probabilities.

# Start from an empty score list so re-running this cell does not append a
# second set of fold scores to the first.
cv_scores.clear()

for i, (train_index, eval_index) in enumerate(kf.split(train_term_doc)):
    print(len(train_index), len(eval_index))

    # Training split. KFold yields positional row indices, so labels are
    # selected with .iloc rather than label-based Series[...] lookup.
    X_train = train_term_doc[train_index]
    y_train = train['label'].iloc[train_index]

    # Validation split.
    X_eval = train_term_doc[eval_index]
    y_eval = train['label'].iloc[eval_index]

    # The dual formulation is only implemented by the liblinear solver (L2
    # penalty), so the solver is named explicitly: scikit-learn's current
    # default solver (lbfgs) raises on dual=True.
    model = LogisticRegression(C=4, dual=True, solver='liblinear')
    model.fit(X_train, y_train)

    # Out-of-fold probabilities for the validation rows. predict_proba already
    # returns an (n_samples, n_classes) array, so no reshape is needed.
    eval_prob = model.predict_proba(X_eval)
    train_matrix[eval_index] = eval_prob

    # Column index is used directly as the encoded label; this assumes every
    # class is present in each training fold -- TODO confirm for small classes.
    eval_pred = np.argmax(eval_prob, axis=1)
    eval_pred = lb.inverse_transform(eval_pred)
    score = cal_macro_f1(lb.inverse_transform(y_eval), eval_pred)
    cv_scores.append(score)
    print("validation score is", score)

    # Test-set probabilities from this fold's model.
    test_prob = model.predict_proba(test_term_doc)
    test_pre_matrix[i, :, :] = test_prob




validation score is 0.7613543467100502
validation score is 0.7768677667455569
validation score is 0.7728598168109526
validation score is 0.7755113436928625
validation score is 0.7760066478306171
validation score is 0.7744242858674689
validation score is 0.7728841713152601
validation score is 0.7713283142117211
validation score is 0.7683025318292591
validation score is 0.7718469745843946


In [26]:
# Shape of the out-of-fold probability matrix.
train_matrix.shape

(102277, 19)

In [27]:
# Macro-F1 over all out-of-fold predictions: take the most probable class per
# row, map it back to the original class ids, and compare with the true classes.
all_pred = lb.inverse_transform(np.argmax(train_matrix, axis=1))
score = cal_macro_f1(
    lb.inverse_transform(train['label']),
    all_pred,
)
print("all validation score is", score)

all validation score is 0.7722242367841076


In [28]:
# Shape of the per-fold test probability buffer.
test_pre_matrix.shape

(10, 102277, 19)

# 提交结果

In [29]:
# Average the test probabilities over the fold axis (axis 0).
test_pred = test_pre_matrix.mean(axis=0)

In [31]:
# Shape of the fold-averaged test probabilities.
test_pred.shape

(102277, 19)

In [32]:

# Build the submission file from the fold-averaged test probabilities.
# The average is recomputed from test_pre_matrix here rather than read from
# `test_pred`, because this cell rebinds `test_pred` to the decoded labels;
# reading it back would make a second run of the cell fail on a 1-D array.
mean_test_prob = test_pre_matrix.mean(axis=0)

# Most probable class per test row, mapped back to the original class ids.
test_pred = lb.inverse_transform(np.argmax(mean_test_prob, axis=1))
test['class'] = test_pred
test[["id","class"]].to_csv("submission_baseline_cv.csv",index=False,header=True,encoding='utf-8')