In [20]:
import pandas as pd
import numpy as np
import time
import jieba
# 导入所需模块
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer,TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from xgboost.sklearn import XGBClassifier
from tqdm import tqdm

# 读取训练测试集

In [53]:
def read_all_data():
    train_data = pd.read_csv("./input/Train_DataSet.csv")
    test_data = pd.read_csv("./input/Test_DataSet.csv")
    return train_data,test_data
    
def read_title_data():
    train_x = pd.read_csv("./input/data/Train_DataSet_title_content.csv")
    train_data = train_x.drop(["content"],axis = 1)
    train_data = train_data.rename(columns={"title":"text"})
    #train_data.dropna(axis=0,inplace = True)
    print(train_data.info())
    
    test_data = pd.read_csv("./input/data/Test_DataSet.csv")
    test_data.drop(["content"],axis=1,inplace = True)
    test_data = test_data.rename(columns={"title":"text"})
    print(test_data.info())
    return train_data,test_data


train_data,test_data = read_title_data()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7355 entries, 0 to 7354
Data columns (total 3 columns):
id       7355 non-null object
text     7354 non-null object
label    7355 non-null int64
dtypes: int64(1), object(2)
memory usage: 172.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7356 entries, 0 to 7355
Data columns (total 2 columns):
id      7356 non-null object
text    7356 non-null object
dtypes: object(2)
memory usage: 115.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7355 entries, 0 to 7354
Data columns (total 3 columns):
id       7355 non-null object
text     7354 non-null object
label    7355 non-null int64
dtypes: int64(1), object(2)
memory usage: 172.5+ KB


In [54]:
train_data.head()

Unnamed: 0,id,text,label
0,7a3dd79f90ee419da87190cff60f7a86,问责领导(上黄镇党委书记张涛，宣国才真能一手遮天吗？),2
1,7640a5589bc7486ca199eeeb38af79dd,江歌事件:教会孩子，善良的同时更要懂得保护自己!,1
2,8c5bda93e4ba401f90a0faa5b28fe57f,"绝味鸭脖广告""开黄腔""引众怒""双11""这么拼值吗?",2
3,1aa777fed31a4b8a9d866f05b5477557,央视曝光!如东一医药企业将槽罐车改成垃圾车，夜间偷排高浓度废水,2
4,6c67ac55360340258e157f3710ebae6c,恶劣至极，央视都曝光了!南通如东一医药企业将槽罐车改成洒水车，夜间偷排高浓度废水丢大发了!,2


### 分词

In [55]:
# jieba进行分词
def train_word_cut(df_train):
    return " ".join(jieba.cut(df_train))

def test_word_cut(df_test):
    return " ".join(jieba.cut(df_test))

def get_custom_stopwords(stop_words_file):
    with open(stop_words_file, 'r', encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

stop_words_file = "./input/stop_word/stopwordsHIT.txt"
stopwords = get_custom_stopwords(stop_words_file)

In [56]:
import warnings
warnings.filterwarnings('ignore')
# 训练数据分词
train_data["text"] = train_data["text"].astype(str)
train_data["text"] = train_data.text.apply(train_word_cut)
# 测试数据分词
test_data["text"] = test_data["text"].astype(str)
test_data["text"] = test_data.text.apply(test_word_cut)
test_data.text.head() # 查看测试集分词前几行

0                               九江 办好 人民满意 教育
1    中央 第三 生态 环境保护 督察组 转办 我市 第三十 一批 信访件 办理 情况
2                     大雨天 车 被淹 ， 保险公司 该不该 赔 ？
3              英特尔 新 cpu 微 架构 ocean   cove 曝光
4      公安部 侦破 一批 重大 网络 赌博 案件   德州 约局 平台 成 重灾区
Name: text, dtype: object

In [57]:
# 合并训练测试数据
train_test = pd.concat([train_data,test_data],ignore_index=True)
train_test.head()

Unnamed: 0,id,label,text
0,7a3dd79f90ee419da87190cff60f7a86,2.0,问责 领导 ( 上 黄镇 党委书记 张涛 ， 宣国 才 真能 一手遮天 吗 ？ )
1,7640a5589bc7486ca199eeeb38af79dd,1.0,江歌 事件 : 教会 孩子 ， 善良 的 同时 更要 懂得 保护 自己 !
2,8c5bda93e4ba401f90a0faa5b28fe57f,2.0,"绝味 鸭 脖 广告 "" 开 黄腔 "" 引 众怒 "" 双 11 "" 这么 拼值 吗 ?"
3,1aa777fed31a4b8a9d866f05b5477557,2.0,央视 曝光 ! 如东 一 医药企业 将 槽罐车 改成 垃圾车 ， 夜间 偷排 高浓度 废水
4,6c67ac55360340258e157f3710ebae6c,2.0,恶劣 至极 ， 央视 都 曝光 了 ! 南通 如东 一 医药企业 将 槽罐车 改成 洒水车 ...


In [58]:
# tifldf特征提取
train_shape = train_data.shape
max_df = 1.0
min_df = 1
print("TfidfVectorizer")
tf = TfidfVectorizer(ngram_range=(1,2),
                     analyzer='char',
                     token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                    #stop_words=stopwords
                    )
tf_feat = tf.fit_transform(train_test['text'].values)
print('HashingVectorizer')
ha = HashingVectorizer(ngram_range=(1,1),
                       analyzer='char',
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                       stop_words=stopwords
                      )
hash_feat = ha.fit_transform(train_test['text'].values)
data = hstack((tf_feat,hash_feat)).tocsr()#hash_feat,
data

TfidfVectorizer
HashingVectorizer


<14711x1092573 sparse matrix of type '<class 'numpy.float64'>'
	with 1132917 stored elements in Compressed Sparse Row format>

In [66]:
def make_lr():
    clf_lr = LogisticRegression(C=3,class_weight="balanced",penalty="l2",n_jobs=-1,solver="sag")
    return clf_lr

In [67]:
N=10
tf_feat = data
X = tf_feat[:train_shape[0]] # 训练集
y = train_test['label'][:train_shape[0]] # 训练集
test = tf_feat[train_shape[0]:] # 测试集
print(X.shape,y.shape,test.shape)
kf = StratifiedKFold(n_splits=N,random_state=42,shuffle=True)
oof = np.zeros((X.shape[0],3))
oof_test = np.zeros((test.shape[0],3))
for j,(train_in,test_in) in tqdm(enumerate(kf.split(X,y))):
    print('running',j)
    X_train,X_test,y_train,y_test = X[train_in],X[test_in],y[train_in],y[test_in]
    print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
    clf = make_lr().fit(X_train,y_train)
    test_y = clf.predict_proba(X_test)
    oof[test_in] = test_y
    oof_test = oof_test + clf.predict_proba(test)
xx_cv = f1_score(y,np.argmax(oof,axis=1),average='macro')
print(xx_cv)

(7355, 1092573) (7355,) (7356, 1092573)


0it [00:00, ?it/s]

running 0
(6618, 1092573) (6618,) (737, 1092573) (737,)


1it [00:04,  4.16s/it]

running 1
(6618, 1092573) (6618,) (737, 1092573) (737,)


2it [00:08,  4.09s/it]

running 2
(6619, 1092573) (6619,) (736, 1092573) (736,)


3it [00:12,  4.05s/it]

running 3
(6619, 1092573) (6619,) (736, 1092573) (736,)


4it [00:15,  3.96s/it]

running 4
(6620, 1092573) (6620,) (735, 1092573) (735,)


5it [00:19,  3.93s/it]

running 5
(6620, 1092573) (6620,) (735, 1092573) (735,)


6it [00:23,  3.90s/it]

running 6
(6620, 1092573) (6620,) (735, 1092573) (735,)


7it [00:27,  3.82s/it]

running 7
(6620, 1092573) (6620,) (735, 1092573) (735,)


8it [00:30,  3.74s/it]

running 8
(6620, 1092573) (6620,) (735, 1092573) (735,)


9it [00:34,  3.74s/it]

running 9
(6621, 1092573) (6621,) (734, 1092573) (734,)


10it [00:38,  3.78s/it]


0.7113615899930249


In [68]:
np.save("./pred_lr_tfidf_71.txt",oof_test)
result = pd.DataFrame()
result['id'] = test_data['id']
result['label'] = np.argmax(oof_test,axis=1)
print('finish')
#result[['id','label']].to_csv('./submit/base_lr_tfidf_hash_{}_830_3.csv'.format(str(np.mean(xx_cv)).split('.')[1]),index=False)

finish


In [73]:
np.savetxt("./pred_lr_tfidf_71.txt",oof_test)

In [74]:
lr_pred=np.loadtxt('./pred_data.txt')
bert_pred = np.loadtxt("./pred_lr_tfidf_71.txt")
bert_pred

array([[4.56444406, 5.14755844, 0.28799751],
       [2.98541996, 6.27019742, 0.74438262],
       [0.02445452, 5.5291814 , 4.44636409],
       ...,
       [2.04675857, 7.85848834, 0.09475309],
       [3.95136662, 5.69428011, 0.35435327],
       [0.86014997, 5.0085378 , 4.13131223]])

In [77]:
result = pd.DataFrame()
result['id'] = test_data['id']
result['label'] = np.argmax(lr_pred+bert_pred,axis=1)
print('finish')
result[['id','label']].to_csv('./submit/base_lrtfidfhash__bert{}_831_3.csv'.format(str(np.mean(xx_cv)).split('.')[1]),index=False)

finish
