# 基于机器学习的文本分类
## 学习目标
- 掌握 TF-IDF 的原理和使用方法
- 使用 sklearn 的机器学习模型完成文本分类

**TF-IDF**

TF-IDF 分数由两部分组成：第一部分是词语频率（Term Frequency，TF），第二部分是逆文档频率（Inverse Document Frequency，IDF）。逆文档频率的计算方法是：用语料库中的文档总数除以含有该词语的文档数量，再对商取对数。

TF(t)= 该词语在当前文档出现的次数 / 当前文档中词语的总数

IDF(t)= log_e（文档总数 / 出现该词语的文档总数）

TF-IDF(t)= TF(t) × IDF(t)

In [13]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier  # XGBoost分类器

In [14]:
# Data loading: both files are tab-separated.
# nrows=None reads every training row; pass an integer to load only a subset.
test_df = pd.read_csv('./data/test_a.csv', sep='\t')
train_df = pd.read_csv(
    './data/train_set.csv',
    sep='\t',
    nrows=None,
)

In [None]:
# Word-level TF-IDF over 1- to 3-grams, capped at 10000 features.
# The vocabulary and idf weights are learned from the training and test texts
# together (fit returns the vectorizer itself, so the call can be chained).
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1,3),
    max_features=10000,
).fit(pd.concat([train_df['text'], test_df['text']]))

# Project each split onto the fitted vocabulary to get its feature matrix.
train_word_features, test_word_features = (
    tfidf.transform(frame['text']) for frame in (train_df, test_df)
)

In [None]:
# Short aliases used by the model cells: feature matrices and training labels.
X_train, X_test = train_word_features, test_word_features
y_train = train_df['label']

## SVM

In [None]:
# 10-fold cross-validation with LinearSVC. Each fold's model also predicts the
# test set, and the final test label is the majority vote over the folds.
#
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise ValueError when random_state is set without it.
KF = KFold(n_splits=10, shuffle=True, random_state=7)
clf = LinearSVC()
# KFold yields positional indices, so index a plain array rather than relying
# on the Series index matching row positions.
y_all = np.asarray(y_train)
# One vector of test-set predictions per fold. Collecting them in a list avoids
# seeding the vote matrix with an all-zero column, which would count as an
# extra vote for class 0 in every row.
fold_test_preds = []
for KF_index, (train_index, valid_index) in enumerate(KF.split(X_train)):
    print('第', KF_index+1, '折交叉验证开始...')
    # Split the training data for this fold.
    x_train_, x_valid_ = X_train[train_index], X_train[valid_index]
    y_train_, y_valid_ = y_all[train_index], y_all[valid_index]
    # Fit on the training part of the fold.
    clf.fit(x_train_, y_train_)
    # Score on the held-out part; the reported metric is macro-averaged F1.
    val_pred = clf.predict(x_valid_)
    print("LinearSVC macro-F1为：", f1_score(y_valid_, val_pred, average='macro'))
    # Keep this fold's test-set predictions for the vote.
    fold_test_preds.append(clf.predict(X_test))
# Vote matrix: one row per test sample, one column per fold.
test_pred = np.column_stack(fold_test_preds)
# Majority vote per row; ties resolve to the smallest label.
# np.bincount assumes the labels are non-negative integers.
preds = np.array([np.bincount(fold_votes).argmax() for fold_votes in test_pred])

In [None]:
# Write the LinearSVC predictions into the label column of the sample
# submission file and save the result.
# to_csv does not create missing directories, so make sure ./output exists;
# otherwise the write fails after the whole cross-validation run.
os.makedirs('./output', exist_ok=True)
submission = pd.read_csv('./data/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv('./output/TF-IDF+LinearSVC_submission.csv', index=False)

## xgboost

In [None]:
# 10-fold cross-validation with XGBoost. Each fold's model also predicts the
# test set, and the final test label is the majority vote over the folds.
#
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise ValueError when random_state is set without it.
KF = KFold(n_splits=10, shuffle=True, random_state=7)
# This is a classification task, so use the classifier (the only XGBoost
# estimator this file imports). A regressor would return continuous values
# that neither f1_score nor np.bincount accept.
# NOTE(review): recent XGBoost versions expect labels encoded as 0..K-1;
# confirm the label column satisfies this.
xg = XGBClassifier()
# KFold yields positional indices, so index a plain array rather than relying
# on the Series index matching row positions.
y_all = np.asarray(y_train)
# One vector of test-set predictions per fold. Collecting them in a list avoids
# seeding the vote matrix with an all-zero column, which would count as an
# extra vote for class 0 in every row.
fold_test_preds = []
for KF_index, (train_index, valid_index) in enumerate(KF.split(X_train)):
    print('第', KF_index+1, '折交叉验证开始...')
    # Split the training data for this fold.
    x_train_, x_valid_ = X_train[train_index], X_train[valid_index]
    y_train_, y_valid_ = y_all[train_index], y_all[valid_index]
    # Fit on the training part of the fold.
    xg.fit(x_train_, y_train_)
    # Score on the held-out part; the reported metric is macro-averaged F1.
    val_pred = xg.predict(x_valid_)
    print("XGBoost macro-F1为：", f1_score(y_valid_, val_pred, average='macro'))
    # Keep this fold's test-set predictions; cast to int for np.bincount.
    fold_test_preds.append(np.asarray(xg.predict(X_test)).astype(int))
# Vote matrix: one row per test sample, one column per fold.
test_pred = np.column_stack(fold_test_preds)
# Majority vote per row; ties resolve to the smallest label.
preds = np.array([np.bincount(fold_votes).argmax() for fold_votes in test_pred])

In [None]:
# Write the XGBoost predictions into the label column of the sample
# submission file and save the result.
# to_csv does not create missing directories, so make sure ./output exists;
# otherwise the write fails after the whole cross-validation run.
os.makedirs('./output', exist_ok=True)
submission = pd.read_csv('./data/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv('./output/TF-IDF+XGBoost_submission.csv', index=False)