In [2]:
from pathlib import Path

import jieba
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold


## 加载数据集

In [3]:
# Read the training set and the test set from the local data directory.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,tweet_id,content,label
0,tweet_0,@tiffanylue i know i was listenin to bad habi...,0
1,tweet_1,Layin n bed with a headache ughhhh...waitin o...,1
2,tweet_2,Funeral ceremony...gloomy friday...,1
3,tweet_3,wants to hang out with friends SOON!,2
4,tweet_4,@dannycastillo We want to trade with someone w...,3


In [5]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,tweet_id,content
0,tweet_0,Re-pinging @ghostridah14: why didn't you go to...
1,tweet_1,@kelcouch I'm sorry at least it's Friday?
2,tweet_2,The storm is here and the electricity is gone
3,tweet_3,So sleepy again and it's not even that late. I...
4,tweet_4,"Wondering why I'm awake at 7am,writing a new s..."


In [6]:
## train

## 思路1 文本分类

基于文本的分类模型



In [6]:
# TF-IDF features over word unigrams and bigrams with L2-normalised rows.
# The token pattern matches runs of one or more word characters, so
# single-character tokens are kept.
tfidf_options = dict(
    max_features=80000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.96,
    strip_accents='unicode',
    norm='l2',
    token_pattern=r"(?u)\b\w+\b",
)
vec = TfidfVectorizer(**tfidf_options)

In [7]:
# Fit the vectorizer on the train and test texts stacked into one series.
all_texts = pd.concat([train['content'], test['content']], axis=0)
vec.fit(all_texts)

TfidfVectorizer(max_df=0.96, max_features=80000, min_df=2, ngram_range=(1, 2),
                strip_accents='unicode', token_pattern='(?u)\\b\\w+\\b')

In [9]:
# Vectorize the training texts with the fitted vectorizer.
X_train=vec.transform(train['content'])
# Show the matrix dimensions as the cell output.
X_train.shape

(30000, 57594)

In [11]:
# Vectorize the test texts with the same fitted vectorizer.
X_test=vec.transform(test['content'])
# Show the matrix dimensions as the cell output.
X_test.shape

(10000, 57594)

In [14]:
# Target vector: the `label` column of the training frame cast to int.
y_train=train['label'].astype(int)
y_train

0        0
1        1
2        1
3        2
4        3
        ..
29995    3
29996    3
29997    3
29998    6
29999    9
Name: label, Length: 30000, dtype: int64

## 训练模型

In [16]:
%%time
params = {
          "objective" : "multiclass",
          "num_class" : 13,
          "num_leaves" : 60,
          "max_depth": -1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 5,        # subsample_freq
          "bagging_seed" : 2018,
          "verbosity" : -1,
          'num_threads':8,# 进程数 根据机器资源调整
}

 
# 五折交叉验证
folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=2019)
oof = np.zeros([len(train),13])
predictions = np.zeros([len(test),13])
 
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
 
    num_round = 1000
    clf = lgb.train(params, 
                    trn_data, 
                    num_round, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 100, 
                    early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)    
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    #print(predictions)



fold n°1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.68632	valid_1's multi_logloss: 2.02506
[200]	training's multi_logloss: 1.475	valid_1's multi_logloss: 2.01967
Early stopping, best iteration is:
[159]	training's multi_logloss: 1.55055	valid_1's multi_logloss: 2.01871
fold n°2
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.7083	valid_1's multi_logloss: 1.96216
[200]	training's multi_logloss: 1.50334	valid_1's multi_logloss: 1.9313
[300]	training's multi_logloss: 1.35937	valid_1's multi_logloss: 1.92867
Early stopping, best iteration is:
[258]	training's multi_logloss: 1.41471	valid_1's multi_logloss: 1.92738
fold n°3
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.71918	valid_1's multi_logloss: 1.93128
[200]	training's multi_logloss: 1.51548	valid_1's multi_logloss: 1.88802
[300]	training's multi_logloss: 1.37168	valid_1's multi_logloss:

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Out-of-fold accuracy: the predicted class is the column with the highest
# value in each row of `oof`.
oof_pred_labels = oof.argmax(axis=1)
accuracy_score(y_train, oof_pred_labels)

0.3429333333333333

## 提交结果

In [20]:
# Build the submission from the sample file: fill `label` with the class that
# has the highest fold-averaged score for each test row.
sub = pd.read_csv('data/submission.csv')
sub['label'] = np.argmax(predictions, axis=1)

# `to_csv` does not create missing parent directories, so make sure the
# output folder exists before writing.
out_path = Path('result/sub.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
# Only the `label` column is written, without the index.
sub['label'].to_csv(out_path, index=False)

## 提升思路
- 使用深度学习模型（例如 word2vec + RNN/LSTM）对文本进行分类
- 在预训练模型的基础上继续训练（微调）