In [None]:
import nltk, re

import numpy as np
import pandas as pd
import xgboost as xgb
from nltk.stem.snowball import EnglishStemmer
from sklearn import metrics, model_selection
from sklearn.metrics import f1_score, log_loss
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

# Load the training data. Later cells read the text columns
# `question_title`, `question_body` and `answer`, plus `qa_id` and the
# target column `question_type_spelling`.
train_csv_path = 'spelling_train.csv'
df = pd.read_csv(train_csv_path)


In [None]:
# Tokenize each text field with NLTK's word tokenizer.
df['title_split'] = df['question_title'].apply(nltk.word_tokenize)
df['body_split'] = df['question_body'].apply(nltk.word_tokenize)
df['answer_split'] = df['answer'].apply(nltk.word_tokenize)

# Distinct tokens per field, then across all three fields.
unique_words_title = {word for sentence in df.title_split.values for word in sentence}
unique_words_body = {word for sentence in df.body_split.values for word in sentence}
unique_words_answer = {word for sentence in df.answer_split.values for word in sentence}

unique_words = unique_words_title | unique_words_body | unique_words_answer

# The extra '' entry makes the filter below drop tokens that contain no
# lowercase letters at all (punctuation, digits), because they normalise to ''.
stopwords = nltk.corpus.stopwords.words('english') + ['']
# Same content as `stopwords`, but membership checks are O(1) instead of a
# list scan for every token in the corpus.
stopword_set = frozenset(stopwords)

stemmer = EnglishStemmer()

# Stem every distinct token once so the per-row pass is a dictionary lookup.
stemmer_dict = {u: stemmer.stem(u) for u in unique_words}

# Anything that is not a lowercase ASCII letter is stripped before the
# stop-word comparison.
non_letter_pattern = re.compile('[^a-z]+')


def stem_without_stopwords(tokens):
    """Return the stems of `tokens`, skipping stop words.

    A token is skipped when its lowercased form, with every character outside
    a-z removed, is a stop word (or is empty). Kept tokens are replaced by
    their precomputed stem from `stemmer_dict`.
    """
    return [
        stemmer_dict[token]
        for token in tokens
        if non_letter_pattern.sub('', token.lower()) not in stopword_set
    ]


df['title_stemmed'] = df['title_split'].apply(stem_without_stopwords)
df['body_stemmed'] = df['body_split'].apply(stem_without_stopwords)
df['answer_stemmed'] = df['answer_split'].apply(stem_without_stopwords)

In [None]:

def gen_word_features(df,word):
    """Add occurrence-count columns for one stemmed keyword, in place.

    For each of the title, body and answer fields, counts how many times
    `word` appears in that row's `<field>_stemmed` token list and stores it in
    `<field>_n_<word>`. The sum of the three counts goes into `n_<word>`.
    Nothing is returned; `df` is modified directly.
    """
    count_columns = []
    for field in ('title', 'body', 'answer'):
        column = f'{field}_n_{word}'
        df[column] = df[f'{field}_stemmed'].apply(lambda tokens: tokens.count(word))
        count_columns.append(column)

    title_col, body_col, answer_col = count_columns
    df[f'n_{word}'] = df[title_col] + df[body_col] + df[answer_col]

In [None]:
# Keywords to count in the stemmed title/body/answer token lists. They are
# written in truncated form, presumably to match EnglishStemmer output
# (e.g. 'definit', 'pronunci') -- verify against `stemmer_dict` if in doubt.
words = [
    'mean', 'definit',
    'syllabl', 'pronunci', 'pronounc', 'sound',
    'grammar', 'grammat',
    'noun', 'pronoun', 'verb', 'adject', 'adverb', 'preposit', 'conjunct',
]

# Adds four count columns per keyword to `df` (title, body, answer, total).
for word in words:
    gen_word_features(df, word)

In [None]:
# Target score -> class id. The scores used here are 0, 1/3 and 2/3.
classes = {
    0: 0,
    1/3: 1,
    2/3: 2
}

# Looking the scores up directly in `classes` needs bit-exact float equality
# between the parsed CSV values and 1/3 or 2/3; a value stored as e.g.
# 0.333333 would silently become NaN. Instead, match on the score expressed
# in thirds (0.0, 1.0, 2.0), accepting only values within a small tolerance
# of a whole third. Any other value still maps to NaN, as before.
class_by_thirds = {float(round(score * 3)): label for score, label in classes.items()}
target_thirds = df['question_type_spelling'] * 3
nearest_third = target_thirds.round()
nearest_third = nearest_third.where((target_thirds - nearest_third).abs() < 1e-6)

train_y = nearest_third.map(class_by_thirds)
train_id = df['qa_id'].values

# Identifier and raw-text columns that are not used as model features.
cols_to_drop = ['qa_id', 'question_title', 'question_body', 'answer', 'category', 'host']

# Intermediate token-list columns created during tokenisation and stemming.
cols_to_drop += ['title_split','body_split', 'answer_split', 'title_stemmed', 'body_stemmed', 'answer_stemmed']

# Feature matrix: everything that remains once the target is removed as well.
train_X = df.drop(cols_to_drop+['question_type_spelling'], axis=1)
# test_X = test_df.drop(cols_to_drop, axis=1)


In [None]:
# Custom F1 evaluation function for xgboost.
def f1_score_vail(pred, data_vail):
    """Report macro F1 to xgboost as a quantity to minimise.

    Parameters
    ----------
    pred : array-like
        Predictions for the evaluation set. Either one predicted class label
        per row, or a 2-D (n_samples, n_classes) score matrix, in which case
        the highest-scoring class of each row is used.
    data_vail : object exposing ``get_label()``
        Evaluation data; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('1-f1_score', 1 - macro_f1)``. The score is inverted because the
        training loop treats the reported value as something to reduce.
    """
    labels = data_vail.get_label()
    # f1_score needs class labels; collapse per-class scores when present.
    if getattr(pred, 'ndim', 1) == 2:
        pred = pred.argmax(axis=1)
    score_vail = f1_score(y_true=labels, y_pred=pred, average='macro')
    return '1-f1_score', 1-score_vail

In [None]:


def runXGB(train_X, train_y, valid_X, valid_y,weights=None, seed_val=2019, child=1, colsample=0.3):
    """
    Train a 3-class xgboost model and predict on the validation set.

    train_X / train_y are the training features and labels; valid_X / valid_y
    are the validation features and labels. The validation set is the only
    entry in the watch list, so early stopping (50 rounds) is driven by it.
    `weights`, when given, are per-row training weights. `seed_val`, `child`
    and `colsample` are forwarded as `seed`, `min_child_weight` and
    `colsample_bytree`.

    Returns (predictions for the validation set, trained model).
    """
    # Key order is kept as-is; the parameters are handed to xgboost as a list
    # of (name, value) pairs.
    booster_params = {
        'objective': 'multi:softmax',   # alternative: 'multi:softprob'
        'eta': 0.1,                     # learning rate
        'max_depth': 3,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "auc",           # alternative: "mlogloss"
        'min_child_weight': child,
        'colsample_bytree': colsample,
        'seed': seed_val,
    }
    boosting_rounds = 2000

    # Only pass `weight` when the caller supplied sample weights.
    dtrain_kwargs = {'label': train_y}
    if weights is not None:
        dtrain_kwargs['weight'] = weights
    dtrain = xgb.DMatrix(train_X, **dtrain_kwargs)

    dvalid = xgb.DMatrix(valid_X, label=valid_y)
    eval_sets = [(dvalid, 'valid')]

    model = xgb.train(
        list(booster_params.items()),
        dtrain,
        boosting_rounds,
        eval_sets,
        early_stopping_rounds=50,
        feval=f1_score_vail,
        verbose_eval=20,
    )

    # Predict with the tree count of the best early-stopping iteration.
    # NOTE(review): with 'multi:softmax' this is presumably one class label
    # per row rather than a per-class distribution -- confirm against the
    # installed xgboost version.
    pred_test_y = model.predict(dvalid, ntree_limit=model.best_ntree_limit)

    return pred_test_y, model



In [None]:
# Handle class imbalance with a per-sample weight. Label 0 has 303 samples,
# label 1 has 7 and label 2 has 4, so each weight is 4 / class count.
weight_0 = 4/303
weight_1 = 4/7
weight_2 = 1.0

# Raw target score -> sample weight.
weight_map={
    0: weight_0,
    1/3: weight_1,
    2/3: weight_2
}

# Looking the scores up directly in `weight_map` needs bit-exact float
# equality with 1/3 and 2/3; a truncated value from the CSV would silently
# get a NaN weight. Match on the score expressed in thirds instead, within a
# small tolerance. Values that are not close to a whole third still map to NaN.
weight_by_thirds = {float(round(score * 3)): weight for score, weight in weight_map.items()}
score_thirds = df['question_type_spelling'] * 3
snapped_thirds = score_thirds.round()
snapped_thirds = snapped_thirds.where((score_thirds - snapped_thirds).abs() < 1e-6)

weights = snapped_thirds.map(weight_by_thirds)


In [None]:
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2019)
cv_scores = []
# Out-of-fold predicted class label for every training row. runXGB trains
# with 'multi:softmax', whose predictions are one label per row, so this is a
# 1-D array (a (n, 3) buffer cannot receive a 1-D fold prediction).
pred_train = np.zeros(df.shape[0])
for dev_index, val_index in kf.split(train_X):
    # "dev" is the training part of the fold, "val" the held-out part.
    # KFold yields positional indices, so select by position (.iloc); .loc
    # would only be correct for a default 0..n-1 index.
    dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
    # To train with sample weights, pass weights.iloc[dev_index] as the fifth
    # argument of runXGB.
    pred_val_y, model = runXGB(dev_X, dev_y, val_X, val_y)
    # If the model returns per-class scores, reduce them to labels.
    pred_val_y = np.asarray(pred_val_y)
    if pred_val_y.ndim == 2:
        pred_val_y = pred_val_y.argmax(axis=1)
    # Each row is held out exactly once, so after the last fold pred_train
    # holds a prediction for the whole training set.
    pred_train[val_index] = pred_val_y
    # The target has three classes, so an explicit multi-class average is
    # required (the default 'binary' raises); 'macro' matches f1_score_vail.
    cv_scores.append(f1_score(val_y, pred_val_y, labels=[0,1,2], average='macro'))
print("cv scores : ", cv_scores)

In [None]:

class Create_ensemble(object):
    """Out-of-fold predictions for a list of classifiers via stratified K-fold.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.
    base_models : list
        Classifiers exposing ``fit``, ``predict`` and ``predict_proba``.
    random_state : int, optional
        Seed for the fold shuffling. Defaults to 2019, the seed used by the
        other cross-validation code in this notebook.
    """

    def __init__(self, n_splits, base_models, random_state=2019):
        self.n_splits = n_splits
        self.base_models = base_models
        self.random_state = random_state

    def predict(self, X, y):
        """Fit every base model on each fold and collect held-out predictions.

        Returns
        -------
        train_proba : ndarray, shape (n_samples, n_classes)
            Held-out class probabilities. Every model writes into the same
            buffer, so after the loop it holds the LAST model's probabilities.
        train_pred : ndarray, shape (n_samples, n_models)
            Held-out predicted labels, one column per base model.

        The per-model, per-fold macro recall and macro F1 are also stored on
        the instance as ``recall_scores_`` and ``f1_scores_`` (each of shape
        (n_models, n_splits)).
        """
        X = np.array(X)
        y = np.array(y)
        no_class = len(np.unique(y))

        # Materialise the folds once so every model sees the same splits.
        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                     random_state=self.random_state).split(X, y))

        train_proba = np.zeros((X.shape[0], no_class))

        train_pred = np.zeros((X.shape[0], len(self.base_models)))
        f1_scores = np.zeros((len(self.base_models), self.n_splits))
        recall_scores = np.zeros((len(self.base_models), self.n_splits))

        for i, clf in enumerate(self.base_models):

            for j, (train_idx, valid_idx) in enumerate(folds):

                X_train = X[train_idx]
                Y_train = y[train_idx]
                X_valid = X[valid_idx]
                Y_valid = y[valid_idx]

                clf.fit(X_train, Y_train)

                valid_pred = clf.predict(X_valid)
                recall = recall_score(Y_valid, valid_pred, average='macro')
                f1 = f1_score(Y_valid, valid_pred, average='macro')

                recall_scores[i][j] = recall
                f1_scores[i][j] = f1

                train_pred[valid_idx, i] = valid_pred

                # Probabilities (overwritten by each subsequent model).
                valid_proba = clf.predict_proba(X_valid)
                train_proba[valid_idx, :] = valid_proba

                print( "Model- {} and CV- {} recall: {}, f1_score: {}".format(i, j, recall, f1))

        # Keep the fold metrics available instead of discarding them.
        self.f1_scores_ = f1_scores
        self.recall_scores_ = recall_scores

        return train_proba, train_pred