In [None]:
#解决方案包括三种建模：1、基于TFIDF统计特征的线性模型预测；
                #2、基于与比赛数据相似的特定文本数据的distilRoBERTa的文本二分类模型预测；
                #3、基于大规模AI生成数据和来自于网络开源数据集（人为写作）构成的多样性数据文本，使用deberta-v3-small训练二分类模型预测。

### 基于TFIDF的线性模型：
* 使用测试集文本预训练分词器，将所得分词器tokenizer用来分词（token）所有文本（包括训练集与测试集）
* 分词完成后使用TFIDF获取Ngram（3,5）文本统计特征
* 将上述特征输入MultinomialNB与SGDClassifier构成的融合分类器训练，然后预测结果
* 伪标签操作：将测试集部分预测结果置信度较高部分加入训练集一起训练，重复上述操作得到线性分类器最终结果

#### 数据集
* https://www.kaggle.com/datasets/thedrcat/daigt-v2-train-dataset
* https://www.kaggle.com/datasets/alejopaullier/argugpt
<br>将上述数据移除相似度大于0.9的样本后得到训练数据。

In [None]:
%%writefile lin_infer.py
import sys
import gc
from scipy.sparse import vstack
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
import glob
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import rankdata
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from tokenizers.normalizers import (Sequence, Lowercase, NFD, 
                                   StripAccents)

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
# Training corpus: open data shared on the competition forum.
train = pd.read_csv("/kaggle/input/llm-detect-data/train1.csv", sep=',')

# Competition files: https://www.kaggle.com/competitions/llm-detect-ai-generated-text
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# Trim surrounding whitespace from every essay.
# NOTE(review): the trailing call is Series.replace, which swaps whole cell
# values equal to '\n' rather than substrings, so newlines inside an essay are
# kept. Confirm whether .str.replace was intended before changing it.
for frame in (train, test):
    frame['text'] = frame['text'].str.strip().replace('\n', '')

# Keep one row per distinct text and renumber the rows from 0.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

# Class balance; the value is discarded when this file runs as a script.
train['label'].value_counts()

# Validation mode is active when the test file holds exactly 3 rows.
VALID_MODE = len(test) == 3

# Validation data set: loaded and cleaned the same way in both modes.
valid = (
    pd.read_csv('/kaggle/input/llm-detect-ai-validation2/nonTargetText_llm_slightly_modified_gen.csv')
    .dropna()
    .reset_index(drop=True)
)
valid['text'] = valid['text'].str.strip().replace('\n', '')
print(valid.shape)
# Disabled experiment (non-validation mode only): merge the validation data into train.
#   train = pd.concat([train, valid]).drop_duplicates(subset=['text']).reset_index(drop=True, inplace=False)

VOCAB_SIZE = 4000
LOWERCASE = False  # not read by any code visible in this script

# Special tokens reserved in the BPE vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Byte-level BPE tokenizer whose only normalization step is NFD.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([NFD()])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Trainer that learns the merges up to VOCAB_SIZE entries.
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Corpus the tokenizer is trained on: the validation texts in validation mode,
# otherwise the test texts.
corpus_frame = valid if VALID_MODE else test
dataset = Dataset.from_pandas(corpus_frame[['text']])

def train_corp_iter():
    """
    Yield the "text" column of the module-level `dataset` in consecutive
    slices of at most 1000 rows.
    """
    chunk_size = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        yield dataset[start : start + chunk_size]["text"]
        start += chunk_size

# Learn the BPE merges from the corpus selected above (test texts outside
# validation mode) and save the model files into the current directory.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
raw_tokenizer.model.save('.')

# Wrap the raw tokenizer so it exposes the transformers tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Token lists for the test essays.
tokenized_texts_test = [
    tokenizer.tokenize(essay, add_special_tokens=False)
    for essay in tqdm(test['text'].tolist())
]

# Token lists for the training essays.
tokenized_texts_train = [
    tokenizer.tokenize(essay, add_special_tokens=False)
    for essay in tqdm(train['text'].tolist())
]

if VALID_MODE:
    # Token lists for the validation essays, then the same augmentation that is
    # applied to train/test: tokens + even-indexed tokens + odd-indexed tokens.
    tokenized_texts_valid = [
        tokenizer.tokenize(essay, add_special_tokens=False)
        for essay in tqdm(valid['text'].tolist())
    ]
    tokenized_texts_valid_aug = [
        tokens + tokens[::2] + tokens[1::2]
        for tokens in tokenized_texts_valid
    ]

def dummy(text):
    """
    Identity pass-through handed to TfidfVectorizer as both `tokenizer` and
    `preprocessor`: the documents are already lists of tokens, so the input is
    returned unchanged.
    """
    return text

# Augment every token list with its even-indexed tokens followed by its
# odd-indexed tokens, so n-grams over the appended parts span every other token.
tokenized_texts_test_aug = [
    tokens + tokens[::2] + tokens[1::2] for tokens in tokenized_texts_test
]
tokenized_texts_train_aug = [
    tokens + tokens[::2] + tokens[1::2] for tokens in tokenized_texts_train
]

# Settings shared by both vectorizers: 3- to 5-grams over pre-tokenized input.
min_df = 2
tfidf_options = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    min_df=min_df,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: learn the n-gram vocabulary from the target corpus only
# (validation texts in validation mode, otherwise the test texts).
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(tokenized_texts_valid_aug if VALID_MODE else tokenized_texts_test_aug)

# Pass 2: refit on the training texts with the vocabulary fixed to the one
# learned above, so train and test share one feature space.
vocab = vectorizer.vocabulary_
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_options)

tf_train = vectorizer.fit_transform(tokenized_texts_train_aug)
tf_test = vectorizer.transform(tokenized_texts_test_aug)

y_train = train['label'].values

# Baseline model: soft vote of Multinomial Naive Bayes and an SGD linear
# classifier (modified Huber loss), weighted 0.2 / 0.8.
bayes_model = MultinomialNB(alpha=0.023)
sgd_model = SGDClassifier(max_iter=35000, tol=1e-4, loss="modified_huber")
weights = [0.2, 0.8]

estimators = [('mnb', bayes_model), ('sgd', sgd_model)]
ensemble = VotingClassifier(
    estimators=estimators,
    weights=weights,
    voting='soft',
    n_jobs=-1,
)
ensemble.fit(tf_train, y_train)
gc.collect()

# Second probability column for every test essay; these scores drive the
# pseudo-label selection outside validation mode.
final_preds = ensemble.predict_proba(tf_test)[:, 1]

def _refit_with_pseudo_labels(base_features, base_labels, pool_features, pool_scores):
    """
    Fit a fresh soft-voting ensemble on the training rows plus pseudo-labelled
    pool rows.

    Pool rows whose score rank is below 10% of the pool size are added with
    label 0, rows whose rank is above 90% are added with label 1.
    Returns the fitted VotingClassifier.
    """
    ranks = rankdata(pool_scores)
    pool_size = len(pool_scores)
    low_mask = ranks < 0.1 * pool_size
    high_mask = ranks > 0.9 * pool_size

    grown_features = vstack([base_features, pool_features[low_mask], pool_features[high_mask]])
    grown_labels = np.concatenate([
        base_labels,
        [0] * int(low_mask.sum()),
        [1] * int(high_mask.sum()),
    ])

    model = VotingClassifier(
        estimators=[
            ('mnb', MultinomialNB(alpha=0.02)),
            ('sgd', SGDClassifier(max_iter=25000, tol=1e-4, loss="modified_huber")),
        ],
        weights=[0.5, 0.5],
        voting='soft',
        n_jobs=-1,
    )
    model.fit(grown_features, grown_labels)
    return model


if VALID_MODE:
    # Pseudo-labels come from the labelled validation set so that the effect of
    # the refit can be measured with AUC before and after.
    tf_valid = vectorizer.transform(tokenized_texts_valid_aug)
    y_pred = ensemble.predict_proba(tf_valid)[:, 1]
    print("* valid AUC-ROC score:", roc_auc_score(valid["label"], y_pred))

    ensemble2 = _refit_with_pseudo_labels(tf_train, y_train, tf_valid, y_pred)

    y_pred = ensemble2.predict_proba(tf_valid)[:, 1]
    print("* pl valid AUC-ROC score:", roc_auc_score(valid["label"], y_pred))
else:
    # Pseudo-labels come from the baseline scores of the test essays.
    ensemble2 = _refit_with_pseudo_labels(tf_train, y_train, tf_test, final_preds)

# Both modes end by scoring the test essays with the refitted ensemble.
final_preds2 = ensemble2.predict_proba(tf_test)[:, 1]
sub['generated'] = final_preds2
sub.to_csv('sub_linear.csv', index=False)

In [None]:
# Run lin_infer.py in a separate Python process; the script writes sub_linear.csv.
!python lin_infer.py

### 基于语言模型的文本二分类模型：

* 训练参考开源代码：https://www.kaggle.com/code/mustafakeser4/train-detectai-distilroberta-0-927

In [None]:
%%writefile distilroberta_infer.py#推理代码
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer
##https://www.kaggle.com/datasets/mustafakeser4/detect-llm-models/versions/9
model_checkpoint = "/kaggle/input/detect-llm-models/distilroberta-finetuned_v5/checkpoint-13542"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples['text'], max_length = 512 , padding=True, truncation=True)
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Move your model and data to the GPU
model.to(device);
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_ds_enc)
logits = test_preds.predictions
probs = (np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True))[:,0]
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs
sub.to_csv('sub_nn.csv', index=False)
sub.head()

In [None]:
# Run distilroberta_infer.py in a separate Python process; the script writes sub_nn.csv.
!python distilroberta_infer.py

### 基于大规模数据的语言模型二分类：

* 数据链接：https://www.kaggle.com/datasets/canming/piles-and-ultra-data
    * 此数据来自huggingface作者开源，主要由以下数据集构成：
    <br>https://huggingface.co/datasets/EleutherAI/the_pile_deduplicated
    <br>https://huggingface.co/datasets/openbmb/UltraFeedback
    <br>https://huggingface.co/datasets/stingning/ultrachat
    <br>https://huggingface.co/datasets/lmsys/lmsys-chat-1m
    <br>数据已移除相似度大于0.8的样本。
* 训练代码：deberta_train_exp5.py

In [None]:
%%writefile deberta_infer.py#推理代码
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer

model_checkpoint = "/kaggle/input/llm-diverse-model2/LLM7/0120/checkpoint-369284"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples['text'], max_length = 512 , padding=True, truncation=True)
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Move your model and data to the GPU
model.to(device);
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_ds_enc)
logits = test_preds.predictions
probs = (np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True))[:,1]
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs
sub.to_csv('sub_nn2.csv', index=False)
sub.head()

In [None]:
# Run deberta_infer.py in a separate Python process; the script writes sub_nn2.csv.
!python deberta_infer.py

In [None]:
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy.stats import rankdata
# Stage-1 blend: weighted average of the rank-normalised scores of the linear
# model (0.7), the DistilRoBERTa model (0.2) and the DeBERTa model (0.1).
blend_files = ('./sub_linear.csv', './sub_nn.csv', './sub_nn2.csv')

# Load each submission ordered by id with a fresh 0..n-1 index, so the three
# frames line up row by row.
p1, p3, p4 = (
    pd.read_csv(path).sort_values(['id']).reset_index(drop=True)
    for path in blend_files
)
number_sample = p1.shape[0]

# Replace raw scores by their rank divided by the row count of the linear submission.
for frame in (p1, p3, p4):
    frame['generated'] = rankdata(frame['generated'])/number_sample

p1['generated'] = p1['generated']*0.7+p3['generated']*0.2+p4['generated']*0.1
# The blended scores are read back by lin_infer_stage2.py as pseudo-label scores.
p1[['id', 'generated']].to_csv('sub_stage1.csv', index=False)

### 二阶段的伪标签线性分类器

In [None]:
%%writefile lin_infer_stage2.py
import sys
import gc
from scipy.sparse import vstack
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
import glob
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import rankdata
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast
from tokenizers.normalizers import (Sequence, Lowercase, NFD, 
                                   StripAccents)

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

# Competition files.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# Base training set, minus the rows coming from the listed generator sources.
excluded_sources = [
    'mistral7binstruct_v2',
    'mistral7binstruct_v1',
    'falcon_180b_v1',
    'llama2_chat',
    'llama_70b_v1',
    'NousResearch/Llama-2-7b-chat-hf',
]
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')
train = train[~train['source'].isin(excluded_sources)].reset_index(drop=True)

# NOTE(review): `extra` is prepared here but is not part of the concat below;
# confirm whether leaving it out is intentional.
extra = pd.read_csv('/kaggle/input/argugpt/argugpt.csv').assign(label=1)[['text', 'label']]

# Additional similarity-filtered corpora.
extra2 = pd.read_csv('/kaggle/input/llm-detect-sim-filter-daigt-v3/llama_filter.csv')
extra3 = pd.read_csv('/kaggle/input/llm-detect-sim-filter-daigt-v3/falcon_filter.csv')
extra5 = pd.read_csv('/kaggle/input/llm-detect-sim-filter-essayfroum/essayforum_writingt_filter.csv')
extra6 = pd.read_csv('/kaggle/input/llm-detect-sim-filter-daigt-v3/mistral_filter.csv')

# Fixed-seed sample of 8000 rows, all given label 1.
extra7 = (
    pd.read_csv('/kaggle/input/llm-detect-sim-filter-nahedabdelgaber/evaluating-student-writing_filter.csv')
    .sample(8000, random_state=100)
    .reset_index(drop=True)
    .assign(label=1)
)[['text', 'label']]

train = pd.concat([train, extra2, extra3, extra5, extra6, extra7], ignore_index=True)
# Debug shortcut (disabled): when len(test)==3, subsample train to 1000 rows
# with random_state=100.

# Trim surrounding whitespace from every essay.
# NOTE(review): the trailing call is Series.replace, which swaps whole cell
# values equal to '\n' rather than substrings, so newlines inside an essay are
# kept. Confirm whether .str.replace was intended before changing it.
for frame in (train, test):
    frame['text'] = frame['text'].str.strip().replace('\n', '')

# Keep one row per distinct text and renumber the rows from 0.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

# Class balance; the value is discarded when this file runs as a script.
train['label'].value_counts()

# Validation mode is active when the test file holds exactly 3 rows.
VALID_MODE = len(test) == 3

# Validation data set: loaded and cleaned the same way in both modes.
valid = (
    pd.read_csv('/kaggle/input/llm-detect-ai-validation2/nonTargetText_llm_slightly_modified_gen.csv')
    .dropna()
    .reset_index(drop=True)
)
valid['text'] = valid['text'].str.strip().replace('\n', '')
print(valid.shape)
# Disabled experiment (non-validation mode only): merge the validation data into train.
#   train = pd.concat([train, valid]).drop_duplicates(subset=['text']).reset_index(drop=True, inplace=False)

VOCAB_SIZE = 3500
LOWERCASE = False  # not read by any code visible in this script

# Special tokens reserved in the BPE vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Byte-level BPE tokenizer whose only normalization step is NFD.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([NFD()])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Trainer that learns the merges up to VOCAB_SIZE entries.
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Corpus the tokenizer is trained on: the validation texts in validation mode,
# otherwise the test texts.
corpus_frame = valid if VALID_MODE else test
dataset = Dataset.from_pandas(corpus_frame[['text']])

def train_corp_iter():
    """
    Yield the "text" column of the module-level `dataset` in consecutive
    slices of at most 1000 rows.
    """
    chunk_size = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        yield dataset[start : start + chunk_size]["text"]
        start += chunk_size

# Learn the BPE merges from the corpus selected above (test texts outside
# validation mode) and save the model files into the current directory.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
raw_tokenizer.model.save('.')

# Wrap the raw tokenizer so it exposes the transformers tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Token lists for the test essays.
tokenized_texts_test = [
    tokenizer.tokenize(essay, add_special_tokens=False)
    for essay in tqdm(test['text'].tolist())
]

# Token lists for the training essays.
tokenized_texts_train = [
    tokenizer.tokenize(essay, add_special_tokens=False)
    for essay in tqdm(train['text'].tolist())
]

if VALID_MODE:
    # Token lists for the validation essays, then the same augmentation that is
    # applied to train/test: tokens + even-indexed tokens + odd-indexed tokens.
    tokenized_texts_valid = [
        tokenizer.tokenize(essay, add_special_tokens=False)
        for essay in tqdm(valid['text'].tolist())
    ]
    tokenized_texts_valid_aug = [
        tokens + tokens[::2] + tokens[1::2]
        for tokens in tokenized_texts_valid
    ]

def dummy(text):
    """
    Identity pass-through handed to TfidfVectorizer as both `tokenizer` and
    `preprocessor`: the documents are already lists of tokens, so the input is
    returned unchanged.
    """
    return text

# Augment every token list with its even-indexed tokens followed by its
# odd-indexed tokens, so n-grams over the appended parts span every other token.
tokenized_texts_test_aug = [
    tokens + tokens[::2] + tokens[1::2] for tokens in tokenized_texts_test
]
tokenized_texts_train_aug = [
    tokens + tokens[::2] + tokens[1::2] for tokens in tokenized_texts_train
]

# Settings shared by both vectorizers: 3- to 5-grams over pre-tokenized input.
min_df = 2
tfidf_options = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    min_df=min_df,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: learn the n-gram vocabulary from the target corpus only
# (validation texts in validation mode, otherwise the test texts).
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(tokenized_texts_valid_aug if VALID_MODE else tokenized_texts_test_aug)

# Pass 2: refit on the training texts with the vocabulary fixed to the one
# learned above, so train and test share one feature space.
vocab = vectorizer.vocabulary_
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_options)

tf_train = vectorizer.fit_transform(tokenized_texts_train_aug)
tf_test = vectorizer.transform(tokenized_texts_test_aug)

y_train = train['label'].values

# Baseline model: soft vote of Multinomial Naive Bayes and an SGD linear
# classifier (modified Huber loss), weighted 0.1 / 0.9.
bayes_model = MultinomialNB(alpha=0.02)
sgd_model = SGDClassifier(max_iter=35000, tol=1e-4, loss="modified_huber")

weights = [0.1,0.9,]

ensemble = VotingClassifier(estimators=[('mnb',bayes_model),
                                        ('sgd', sgd_model),
                                       ],
                            weights=weights, voting='soft', n_jobs=-1)
# Only the validation branch further down reads `ensemble`; outside validation
# mode the pseudo-label scores come from sub_stage1.csv, so the fit is skipped.
if VALID_MODE:
    ensemble.fit(tf_train, y_train)
gc.collect()

# Stage-1 blended scores, used below to pick pseudo-labelled test rows.
# sub_stage1.csv is written sorted by id, while the rows of tf_test follow the
# order of test_essays.csv, so the scores are aligned on id instead of being
# taken positionally.
stage1 = pd.read_csv('sub_stage1.csv')
final_preds = stage1.set_index('id')['generated'].reindex(test['id']).values
if pd.isna(final_preds).any():
    # A missing score would turn every rank into NaN and silently disable the
    # pseudo-labelling step, so fail loudly instead.
    raise ValueError("sub_stage1.csv does not provide a score for every id in test_essays.csv")

def _refit_with_pseudo_labels(base_features, base_labels, pool_features, pool_scores,
                              low_frac, high_frac, alpha, max_iter, vote_weights):
    """
    Fit a fresh soft-voting ensemble on the training rows plus pseudo-labelled
    pool rows.

    Pool rows whose score rank is below `low_frac` times the pool size are
    added with label 0, rows whose rank is above `high_frac` times the pool
    size are added with label 1. `alpha` configures MultinomialNB, `max_iter`
    configures SGDClassifier and `vote_weights` weights the two voters.
    Returns the fitted VotingClassifier.
    """
    ranks = rankdata(pool_scores)
    pool_size = len(pool_scores)
    low_mask = ranks < low_frac * pool_size
    high_mask = ranks > high_frac * pool_size

    grown_features = vstack([base_features, pool_features[low_mask], pool_features[high_mask]])
    grown_labels = np.concatenate([
        base_labels,
        [0] * int(low_mask.sum()),
        [1] * int(high_mask.sum()),
    ])

    model = VotingClassifier(
        estimators=[
            ('mnb', MultinomialNB(alpha=alpha)),
            ('sgd', SGDClassifier(max_iter=max_iter, tol=1e-4, loss="modified_huber")),
        ],
        weights=vote_weights,
        voting='soft',
        n_jobs=-1,
    )
    model.fit(grown_features, grown_labels)
    return model


if VALID_MODE:
    # Pseudo-labels come from the labelled validation set so that the effect of
    # the refit can be measured with AUC before and after.
    tf_valid = vectorizer.transform(tokenized_texts_valid_aug)
    y_pred = ensemble.predict_proba(tf_valid)[:, 1]
    print("* valid AUC-ROC score:", roc_auc_score(valid["label"], y_pred))

    ensemble2 = _refit_with_pseudo_labels(
        tf_train, y_train, tf_valid, y_pred,
        low_frac=0.1, high_frac=0.9,
        alpha=0.02, max_iter=25000, vote_weights=[0.5, 0.5],
    )

    y_pred = ensemble2.predict_proba(tf_valid)[:, 1]
    print("* pl valid AUC-ROC score:", roc_auc_score(valid["label"], y_pred))
else:
    # Pseudo-labels come from the stage-1 blended scores of the test essays,
    # using wider tails (15% on each side) than the validation branch.
    ensemble2 = _refit_with_pseudo_labels(
        tf_train, y_train, tf_test, final_preds,
        low_frac=0.15, high_frac=0.85,
        alpha=0.022, max_iter=33000, vote_weights=[0.1, 0.9],
    )

# Both modes end by scoring the test essays with the refitted ensemble.
final_preds2 = ensemble2.predict_proba(tf_test)[:, 1]
sub['generated'] = final_preds2
sub.to_csv('sub_linear_stage2.csv', index=False)

In [None]:
# Run lin_infer_stage2.py in a separate Python process; the script reads
# sub_stage1.csv and writes sub_linear_stage2.csv.
!python lin_infer_stage2.py

### 二阶段融合

In [None]:
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy.stats import rankdata
# Final blend: weighted average of the rank-normalised scores of the stage-2
# linear model (0.7), the DistilRoBERTa model (0.2) and the DeBERTa model (0.1).
blend_files = ('./sub_linear_stage2.csv', './sub_nn.csv', './sub_nn2.csv')

# Load each submission ordered by id with a fresh 0..n-1 index, so the three
# frames line up row by row.
p1, p3, p4 = (
    pd.read_csv(path).sort_values(['id']).reset_index(drop=True)
    for path in blend_files
)
number_sample = p1.shape[0]

# Replace raw scores by their rank divided by the row count of the linear submission.
for frame in (p1, p3, p4):
    frame['generated'] = rankdata(frame['generated'])/number_sample

p1['generated'] = p1['generated']*0.7+p3['generated']*0.2+p4['generated']*0.1
p1[['id', 'generated']].to_csv('submission.csv', index=False)