In [1]:
#データセット取得
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv


In [2]:
#モデルインポート
import pandas as pd
import numpy as np
import re
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

In [3]:
#トレーニングデータ・テストデータの準備
train=pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
test=pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')

In [4]:
#データ前処理
def data_cleaner(text):
    text = text.strip()
    text = re.sub(r'\n', '', text)
    text = text.lower()
    return text

In [5]:
#'full_text'列のデータ前処理実行
train['full_text']=train['full_text'].apply(data_cleaner)
test['full_text']=test['full_text'].apply(data_cleaner)

In [6]:
#感情分析で特徴量を作成
import nltk
from tqdm import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def generate_sentiment_scores(data):
    sid = SentimentIntensityAnalyzer()
    neg=[]
    pos=[]
    neu=[]
    comp=[]
    for sentence in tqdm(data['full_text'].values): 
        sentence_sentiment_score = sid.polarity_scores(sentence)
        comp.append(sentence_sentiment_score['compound'])
        neg.append(sentence_sentiment_score['neg'])
        pos.append(sentence_sentiment_score['pos'])
        neu.append(sentence_sentiment_score['neu'])
    return comp,neg,pos,neu
train['compound'],train['negative'],train['positive'],train['neutral']=generate_sentiment_scores(train)
test['compound'],test['negative'],test['positive'],test['neutral']=generate_sentiment_scores(test)

100%|██████████| 3911/3911 [00:18<00:00, 213.54it/s]
100%|██████████| 3/3 [00:00<00:00, 174.29it/s]


In [7]:
#'full_text'列の単語数である'com_len'列を作成
train['com_len']=train['full_text'].apply(lambda x:len(x.split()))
test['com_len']=test['full_text'].apply(lambda x:len(x.split()))

In [8]:
#'full_text'列をTfidfでベクトル化
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['full_text'])
X_test = vectorizer.transform(test['full_text'])

In [9]:
%%time
#'compound'列の値の範囲を(-1,1)に変換
from sklearn.preprocessing import Normalizer
trans = Normalizer()
X_train_com=trans.fit_transform(train['compound'].values.reshape(-1,1))
X_test_com=trans.transform(test['compound'].values.reshape(-1,1))

CPU times: user 0 ns, sys: 2.67 ms, total: 2.67 ms
Wall time: 2.36 ms


In [10]:
%%time
#'negative'列の値の範囲を(-1,1)に変換
from sklearn.preprocessing import Normalizer
trans = Normalizer()
X_train_neg=trans.fit_transform(train['negative'].values.reshape(-1,1))
X_test_neg=trans.transform(test['negative'].values.reshape(-1,1))

CPU times: user 866 µs, sys: 0 ns, total: 866 µs
Wall time: 879 µs


In [11]:
%%time
#'positive'列の値の範囲を(-1,1)に変換
from sklearn.preprocessing import Normalizer
trans = Normalizer()
X_train_pos=trans.fit_transform(train['positive'].values.reshape(-1,1))
X_test_pos=trans.transform(test['positive'].values.reshape(-1,1))

CPU times: user 1.63 ms, sys: 0 ns, total: 1.63 ms
Wall time: 1.51 ms


In [12]:
%%time
#'neutral'列の値の範囲を(-1,1)に変換
from sklearn.preprocessing import Normalizer
trans = Normalizer()
X_train_neu=trans.fit_transform(train['neutral'].values.reshape(-1,1))
X_test_neu=trans.transform(test['neutral'].values.reshape(-1,1))

CPU times: user 2.44 ms, sys: 29 µs, total: 2.47 ms
Wall time: 2.38 ms


In [13]:
%%time
#'com_len'列の値の範囲を(-1,1)に変換
from sklearn.preprocessing import Normalizer
trans = Normalizer()
X_train_len=trans.fit_transform(train['com_len'].values.reshape(-1,1))
X_test_len=trans.transform(test['com_len'].values.reshape(-1,1))

CPU times: user 2.1 ms, sys: 0 ns, total: 2.1 ms
Wall time: 1.91 ms


In [14]:
%%time
#(-1,1)に変換した特徴量を水平方向に並べる
from scipy.sparse import hstack
train_s=hstack((X_train,X_train_com,X_train_neg,X_train_pos,X_train_neu,X_train_len))
test_s=hstack((X_test,X_test_com,X_test_neg,X_test_pos,X_test_neu,X_test_len))

CPU times: user 14.4 ms, sys: 8.05 ms, total: 22.5 ms
Wall time: 21.9 ms


### level0

In [15]:
#LightBGMのパラメータ
params_lgb = {
    "n_estimators": 1000,
    "verbose": -1
}

In [16]:
#目的変数
y_train=train[['cohesion','syntax','vocabulary','phraseology','grammar','conventions']]

In [17]:
#LightBGMモデル作成
model = MultiOutputRegressor(LGBMRegressor(**params_lgb))
model.fit(train_s, y_train)

MultiOutputRegressor(estimator=LGBMRegressor(n_estimators=1000, verbose=-1))

In [18]:
#CatBoostのパラメータ
param = {'learning_rate': 0.3, 
          'depth': 12, 
          'l2_leaf_reg': 4, 
          'loss_function': 'MultiRMSE', 
          'eval_metric': 'MultiRMSE', 
          'task_type': 'CPU', 
          'iterations': 20,
          'od_type': 'Iter', 
          'boosting_type': 'Plain', 
          'bootstrap_type': 'Bayesian', 
          'allow_const_label': True, 
          'random_state': 1
         }

In [19]:
#CatBoostモデル作成
model2 = CatBoostRegressor(**param)
model2.fit(train_s, y_train)

0:	learn: 1.5227211	total: 1m 17s	remaining: 24m 41s
1:	learn: 1.4675746	total: 2m 36s	remaining: 23m 26s
2:	learn: 1.4133314	total: 3m 54s	remaining: 22m 8s
3:	learn: 1.3742196	total: 5m 12s	remaining: 20m 50s
4:	learn: 1.3321844	total: 6m 30s	remaining: 19m 31s
5:	learn: 1.2941530	total: 7m 48s	remaining: 18m 13s
6:	learn: 1.2711735	total: 9m 5s	remaining: 16m 53s
7:	learn: 1.2429122	total: 10m 24s	remaining: 15m 36s
8:	learn: 1.2072600	total: 11m 42s	remaining: 14m 18s
9:	learn: 1.1784177	total: 12m 59s	remaining: 12m 59s
10:	learn: 1.1599020	total: 14m 17s	remaining: 11m 41s
11:	learn: 1.1426351	total: 15m 34s	remaining: 10m 22s
12:	learn: 1.1224001	total: 16m 52s	remaining: 9m 5s
13:	learn: 1.1028159	total: 18m 9s	remaining: 7m 46s
14:	learn: 1.0928683	total: 18m 38s	remaining: 6m 12s
15:	learn: 1.0764745	total: 19m 56s	remaining: 4m 59s
16:	learn: 1.0395173	total: 21m 13s	remaining: 3m 44s
17:	learn: 1.0253966	total: 22m 31s	remaining: 2m 30s
18:	learn: 1.0199482	total: 23m 48s	r

<catboost.core.CatBoostRegressor at 0x7f9c6ef5d6d0>

In [20]:
#Ridgeモデル作成
model3 = Ridge(copy_X=False)
model3.fit(train_s, y_train)

Ridge(copy_X=False)

In [21]:
#XGBoostモデル作成
xgb_estimator = xgb.XGBRegressor(
        n_estimators=500, random_state=0, 
        objective='reg:squarederror')
model4 = MultiOutputRegressor(xgb_estimator, n_jobs=2)
model4.fit(train_s, y_train)

MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=None,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None, max_bin=None,
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=None,
     

In [22]:
#トレーニングデータで各モデル予測
first_pred_1 = model.predict(train_s)
first_pred_2 = model2.predict(train_s)
first_pred_3 = model3.predict(train_s)
first_pred_4 = model4.predict(train_s)
#水平方向に結果を並べる
stack_pred = np.column_stack((first_pred_1,first_pred_2,first_pred_3,first_pred_4))



### level1

In [23]:
#メタモデル用のCatBoostパラメータ
params = {'learning_rate': 0.3, 
          'depth': 12, 
          'l2_leaf_reg': 4, 
          'loss_function': 'MultiRMSE', 
          'eval_metric': 'MultiRMSE', 
          'task_type': 'CPU', 
          'iterations': 20,
          'od_type': 'Iter', 
          'boosting_type': 'Plain', 
          'bootstrap_type': 'Bayesian', 
          'allow_const_label': True, 
          'random_state': 1
         }

In [24]:
#CatBoostでメタモデルの作成
meta_model =  CatBoostRegressor(**params)
meta_model.fit(stack_pred, y_train)

0:	learn: 1.2759208	total: 1.49s	remaining: 28.3s
1:	learn: 1.0311473	total: 2.95s	remaining: 26.5s
2:	learn: 0.8468462	total: 4.45s	remaining: 25.2s
3:	learn: 0.7172320	total: 5.89s	remaining: 23.6s
4:	learn: 0.6172556	total: 7.35s	remaining: 22s
5:	learn: 0.5372492	total: 8.8s	remaining: 20.5s
6:	learn: 0.4779172	total: 10.3s	remaining: 19.1s
7:	learn: 0.4345750	total: 11.7s	remaining: 17.6s
8:	learn: 0.3993495	total: 13.2s	remaining: 16.1s
9:	learn: 0.3719801	total: 14.8s	remaining: 14.8s
10:	learn: 0.3524307	total: 16.4s	remaining: 13.4s
11:	learn: 0.3375105	total: 17.9s	remaining: 11.9s
12:	learn: 0.3220324	total: 19.4s	remaining: 10.4s
13:	learn: 0.3084164	total: 20.9s	remaining: 8.94s
14:	learn: 0.3001843	total: 22.3s	remaining: 7.44s
15:	learn: 0.2896271	total: 23.8s	remaining: 5.95s
16:	learn: 0.2800231	total: 25.3s	remaining: 4.46s
17:	learn: 0.2715976	total: 26.8s	remaining: 2.97s
18:	learn: 0.2639165	total: 28.2s	remaining: 1.48s
19:	learn: 0.2580220	total: 29.6s	remaining:

<catboost.core.CatBoostRegressor at 0x7f9c6e6509d0>

In [25]:
#テストデータで各モデル予測
pred_1 = model.predict(test_s)
pred_2 = model2.predict(test_s)
pred_3 = model3.predict(test_s)
pred_4 = model4.predict(test_s)
#水平方向に結果を並べる
test_stack_pred = np.column_stack((pred_1,pred_2,pred_3,pred_4))

In [26]:
#メタモデルでテストデータ予測
y_test=meta_model.predict(test_stack_pred)

In [27]:
#提出用サンプルデータ作成
sample = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

In [28]:
#スライスで提出用データに予測を書き込む
sample['cohesion']=y_test[:,0]
sample['syntax']=y_test[:,1]
sample['vocabulary']=y_test[:,2]
sample['phraseology']=y_test[:,3]
sample['grammar']=y_test[:,4]
sample['conventions']=y_test[:,5]

In [29]:
#'text_id'を提出用データに書き込む
sample['text_id']=test['text_id']

In [30]:
#提出用データでcsvを作成
sample.to_csv('submission.csv',index=False)