In [1]:
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
test_df = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
display(df.head())
display(test_df.head())

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [3]:
df = df.drop('essay_id', axis=1)

In [4]:
test_df = test_df.drop('essay_id', axis=1)

In [5]:
le = LabelEncoder()

df['score'] = le.fit_transform(df['score'])

In [6]:
train_split, val_split = train_test_split(df, test_size=0.2, random_state=52, stratify=df['score'])

In [7]:
train_pool = Pool(
    train_split.drop('score', axis=1),
    train_split['score'],
    text_features=['full_text'],
    feature_names=list(train_split.drop('score', axis=1))
)
val_pool = Pool(
    val_split.drop('score', axis=1),
    val_split['score'],
    text_features=['full_text'],
    feature_names=list(val_split.drop('score', axis=1))
)

In [8]:
model = CatBoostClassifier(iterations=300,
                           depth=5,
                           eval_metric='WKappa',
                           early_stopping_rounds=20,
                           use_best_model=True,
                           random_seed=33)

model.fit(train_pool, eval_set=val_pool)

Learning rate set to 0.180258
0:	learn: 0.5335978	test: 0.5293200	best: 0.5293200 (0)	total: 649ms	remaining: 3m 13s
1:	learn: 0.5399308	test: 0.5495652	best: 0.5495652 (1)	total: 1.18s	remaining: 2m 56s
2:	learn: 0.5557569	test: 0.5650717	best: 0.5650717 (2)	total: 1.76s	remaining: 2m 54s
3:	learn: 0.5674989	test: 0.5722259	best: 0.5722259 (3)	total: 2.33s	remaining: 2m 52s
4:	learn: 0.5723367	test: 0.5832950	best: 0.5832950 (4)	total: 2.88s	remaining: 2m 49s
5:	learn: 0.5823836	test: 0.5895870	best: 0.5895870 (5)	total: 3.41s	remaining: 2m 47s
6:	learn: 0.5909267	test: 0.6005987	best: 0.6005987 (6)	total: 3.93s	remaining: 2m 44s
7:	learn: 0.5957941	test: 0.6039449	best: 0.6039449 (7)	total: 4.49s	remaining: 2m 43s
8:	learn: 0.6024900	test: 0.6084067	best: 0.6084067 (8)	total: 5.04s	remaining: 2m 43s
9:	learn: 0.6080443	test: 0.6118047	best: 0.6118047 (9)	total: 5.6s	remaining: 2m 42s
10:	learn: 0.6179245	test: 0.6234053	best: 0.6234053 (10)	total: 6.21s	remaining: 2m 43s
11:	learn: 0

<catboost.core.CatBoostClassifier at 0x7aa21174fa60>

In [9]:
test_pool = Pool(
    data=test_df,
    text_features=['full_text']
)
preds_test = model.predict(test_pool)

In [10]:
submission = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')
submission.head()

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4


In [11]:
submission["score"] = preds_test
submission["score"] = le.inverse_transform(submission["score"])

In [12]:
submission.to_csv("submission.csv", index=False)