In [1]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re 
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.exceptions import FitFailedWarning
from sklearn.model_selection import cross_val_score
import warnings
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


In [2]:
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')
ps = PorterStemmer()

In [3]:
train_df.drop(['url_legal', 'license'], axis=1, inplace=True)
test_df.drop(['url_legal', 'license'], axis=1, inplace=True)

In [4]:
def cleanup(text):
    clean_text = re.sub('[^a-zA-Z]', ' ', text)
    clean_text = clean_text.lower()
    clean_text = clean_text.split()
    clean_text = (ps.stem(word) for word in clean_text if not word in stopwords.words('english'))   
    clean_text = ' '.join(clean_text)
    return clean_text

In [5]:
combined = pd.concat([train_df, test_df])
combined

Unnamed: 0,id,excerpt,target,standard_error
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...
2,0df072751,It was a bright and cheerful scene that greete...,,
3,04caf4e0c,Cell division is the process by which a parent...,,
4,0e63f8bea,Debugging is the process of finding and resolv...,,
5,12537fe78,"To explain transitivity, let us look first at ...",,


In [6]:
combined['excerpt'] = combined['excerpt'].apply(lambda x: cleanup(x))

In [7]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(combined['excerpt']).toarray()
X_train = X[0:len(train_df)]
X_test = X[len(train_df):]

In [8]:
y_train = train_df['target']

In [9]:
params = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
    'n_estimators': [100, 200, 300,400, 500, 600]
}

In [10]:
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.15, max_delta_step=0, max_depth=6,
             min_child_weight=7, monotone_constraints='()',
             n_estimators=600, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
random_search = RandomizedSearchCV(reg, param_distributions=params, n_iter=5, scoring='roc_auc')

In [12]:
with warnings.catch_warnings(record=True) as w:
    try:
        random_search.fit(X_train, y_train) 
    except ValueError:
        pass
    print(repr(w[-1].message))



In [13]:
random_search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0.2, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=9,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
score = cross_val_score(reg, X_train, y_train)
score

array([0.19685044, 0.06439059, 0.19178856, 0.47074367, 0.28955276])

In [15]:
reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.15, max_delta_step=0, max_depth=6,
             min_child_weight=7, missing=nan, monotone_constraints='()',
             n_estimators=600, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
pred = reg.predict(X_test)
pred

array([-1.1235982 , -0.9508646 , -0.86197066, -1.2228249 , -1.8451799 ,
       -0.9843696 , -0.08290811], dtype=float32)

In [17]:
sub = pd.DataFrame(columns=['id', 'target'])
sub['id'] = test_df['id']
sub['target'] = pred
sub.to_csv('submission.csv', index=False)