# Import dependencies

In [1]:
import numpy as np
import pandas as pd 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
np.random.seed(0)

# Read the dataset

In [3]:
train_df = pd.read_csv("../../Data/kaggle/commonlit-readability/train.csv")
test_df = pd.read_csv("../../Data/kaggle/commonlit-readability/test.csv")
train_df.head()
#test_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


# Inspect the data

In [4]:
train_df.nunique()

id                2834
url_legal          667
license             15
excerpt           2834
target            2834
standard_error    2834
dtype: int64

# Preprocessing

In [5]:
train_df = train_df.drop(columns=['id','url_legal','license'])
train_df.head()

Unnamed: 0,excerpt,target,standard_error
0,When the young people returned to the ballroom...,-0.340259,0.464009
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,And outside before the palace a great garden w...,-1.054013,0.450007
4,Once upon a time there were Three Bears who li...,0.247197,0.510845


## Convert text into lower case

In [6]:
excerpts = train_df["excerpt"]
target = train_df["target"]

excerpts.str.lower()
excerpts.head()

0    When the young people returned to the ballroom...
1    All through dinner time, Mrs. Fayre was somewh...
2    As Roger had predicted, the snow departed as q...
3    And outside before the palace a great garden w...
4    Once upon a time there were Three Bears who li...
Name: excerpt, dtype: object

## stemming

In [7]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
excerpts = excerpts.apply(ps.stem)
excerpts.head()

0    when the young people returned to the ballroom...
1    all through dinner time, mrs. fayre was somewh...
2    as roger had predicted, the snow departed as q...
3    and outside before the palace a great garden w...
4    once upon a time there were three bears who li...
Name: excerpt, dtype: object

## Lemmatization

In [8]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
excerpts = excerpts.apply(wnl.lemmatize)

## Removing stopwords

In [9]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
excerpts = excerpts.apply(lambda text: remove_stopwords(text))

## Train-test split

In [10]:
excerpts_train, excerpts_val, y_train, y_val = train_test_split(excerpts, target, test_size=0.2)
excerpts_train.head()

405     hippocrates (c. 460 – c. 370 bc) greek doctor ...
836     long ago, clever cat foolish dog. clever cat l...
57      cry long, however, brave could expected prince...
1775    produce electric current, needed lower suspend...
2525    mun bun disobedient little boy; daddy bunker s...
Name: excerpt, dtype: object

In [11]:
vectorizer = CountVectorizer()
vectorizer.fit(excerpts_train)

CountVectorizer()

In [12]:
X_train = vectorizer.transform(excerpts_train)
X_val = vectorizer.transform(excerpts_val)
#print(X_train)

# Training

## Random Forest

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np 

regressor = RandomForestRegressor(random_state=42,n_estimators=100,max_depth=100)
regressor.fit(X_train,y_train)

RandomForestRegressor(max_depth=100, random_state=42)

In [14]:

y_pred = regressor.predict(X_train)
error = np.sqrt(mean_squared_error(y_train,y_pred))
print("{:,.02f}".format(error))

0.32


# Evaluate 

In [15]:
train_pred = regressor.predict(X_train)
val_pred = regressor.predict(X_val)
rmse_train = np.sqrt(mean_squared_error(y_train,train_pred))
rmse_val = np.sqrt(mean_squared_error(y_val,val_pred))
print("RMSE Train: {}".format(rmse_train))
print("RMSE Val: {}".format(rmse_val))

RMSE Train: 0.3174112784946647
RMSE Val: 0.8325789183586854


# Working on Testing data

In [16]:
test_excerpts = test_df["excerpt"]

In [17]:
test_excerpts = test_excerpts.str.lower()
test_excerpts = test_excerpts.apply(ps.stem)
test_excerpts = test_excerpts.apply(wnl.lemmatize)
test_excerpts = test_excerpts.apply(lambda text: remove_stopwords(text))

In [18]:
X_test = vectorizer.transform(test_excerpts)

# Predict

In [19]:
test_preds = regressor.predict(X_test)

# Export to CSV

In [20]:
x_sub = test_df[["id"]].copy()
x_sub["target"] = test_preds
x_sub.to_csv('submission.csv', index = False)
x_sub

Unnamed: 0,id,target
0,c0f722661,-1.036358
1,f0953f0a5,-0.365061
2,0df072751,-0.289395
3,04caf4e0c,-1.636836
4,0e63f8bea,-1.435692
5,12537fe78,-0.85162
6,965e592c0,-0.999787
