## CommonLit Readability Challenge

### Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.svm import SVR


### Loading the training dataset

In [None]:
# Load the training CSV into a DataFrame; later cells read its 'excerpt',
# 'target' and 'standard_error' columns.
read_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [None]:
# Summary statistics of the regression target column.
read_df['target'].describe()

In [None]:
def count(l1, l2):
    """Return how many items of *l1* are contained in *l2*."""
    matches = 0
    for item in l1:
        if item in l2:
            matches += 1
    return matches


def count_punct(text):
    """Return the number of characters in *text* that are ASCII punctuation.

    "Punctuation" means membership in ``string.punctuation``.
    """
    punctuation_chars = set(string.punctuation)
    return count(text, punctuation_chars)

In [None]:
def capitalWordCount(text):
    """Return the number of whitespace-separated tokens that are upper-case.

    A token counts when ``str.isupper`` is true for it, i.e. it has at least
    one cased character and no lower-case ones (so "NASA" and "I" count,
    "Mars" and "123" do not).
    """
    upper_tokens = [token for token in text.split() if token.isupper()]
    return len(upper_tokens)

In [None]:
def nounCount(text):
    """Return how many tokens of *text* NLTK tags as nouns.

    The text is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; a token counts when its tag starts with ``NN``.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return sum(1 for _token, tag in tagged_tokens if tag[:2] == 'NN')

In [None]:
# Per-excerpt noun count, computed on the raw text (the 'excerpt' column is
# only cleaned in a later cell).
read_df['noun_count'] = read_df['excerpt'].apply(nounCount)

In [None]:
# Per-excerpt count of fully upper-case words; must run before the excerpts
# are lower-cased by the cleaning step further down.
read_df['capital_word_count'] = read_df['excerpt'].apply(capitalWordCount)

In [None]:
# Per-excerpt punctuation-character count; must run before the cleaning step
# further down strips every non-letter character.
read_df['punct_count'] = read_df['excerpt'].apply(count_punct)

In [None]:
# Scaler for the hand-crafted numeric features: fitted on the training frame
# below and reused (transform only) on the test frame.
scaler = MinMaxScaler()

In [None]:
# Display the frame to inspect the newly added count columns.
read_df

### Correlation between punctuation count and target

In [None]:
# Pairwise correlation matrix of the raw (unscaled) punctuation count and the target.
read_df[['punct_count','target']].corr()

### Attributes

In [None]:
import re
def remove_links(text):
    """Normalise *text* to lower-case, lemmatized, letters-only words.

    Despite the name, nothing here targets links specifically: every
    character outside ``a-z``/``A-Z`` is replaced by a space, the result is
    lower-cased, and each whitespace-separated word is passed through
    ``WordNetLemmatizer.lemmatize``. The words are re-joined with single
    spaces.
    """
    lemmatizer = WordNetLemmatizer()
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    lowered_words = letters_only.lower().split()
    lemmas = []
    for word in lowered_words:
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

In [None]:
def text_process(mess):
    """
    Strip punctuation and English stop words from a string.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (characters in ``string.punctuation``)
    2. Split on whitespace and drop every word whose lower-cased form is in
       NLTK's English stop-word list
    3. Return the remaining words as a list (original casing preserved)
    """
    # Load the stop-word list once per call and keep it in a set: calling
    # stopwords.words('english') inside the comprehension re-reads the list
    # for every single word and then does a linear scan over it.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Drop punctuation characters and re-assemble the string.
    nopunc = ''.join(char for char in mess if char not in punctuation)

    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
# Overwrite each excerpt with its cleaned form (letters only, lower-cased,
# lemmatized). The raw text is lost here, which is why the punctuation,
# capital-word and noun counts are computed in earlier cells.
read_df['excerpt'] = read_df['excerpt'].apply(remove_links)

In [None]:
# Character length of the *cleaned* excerpt (the column was rewritten by
# remove_links in the previous cell).
read_df['excerpt_length'] = read_df['excerpt'].apply(len)

In [None]:
# Fit the scaler on the four hand-crafted features and replace them in place
# with their scaled values. The same column order is used when the test frame
# is transformed later.
# NOTE(review): the scaler is fitted on every training row, before the
# train/hold-out split made further down, so hold-out rows influence the scaling.
read_df[['excerpt_length','punct_count','capital_word_count','noun_count']] = scaler.fit_transform(read_df[['excerpt_length','punct_count','capital_word_count','noun_count']])

### Correlation between target column and standard error

In [None]:
# Heatmap of the 2x2 correlation matrix between target and standard_error.
sns.heatmap(read_df[['target','standard_error']].corr())

In [None]:
# Same correlation matrix as the heatmap above, shown as numbers.
read_df[['target','standard_error']].corr()

In [None]:
# Correlation between the target and the (scaled) cleaned-excerpt length.
read_df[['target','excerpt_length']].corr()

In [None]:
# Correlation between the target and the (scaled) punctuation count.
read_df[['target','punct_count']].corr()

In [None]:
# Correlation between the target and the (scaled) upper-case word count.
read_df[['target','capital_word_count']].corr()

In [None]:
# Correlation between the target and the (scaled) noun count.
read_df[['target','noun_count']].corr()

In [None]:
# Display the frame after cleaning and scaling.
read_df

In [None]:
# TF-IDF representation of the cleaned excerpts, capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
transformed_text = vectorizer.fit_transform(read_df['excerpt'])
# Number of terms actually kept. vocabulary_ maps each kept term to its column
# index and exists in every scikit-learn release, whereas get_feature_names()
# was removed in scikit-learn 1.2.
len(vectorizer.vocabulary_)

In [None]:
def createNewDF(transformed_text,vectorizer,read_df):
    """Build the model input frame from TF-IDF features plus numeric features.

    Parameters
    ----------
    transformed_text : sparse matrix
        Output of ``vectorizer.transform`` / ``fit_transform``; one row per
        row of *read_df*. It is densified with ``toarray()``.
    vectorizer : fitted vectorizer
        Supplies the column names for the TF-IDF features.
    read_df : pandas.DataFrame
        Must contain ``excerpt_length``, ``punct_count`` and ``noun_count``.

    Returns
    -------
    pandas.DataFrame
        One column per TF-IDF term followed by the three numeric columns,
        with a fresh 0..n-1 index.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = list(vectorizer.get_feature_names())
    reading_df = pd.DataFrame(transformed_text.toarray(), columns=features)
    # Copy by position, not by index label: reading_df has a fresh RangeIndex,
    # so label alignment would produce NaN for any read_df whose index is not
    # exactly 0..n-1 (e.g. after filtering or shuffling).
    for column in ('excerpt_length', 'punct_count', 'noun_count'):
        reading_df[column] = read_df[column].to_numpy()
    return reading_df

In [None]:
# Training design matrix: TF-IDF columns plus the scaled numeric features.
reading_df = createNewDF(transformed_text,vectorizer,read_df)

In [None]:
# Hold out part of the training data for evaluation. No test_size is given, so
# scikit-learn's default split fraction applies; random_state makes the
# shuffle reproducible.
X_train,X_test,y_train,y_test = train_test_split(reading_df,read_df['target'],random_state=42)

In [None]:
# Candidate regressors, all fitted on the same features in the next cell.
lr = LinearRegression()  # ordinary least squares baseline
rf = RandomForestRegressor()  # library defaults; no random_state, so runs are not reproducible
rd = Ridge(alpha=1.0)  # L2-regularised linear model
ls = Lasso(alpha=10.0)  # L1-regularised linear model
svm = SVR(kernel='rbf')  # support vector regressor; the one evaluated and submitted below

In [None]:
# Fit every candidate model on the training portion of the split.
lr.fit(X_train,y_train)
rf.fit(X_train,y_train)
rd.fit(X_train,y_train)
ls.fit(X_train,y_train)
svm.fit(X_train,y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the SVR model on the held-out rows.
pred = svm.predict(X_test)
# scikit-learn's signature is (y_true, y_pred). MSE itself is symmetric, but
# the documented order keeps this correct if the metric or a sample_weight
# argument is changed later.
mean_squared_error(y_test, pred)

In [None]:
# Load the test CSV; its 'excerpt' and 'id' columns are used below.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [None]:
# Punctuation count on the raw test text, mirroring the training feature.
test_df['punct_count'] = test_df['excerpt'].apply(count_punct)

In [None]:
# Count fully upper-case words with capitalWordCount, the helper used for the
# training frame. Applying count_punct here would put punctuation counts into
# a column that the scaler was fitted on as capital-word counts.
test_df['capital_word_count'] = test_df['excerpt'].apply(capitalWordCount)

In [None]:
# Noun count on the raw test text, mirroring the training feature.
test_df['noun_count'] = test_df['excerpt'].apply(nounCount)

In [None]:
# Clean the test excerpts the same way as the training excerpts; the raw text
# is overwritten, so the count features above must already be computed.
test_df['excerpt'] = test_df['excerpt'].apply(remove_links)

In [None]:
# Character length of the cleaned test excerpt.
test_df['excerpt_length'] = test_df['excerpt'].apply(len)

In [None]:
# Scale the test features with the scaler fitted on the training frame
# (transform only, no refit). The column order matches the fit call.
test_df[['excerpt_length','punct_count','capital_word_count','noun_count']] = scaler.transform(test_df[['excerpt_length','punct_count','capital_word_count','noun_count']])

In [None]:
# Vectorize the test excerpts with the vocabulary learned on the training set
# (transform only), then assemble the same column layout used for training.
ex_submission = vectorizer.transform(test_df['excerpt'])
submission_df = createNewDF(ex_submission,vectorizer,test_df)

In [None]:
# Display the test design matrix for a visual sanity check.
submission_df

In [None]:
# Predict the test targets with four of the fitted models (the Lasso model
# `ls` is not used here). Only pred_svr is written to the submission below.
preds = lr.predict(submission_df)
pred_rf = rf.predict(submission_df)
pred_rd = rd.predict(submission_df)
pred_svr = svm.predict(submission_df)

In [None]:
# Submission frame: test ids paired with the SVR predictions.
submission = pd.DataFrame({'id':test_df['id'],'target':pred_svr})

In [None]:
# Display the submission frame before writing it out.
submission

In [None]:
# Write the submission file; index=False keeps only the 'id' and 'target' columns.
submission.to_csv("submission.csv",index=False)