In [1]:
####### PACKAGES

import numpy as np
import pandas as pd

import os
import sys
import time

from tqdm import tqdm

!pip install '../input/textstat/Pyphen-0.10.0-py3-none-any.whl'
!pip install '../input/textstat/textstat-0.7.0-py3-none-any.whl'
sys.path = ['../input/readability-package'] + sys.path

import readability
import spacy
from textstat import textstat

import re
import en_core_web_sm
import string
import nltk
from nltk.corpus import stopwords

Processing /kaggle/input/textstat/Pyphen-0.10.0-py3-none-any.whl
Installing collected packages: Pyphen
Successfully installed Pyphen-0.10.0
Processing /kaggle/input/textstat/textstat-0.7.0-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.0


In [2]:
####### HELPER FUNCTIONS

# Shared spaCy pipeline used by all parse-tree, POS-tag and entity helpers below.
nlp         = en_core_web_sm.load()
# Tokens treated as separators by the "nonstop" helpers (English stop words and ASCII punctuation).
STOPWORDS   = stopwords.words("english")
PUNCTUATION = list(string.punctuation)
# Coarse POS labels that always get a "mean_<tag>" column (defaulting to 0) in calculate_mean_per_tag.
# "SYM" is appended because spaCy can emit it; without it, "mean_sym" only exists for texts that
# contain a symbol and shows up as NaN for every other row once the rows are put in a DataFrame.
# It is added last so the order of the pre-existing columns does not change.
POS_TAGS    = ["ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","VERB","X","SPACE","SYM"]


def readability_measurements(passage: str):
    """
    Extract a fixed, ordered list of 23 values from `readability.getmeasures`.

    The values are read from four sections of the result ('sentence info',
    'readability grades', 'word usage' and 'sentence beginnings') and returned
    as one flat list, in the order spelled out in `layout` below.
    """
    measures = readability.getmeasures(passage, lang='en')

    # (section, keys) pairs; the order here IS the order of the returned list.
    layout = (
        ('sentence info', ('complex_words', 'long_words')),
        ('readability grades', ('Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase',
                                'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex')),
        ('word usage', ('tobeverb', 'auxverb', 'conjunction',
                        'pronoun', 'preposition', 'nominalization')),
        ('sentence beginnings', ('pronoun', 'interrogative', 'article',
                                 'subordination', 'conjunction', 'preposition')),
    )

    return [measures[section][key] for section, keys in layout for key in keys]




def simplify_punctuation(text):
    """
    Normalise punctuation and whitespace so that "." is the only sentence terminator.

    Adapted from https://github.com/shivam5992/textstat/issues/77
    """
    # The substitutions are applied strictly in this order.
    rewrite_rules = (
        (r"[,:;()\-]", " "),             # commas, colons, semicolons, parentheses, hyphens -> space
        (r"[\.!?]", "."),                # "!" and "?" become "."
        (r"^\s+", ""),                   # drop leading whitespace
        (r"[ ]*(\n|\r\n|\r)[ ]*", " "),  # line breaks (with surrounding spaces) -> one space
        (r"([\.])[\. ]+", "."),          # a "." followed by more dots/spaces collapses to "."
        (r"[ ]*([\.])", ". "),           # every "." becomes ". " with no space in front
        (r"\s+", " "),                   # collapse runs of whitespace
        (r"\s+$", ""),                   # drop trailing whitespace
    )
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)
    return text


def get_mean_parse_tree_depth(text):
    """Average of the per-token depths over all "."-separated pieces of `text`."""
    all_depths = []
    # Each piece is parsed separately by the shared spaCy pipeline.
    for parsed in nlp.pipe(text.split(".")):
        all_depths.extend(get_parse_tree_depths(parsed))
    return np.mean(all_depths)


def get_parse_tree_depths(doc):
    """Return `get_depth(token)` for every token of `doc`, in document order."""
    return list(map(get_depth, doc))


def get_depth(token, depth=0):
    """
    Depth of the deepest descendant of `token`, counted from `depth`.

    A token without children yields `depth` itself; otherwise the result is
    the largest value found by recursing into each child with `depth + 1`.
    """
    return max(
        (get_depth(child, depth + 1) for child in token.children),
        default=depth,
    )


def get_mean_pos_tags(text):
    """
    Mean number of each POS tag per sentence of `text`.

    Tag counts come from the "."-separated pieces of the text, while the
    sentence total used for averaging comes from `textstat.sentence_count`.
    """
    per_tag_counts = make_pos_tag_count_lists(text.split("."))
    return calculate_mean_per_tag(per_tag_counts, textstat.sentence_count(text))


def make_pos_tag_count_lists(sentences):
    """
    Map each POS tag to the list of its counts, one entry per sentence containing it.

    Sentences in which a tag does not occur contribute nothing to that tag's list.
    """
    counts_by_tag = {}
    for parsed in nlp.pipe(sentences):
        for tag, occurrences in get_pos_tag_counts(parsed).items():
            counts_by_tag.setdefault(tag, []).append(occurrences)
    return counts_by_tag


def get_pos_tag_counts(doc):
    """Count how often each `pos_` value occurs among the tokens of `doc`."""
    tally = {}
    for token in doc:
        tag = token.pos_
        tally[tag] = tally.get(tag, 0) + 1
    return tally


def calculate_mean_per_tag(counts, num_sentences):
    """
    Turn per-sentence POS-tag counts into a "mean_<tag>" dictionary.

    Parameters:
        counts: dict mapping a tag to a list of counts (one per sentence that
            contained the tag).
        num_sentences: number of sentences to average over; shorter count
            lists are zero-padded up to this length before averaging.

    Returns:
        dict with one "mean_<tag>" entry (lower-cased tag) for every tag in
        POS_TAGS, defaulting to 0, plus an entry for any other tag present in
        `counts`. Means are rounded to two decimals.

    The lists inside `counts` are not modified.
    """
    mean_pos_tags = {f"mean_{tag.lower()}": 0 for tag in POS_TAGS}
    for tag, per_sentence in counts.items():
        missing = num_sentences - len(per_sentence)
        # Pad a copy: using `+=` here would extend the caller's list in place.
        padded = per_sentence + [0] * missing if missing > 0 else per_sentence
        mean_pos_tags["mean_" + tag.lower()] = round(np.mean(padded), 2)
    return mean_pos_tags


def get_total_ents(text):
    """Number of items in the `ents` of the spaCy document parsed from `text`."""
    parsed = nlp(text)
    entities = parsed.doc.ents
    return len(entities)


def get_mean_nonstop_char_length_word_count(text):
    """
    Mean character length and mean word count of the non-stop-word spans in `text`.

    Spans come from `tokenize_on_stopwords`.

    Returns:
        (mean_chars_per_span, mean_words_per_span). When the text yields no
        span at all (e.g. it is empty or made only of stop words/punctuation)
        the result is (0.0, 0.0) instead of a ZeroDivisionError.
    """
    spans = tokenize_on_stopwords(text)
    if not spans:
        return 0.0, 0.0
    n_spans = len(spans)
    mean_chars = sum(get_num_chars(span) for span in spans) / n_spans
    mean_words = sum(get_num_words(span) for span in spans) / n_spans
    return mean_chars, mean_words


def get_nonstop_proportion(text):
    """
    Fraction of NLTK tokens in `text` that are neither in STOPWORDS nor in PUNCTUATION.

    Returns 0.0 when the text produces no tokens (avoids a ZeroDivisionError).
    Matching is case-sensitive, exactly as the tokens come out of the tokenizer.
    """
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return 0.0
    # Build the exclusion set once instead of concatenating two lists per token.
    excluded = set(STOPWORDS) | set(PUNCTUATION)
    kept = sum(1 for token in tokens if token not in excluded)
    return kept / len(tokens)


def tokenize_on_stopwords(text):
    """
    Split `text` into spans of consecutive tokens that are not stop words or punctuation.

    The text is tokenised with `nltk.word_tokenize`; every token found in
    STOPWORDS or PUNCTUATION acts as a separator. Each span is returned as its
    tokens joined by single spaces. A span that runs up to the end of the text
    is included as well (it used to be dropped when the text did not end with
    a separator token).
    """
    tokens = nltk.word_tokenize(text)
    # Build the separator set once instead of concatenating two lists per token.
    separators = set(STOPWORDS) | set(PUNCTUATION)
    spans = []
    current_span = []
    for token in tokens:
        if token not in separators:
            current_span.append(token)
        elif current_span:
            spans.append(" ".join(current_span))
            current_span = []
    # Flush the last span if the text did not end on a separator.
    if current_span:
        spans.append(" ".join(current_span))
    return spans


def get_num_chars(text):
    """Length of `text` in characters (whitespace and punctuation included)."""
    character_total = len(text)
    return character_total


def get_num_words(text):
    """Number of whitespace-separated words in `text`."""
    words = text.split()
    return len(words)


def get_num_unique_words(text):
    """Number of distinct whitespace-separated words in `text` (case-sensitive)."""
    distinct_words = set(text.split())
    return len(distinct_words)


def get_num_sentences(text):
    """
    Count the sentence terminators (".", "?" and "!") in `text`.

    A text without any terminator is reported as one sentence, so the result
    is never zero.
    """
    terminators = sum(text.count(mark) for mark in (".", "?", "!"))
    return terminators if terminators else 1
    
    
def get_num_semicolons(text):
    """Number of ";" characters in `text`."""
    return text.count(";")


def get_num_quotes(text):
    """Number of double-quote characters in `text`."""
    return text.count('"')


def get_num_punctuation(text):
    """Total occurrences in `text` of the marks . , ; : ! ? and the double quote."""
    total = 0
    for mark in '.,;:!?"':
        total += text.count(mark)
    return total

In [3]:
####### FEATURE GENERATION

def gen_features(text):
    '''
    Compute the feature dictionary for one text.

    The textstat scores are computed on a punctuation-simplified copy of the
    text; every other feature is computed on the raw text.

    Returns:
        dict mapping feature name to value: textstat scores, parse-tree depth,
        raw totals (sentences, words, entities, characters, punctuation,
        unique words, ...), non-stop-word span statistics, ratios derived from
        the totals, and the "mean_<tag>" POS features.

    Fixes relative to the earlier version:
      * "unique_words" now uses get_num_unique_words (it used to repeat the
        total word count).
      * "punctutation_per_word" now divides by the word count (it used to
        divide by the sentence count).
    Feature names, including the "punctutation" spelling, are kept unchanged
    because they become column names of the saved CSV.
    '''

    simplified_text = simplify_punctuation(text)

    features = {
        "flesch_reading_ease":          textstat.flesch_reading_ease(simplified_text),
        "smog_index":                   textstat.smog_index(simplified_text),
        "flesch_kincaid_grade":         textstat.flesch_kincaid_grade(simplified_text),
        "coleman_liau_index":           textstat.coleman_liau_index(simplified_text),
        "automated_readability_index":  textstat.automated_readability_index(simplified_text),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(simplified_text),
        "difficult_words":              textstat.difficult_words(simplified_text),
        "linsear_write_formula":        textstat.linsear_write_formula(simplified_text),
        "gunning_fog2":                 textstat.gunning_fog(simplified_text),
        "text_standard":                textstat.text_standard(simplified_text, float_output = True),
        "mean_parse_tree_depth":        get_mean_parse_tree_depth(text),
        "total_sentences":              get_num_sentences(text),
        "total_words":                  get_num_words(text),
        "total_ents":                   get_total_ents(text),
        "total_chars":                  get_num_chars(text),
        "total_punctutation":           get_num_punctuation(text),
        "unique_words":                 get_num_unique_words(text),
        "nonstop_token_proportion":     get_nonstop_proportion(text),
        "semicolons":                   get_num_semicolons(text),
        "quotes":                       get_num_quotes(text),
    }

    # Mean character length / word count of the spans between stop words.
    nonstops = get_mean_nonstop_char_length_word_count(text)
    features['nonstop_char_count'] = nonstops[0]
    features['nonstop_word_count'] = nonstops[1]

    # Ratios derived from the raw totals above.
    features['words_per_sentence']    = features['total_words']        / features['total_sentences']
    features['ents_per_sentence']     = features['total_ents']         / features['total_sentences']
    features['chars_per_sentence']    = features['total_chars']        / features['total_sentences']
    features['chars_per_word']        = features['total_chars']        / features['total_words']
    features['punctutation_per_word'] = features['total_punctutation'] / features['total_words']

    features.update(get_mean_pos_tags(text))

    return features


def add_features(data):
    """
    Run `gen_features` on every text in `data` (with a tqdm progress bar).

    Returns a DataFrame with one row per text and one column per feature.
    """
    rows = [gen_features(text) for text in tqdm(data)]
    return pd.DataFrame(rows)


def TF_IDF_W2V(text):
    '''Compute one TF-IDF-weighted average word vector per item of `text`.

    A TfidfVectorizer is fitted on `text` to obtain an IDF value per
    vocabulary word. For every sentence, each whitespace-separated word that
    is present in both `glove_words` and the fitted vocabulary contributes
    its vector from `word2vec_model`, weighted by
    idf * (occurrences of the word in the sentence / words in the sentence).
    The weighted sum is divided by the total weight when that is non-zero.

    Returns a list with one length-300 numpy array per sentence (all zeros if
    no word qualified).

    NOTE(review): `TfidfVectorizer`, `glove_words` and `word2vec_model` are not
    defined or imported in the visible part of this notebook; they have to be
    provided before this function is called. The vectors are presumably
    300-dimensional to match `np.zeros(300)` - confirm against the embedding.
    '''
    #Load TF-IDF from sklearn
    TFIDF_model = TfidfVectorizer()
    #fit on text
    TFIDF_model.fit(text)
    # Newer scikit-learn only provides get_feature_names_out(); older releases
    # only provide get_feature_names(). Use whichever the fitted model has.
    get_names = getattr(TFIDF_model, "get_feature_names_out", None) or TFIDF_model.get_feature_names
    feature_names = list(get_names())
    #create dictionary with word as key
    #and idf as value
    dictionary = dict(zip(feature_names, list(TFIDF_model.idf_)))
    #apply set as we need unique features
    TFIDF_words = set(feature_names)
    #create list which stores TFIDF_W2V
    TFIDF_W2V_vectors = []
    for sentence in text:
        words = sentence.split()
        # Whole-word occurrence counts. `sentence.count(word)` counted
        # substrings, so e.g. "he" was also counted inside "the".
        occurrences = {}
        for word in words:
            occurrences[word] = occurrences.get(word, 0) + 1
        #create empty vector to store result
        vector = np.zeros(300)
        #sum of the TF-IDF weights of the words that were used
        TFIDF_weight = 0
        for word in words:
            #if word exist in glove_words and TFIDF_words
            if (word in glove_words) and (word in TFIDF_words):
                #get its vector from the embedding model
                vec = word2vec_model[word]
                #calculate TF-IDF for each word
                TFIDF = dictionary[word] * (occurrences[word] / len(words))
                #calculate TF-IDF weighted W2V
                vector += (vec * TFIDF)
                TFIDF_weight += TFIDF

        if TFIDF_weight != 0:
            vector /= TFIDF_weight
        TFIDF_W2V_vectors.append(vector)
    return TFIDF_W2V_vectors


def clean_text(text):
    """Lower-case `str(text)`, delete newline characters and strip ASCII punctuation."""
    lowered = str(text).lower().replace('\n', '')
    # Deleting via a translation table removes exactly the characters of string.punctuation.
    return lowered.translate(str.maketrans('', '', string.punctuation))


def tf_idf_features(df: pd.DataFrame):
    """
    Clean the 'excerpt' column of `df` and return its TF_IDF_W2V vectors as a numpy array.
    """
    cleaned_excerpts = df['excerpt'].apply(clean_text)
    return np.array(TF_IDF_W2V(cleaned_excerpts))


def get_tf_idf_col_names():
    """Column names "tf_idf_0" ... "tf_idf_299", one per vector component."""
    return [f"tf_idf_{i}" for i in range(300)]

In [4]:
####### DATA IMPORT

# Load the training CSV; its `excerpt` column holds the texts that the cells below turn into features.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [5]:
####### COMPUTING FEATURES [1]

# One row of gen_features output per excerpt. add_features already returns a
# DataFrame, so it is not wrapped in pd.DataFrame(...) a second time.
train_features = add_features(train.excerpt.to_list())

100%|██████████| 2834/2834 [18:17<00:00,  2.58it/s]


In [6]:
###### COMPUTING FEATURES [2]

# One row per excerpt; the column names follow the order of the list returned by readability_measurements.
train_features2 = pd.DataFrame(
    [readability_measurements(passage) for passage in train.excerpt],
    columns = [
        "complex_words", "long_words",
        "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
        "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
        "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b",
    ],
)

In [7]:
##### SAVE FEATURES

# Put the original columns and both feature sets side by side, then persist the result.
df_train = pd.concat(
    [train, train_features, train_features2],
    axis = "columns",
)
df_train.to_csv('features_train.csv', index = False)
print(df_train.shape)
df_train.head()

(2834, 74)


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,...,conjunction,pronoun,preposition,nominalization,pronoun_b,interrogative,article,subordination,conjunction_b,preposition_b
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,80.11,8.6,6.2,7.65,...,11,8,23,1,2,1,2,0,0,1
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,71.24,8.8,7.5,6.9,...,7,30,22,0,0,0,0,0,0,1
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,75.54,10.1,7.9,6.73,...,11,24,18,0,0,0,0,1,1,1
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,71.41,6.7,11.6,8.08,...,15,12,26,0,0,0,0,0,1,0
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,83.12,5.7,9.2,4.25,...,10,6,10,0,0,0,0,0,0,0
