In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv
/kaggle/input/readabilitypackage/readability-package/LICENSE.txt
/kaggle/input/readabilitypackage/readability-package/setup.py
/kaggle/input/readabilitypackage/readability-package/readability/langdata.py
/kaggle/input/readabilitypackage/readability-package/readability/__init__.py
/kaggle/input/readabilitypackage/readability-package/bin/readability


In [2]:
import pandas as pd
import numpy as np
import os
import random

import matplotlib.pyplot as plt
import seaborn as sns

# import nltk
# from nltk.corpus import stopwords 
# from nltk.tokenize import word_tokenize 
# from nltk import pos_tag, pos_tag_sents
# import string

#!pip install readability
# import sys
# sys.path.append("../input/readabilitypackage/readability-package")
# import readability
# import spacy

import warnings
warnings.filterwarnings('ignore')

## Define feature functions

In [3]:
import sys
sys.path.append("../input/readabilitypackage/readability-package")

In [4]:
%%writefile textfeat.py

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
import string

import readability
import spacy

import numpy as np
import pandas as pd


def readability_measurements(passage: str):
    """
    This function uses the readability library for feature engineering.
    It includes textual statistics, readability scales and metric, and some pos stats
    """
    results = readability.getmeasures(passage, lang='en')
    
    chars_per_word = results['sentence info']['characters_per_word']
    syll_per_word = results['sentence info']['syll_per_word']
    words_per_sent = results['sentence info']['words_per_sentence']
    
    kincaid = results['readability grades']['Kincaid']
    ari = results['readability grades']['ARI']
    coleman_liau = results['readability grades']['Coleman-Liau']
    flesch = results['readability grades']['FleschReadingEase']
    gunning_fog = results['readability grades']['GunningFogIndex']
    lix = results['readability grades']['LIX']
    smog = results['readability grades']['SMOGIndex']
    rix = results['readability grades']['RIX']
    dale_chall = results['readability grades']['DaleChallIndex']
    
    tobeverb = results['word usage']['tobeverb']
    auxverb = results['word usage']['auxverb']
    conjunction = results['word usage']['conjunction']
    pronoun = results['word usage']['pronoun']
    preposition = results['word usage']['preposition']
    nominalization = results['word usage']['nominalization']
    
    pronoun_b = results['sentence beginnings']['pronoun']
    interrogative = results['sentence beginnings']['interrogative']
    article = results['sentence beginnings']['article']
    subordination = results['sentence beginnings']['subordination']
    conjunction_b = results['sentence beginnings']['conjunction']
    preposition_b = results['sentence beginnings']['preposition']

    
    return [chars_per_word, syll_per_word, words_per_sent,
            kincaid, ari, coleman_liau, flesch, gunning_fog, lix, smog, rix, dale_chall,
            tobeverb, auxverb, conjunction, pronoun, preposition, nominalization,
            pronoun_b, interrogative, article, subordination, conjunction_b, preposition_b]


def spacy_features(df: pd.DataFrame):
    """
    This function generates features using spacy en_core_wb_lg
    I learned about this from these resources:
    https://www.kaggle.com/konradb/linear-baseline-with-cv
    https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners
    """
    
    nlp = spacy.load('en_core_web_lg')
    with nlp.disable_pipes():
        vectors = np.array([nlp(text).vector for text in df.excerpt])
        
    return vectors


def get_spacy_col_names():
    names = list()
    for i in range(300):
        names.append(f"spacy_{i}")
        
    return names


def pos_tag_features(passage: str):
    """
    This function counts the number of times different parts of speech occur in an excerpt
    """
    pos_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", 
                "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]
    
    tags = pos_tag(word_tokenize(passage))
    tag_list= list()
    
    for tag in pos_tags:
        tag_list.append(len([i[0] for i in tags if i[1] == tag]))
    
    return tag_list


def generate_other_features(passage: str):
    """
    This function is where I test miscellaneous features
    This is experimental
    """
    # punctuation count
    periods = passage.count(".")
    commas = passage.count(",")
    semis = passage.count(";")
    exclaims = passage.count("!")
    questions = passage.count("?")
    
    # Some other stats
    num_char = len(passage)
    num_words = len(passage.split(" "))
    unique_words = len(set(passage.split(" ") ))
    word_diversity = unique_words/num_words
    
    word_len = [len(w) for w in passage.split(" ")]
    longest_word = np.max(word_len)
    avg_len_word = np.mean(word_len)
    
    return [periods, commas, semis, exclaims, questions,
            num_char, num_words, unique_words, word_diversity,
            longest_word, avg_len_word]


def create_text_feat(df: pd.DataFrame):
    scores_df = pd.DataFrame(
        df["excerpt"].apply(lambda p : readability_measurements(p)).tolist(), 
        columns=[
            "chars_per_word", "syll_per_word", "words_per_sent",
            "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
            "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
            "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"
        ]
    )

    spacy_df = pd.DataFrame(spacy_features(df), columns=get_spacy_col_names())

    pos_df = pd.DataFrame(
        df["excerpt"].apply(lambda p : pos_tag_features(p)).tolist(),
        columns=[
            "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", 
            "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
            "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"
        ]
    )
    
    other_df = pd.DataFrame(
        df["excerpt"].apply(lambda p : generate_other_features(p)).tolist(),
        columns=[
            "periods", "commas", "semis", "exclaims", "questions",
            "num_char", "num_words", "unique_words", "word_diversity",
            "longest_word", "avg_len_word"
        ]
    )
    
    dst_df = pd.concat([scores_df, spacy_df, pos_df, other_df], axis=1).reset_index(drop=True)
    return dst_df

Writing textfeat.py


In [5]:
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv", nrows=50)
train.head()

test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [6]:
%load_ext autoreload
%autoreload 2

from textfeat import create_text_feat

In [7]:
results = create_text_feat(train)

In [8]:
results.head()

Unnamed: 0,chars_per_word,syll_per_word,words_per_sent,kincaid,ari,coleman_liau,flesch,gunning_fog,lix,smog,...,commas,semis,exclaims,questions,num_char,num_words,unique_words,word_diversity,longest_word,avg_len_word
0,4.407821,1.234637,29.833333,10.613715,14.247505,9.124776,72.103887,15.732216,47.710428,12.219544,...,14,0,0,0,992,174,112,0.643678,14,4.706897
1,4.145349,1.197674,28.666667,9.722558,12.427926,7.541115,76.415078,14.257364,43.782946,10.745967,...,24,0,5,2,937,164,123,0.75,15,4.719512
2,4.105882,1.211765,34.0,11.968824,14.908706,7.470958,69.809706,16.894118,49.294118,12.165151,...,17,2,1,0,908,162,124,0.765432,14,4.611111
3,4.365854,1.189024,82.0,30.420488,40.133171,9.508945,23.013537,33.77561,97.243902,10.745967,...,23,2,0,0,909,163,117,0.717791,13,4.582822
4,3.734694,1.027211,147.0,53.861088,69.660408,5.957366,-29.272041,59.888435,151.081633,13.954451,...,13,10,0,0,723,147,51,0.346939,12,3.92517


In [9]:
!cp -r ../input/readabilitypackage/readability-package ./