# Generate Useful Linguistic Features & Conduct Experimentation to Find the Most Relevants

In [98]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tokenizers import (
    decoders,
    models,
    pre_tokenizers,
    normalizers,
    processors,
    trainers,
    Tokenizer
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

import gc

import spacy
from collections import Counter

import nltk 
#nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import textstat
from spellchecker import SpellChecker

from sentence_transformers import SentenceTransformer, models
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn import preprocessing

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, cohen_kappa_score, make_scorer

import torch

tqdm.pandas()

nlp = spacy.load("en_core_web_sm")




In [53]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/kevinmg96/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Loading Essay Score Dataset

In [2]:
data_path =  "data/train.csv"
df = pd.read_csv(data_path)

data_path =  "data/test.csv"
df_test = pd.read_csv(data_path)

X = df.drop(columns=["score","essay_id"])
y = df["score"].astype(float) #- 1
test = df_test.drop(columns=["essay_id"])

# Linguistic Feature Engineering

## Textstat Features

In [3]:
def textstat_features(text):
    features = {}
    features['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    features['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    features['smog_index'] = textstat.smog_index(text)
    features['coleman_liau_index'] = textstat.coleman_liau_index(text)
    features['automated_readability_index'] = textstat.automated_readability_index(text)
    features['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)
    features['difficult_words'] = textstat.difficult_words(text)
    features['linsear_write_formula'] = textstat.linsear_write_formula(text)
    features['gunning_fog'] = textstat.gunning_fog(text)
    features['text_standard'] = textstat.text_standard(text, float_output=True)
    features['spache_readability'] = textstat.spache_readability(text)
    features['mcalpine_eflaw'] = textstat.mcalpine_eflaw(text)
    features['reading_time'] = textstat.reading_time(text)
    features['syllable_count'] = textstat.syllable_count(text)
    features['lexicon_count'] = textstat.lexicon_count(text)
    features['monosyllabcount'] = textstat.monosyllabcount(text)

    return features

X['textstat_features'] = X['full_text'].apply(textstat_features)
X_textstat = pd.DataFrame(X['textstat_features'].tolist())

X_textstat.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,spache_readability,mcalpine_eflaw,reading_time,syllable_count,lexicon_count,monosyllabcount
0,57.98,14.7,11.7,8.19,18.3,8.74,60,13.0,17.33,9.0,7.28,54.5,31.97,634,498,404
1,87.55,5.4,6.8,4.99,6.2,6.31,24,6.714286,7.48,7.0,3.92,25.7,19.6,398,332,275
2,65.15,9.9,11.5,8.94,11.6,7.24,67,15.5,11.49,12.0,5.12,32.6,36.96,767,550,417
3,58.32,10.4,13.2,10.97,12.9,8.5,78,15.75,11.91,11.0,5.34,29.6,33.01,685,448,291
4,54.66,11.8,13.0,10.57,13.9,7.79,55,19.666667,12.64,13.0,5.61,35.7,26.71,562,373,241


In [4]:
X_textstat.describe()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,spache_readability,mcalpine_eflaw,reading_time,syllable_count,lexicon_count,monosyllabcount
count,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0,17307.0
mean,70.125112,8.592529,10.120847,8.307653,10.28121,7.548267,45.97163,10.243665,10.245321,9.228462,4.823467,29.08834,24.658999,505.493153,367.490148,268.557347
std,15.50972,5.258949,1.851764,1.797433,6.675273,1.057301,23.739238,4.990244,5.358692,4.519681,1.898329,19.052728,10.522834,216.412801,150.192568,108.352292
min,-628.88,1.3,0.0,1.19,0.4,1.24,3.0,2.6,3.67,0.0,2.37,10.4,8.24,175.0,150.0,83.0
25%,63.83,6.6,9.0,7.07,7.8,6.89,29.0,7.142857,8.24,7.0,4.11,22.5,16.63,341.0,253.0,186.0
50%,71.04,8.0,10.1,8.3,9.6,7.45,41.0,8.833333,9.62,9.0,4.61,26.4,22.93,469.0,344.0,251.0
75%,78.38,9.7,11.2,9.57,11.7,8.09,58.0,12.0,11.24,11.0,5.18,31.5,30.41,622.0,451.0,329.0
max,103.73,278.6,22.1,15.21,355.6,41.07,219.0,67.0,286.67,279.0,102.16,1043.0,99.36,2123.0,1656.0,1368.0


## Linguistic Features

In [4]:
def extract_linguistic_features(text):

    doc = nlp(text)
    features = {}

    # NER Features
    entity_counts = {"GPE": 0, "PERCENT": 0, "NORP": 0, "ORG": 0, "CARDINAL": 0, "MONEY": 0, "DATE": 0, 
                    "LOC": 0, "PERSON": 0, "QUANTITY": 0, "EVENT": 0, "ORDINAL": 0, "WORK_OF_ART": 0, 
                    "LAW": 0, "PRODUCT": 0, "TIME": 0, "FAC": 0, "LANGUAGE": 0}
    for entity in doc.ents:
        if entity.label_ in entity_counts:
            entity_counts[entity.label_] += 1
    features['NER_Features'] = entity_counts

    # POS Features
    pos_counts = {"ADJ": 0, "NOUN": 0, "VERB": 0, "SCONJ": 0, "PRON": 0, "PUNCT": 0, "DET": 0, "AUX": 0, 
                "PART": 0, "ADP": 0, "SPACE": 0, "CCONJ": 0, "PROPN": 0, "NUM": 0, "ADV": 0, 
                "SYM": 0, "INTJ": 0, "X": 0}
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1
    features['POS_Features'] = pos_counts

    # tag Features
    tags = {"RB": 0, "-RRB-": 0, "PRP$": 0, "JJ": 0, "TO": 0, "VBP": 0, "JJS": 0, "DT": 0, "''": 0, "UH": 0, "RBS": 0, "WRB": 0, ".": 0, 
        "HYPH": 0, "XX": 0, "``": 0, "SYM": 0, "VB": 0, "VBN": 0, "WP": 0, "CC": 0, "LS": 0, "POS": 0, "NN": 0, ",": 0, "NNPS": 0,
          "RP": 0, ":": 0, "$": 0, "PDT": 0, "VBZ": 0, "VBD": 0, "JJR": 0, "-LRB-": 0, "IN": 0, "RBR": 0, "WDT": 0, "EX": 0, "MD": 0,
            "_SP": 0, "NNP": 0, "CD": 0, "VBG": 0, "NNS": 0, "PRP": 0}
    
    for token in doc:
        if token.tag_ in tags:
            tags[token.tag_] += 1
    features['tag_Features'] = tags

    # tense features
    tenses = [i.morph.get("Tense") for i in doc]
    tenses = [i[0] for i in tenses if i]
    tense_counts = Counter(tenses)
    features['past_tense_ratio'] = tense_counts.get("Past", 0) / (tense_counts.get("Pres", 0) + tense_counts.get("Past", 0) + 1e-5)
    features['present_tense_ratio'] = tense_counts.get("Pres", 0) / (tense_counts.get("Pres", 0) + tense_counts.get("Past", 0) + 1e-5)
    
    
    # len features

    features['word_count'] = len(doc)
    features['sentence_count'] = len([sentence for sentence in doc.sents])
    features['words_per_sentence'] = features['word_count'] / features['sentence_count']
    features['std_words_per_sentence'] = np.std([len(sentence) for sentence in doc.sents])

    features['unique_words'] = len(set([token.text for token in doc]))
    features['lexical_diversity'] = features['unique_words'] / features['word_count']

    paragraph = text.split('\n\n')

    features['paragraph_count'] = len(paragraph)

    features['avg_chars_by_paragraph'] = np.mean([len(paragraph) for paragraph in paragraph])
    features['avg_words_by_paragraph'] = np.mean([len(nltk.word_tokenize(paragraph)) for paragraph in paragraph])
    features['avg_sentences_by_paragraph'] = np.mean([len(nltk.sent_tokenize(paragraph)) for paragraph in paragraph]) 

    # sentiment features
    analyzer = SentimentIntensityAnalyzer()
    sentences = nltk.sent_tokenize(text)

    compound_scores, negative_scores, positive_scores, neutral_scores = [], [], [], []
    for sentence in sentences:
        scores = analyzer.polarity_scores(sentence)
        compound_scores.append(scores['compound'])
        negative_scores.append(scores['neg'])
        positive_scores.append(scores['pos'])
        neutral_scores.append(scores['neu'])

    features["mean_compound"] = np.mean(compound_scores)
    features["mean_negative"] = np.mean(negative_scores)
    features["mean_positive"] = np.mean(positive_scores)
    features["mean_neutral"] = np.mean(neutral_scores)

    features["std_compound"] = np.std(compound_scores)
    features["std_negative"] = np.std(negative_scores)
    features["std_positive"] = np.std(positive_scores)
    features["std_neutral"] = np.std(neutral_scores)

    return features

In [5]:
X['linguistic_features'] = X['full_text'].progress_apply(extract_linguistic_features)
X_linguistic = pd.json_normalize(X['linguistic_features'])

X_linguistic.head()

  0%|          | 0/17307 [00:00<?, ?it/s]

Unnamed: 0,past_tense_ratio,present_tense_ratio,word_count,sentence_count,words_per_sentence,std_words_per_sentence,unique_words,lexical_diversity,paragraph_count,avg_chars_by_paragraph,...,tag_Features.RBR,tag_Features.WDT,tag_Features.EX,tag_Features.MD,tag_Features._SP,tag_Features.NNP,tag_Features.CD,tag_Features.VBG,tag_Features.NNS,tag_Features.PRP
0,0.275362,0.724638,552,13,42.461538,34.078225,248,0.449275,1,2677.0,...,1,4,2,10,6,26,12,6,35,29
1,0.160714,0.839286,377,20,18.85,11.127781,169,0.448276,5,332.2,...,0,6,5,10,4,10,2,10,15,28
2,0.15873,0.84127,611,25,24.44,8.168623,246,0.402619,4,767.75,...,2,9,4,19,4,0,2,11,39,20
3,0.090909,0.909091,516,21,24.571429,10.135141,242,0.468992,5,538.6,...,1,2,2,11,4,20,6,14,27,19
4,0.183673,0.816326,428,16,26.75,18.122845,159,0.371495,6,366.333333,...,3,3,0,3,7,31,4,10,13,10


In [57]:
nlp(X["full_text"][0])

Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won't see a car in Vauban's streets because they are completely "car free" but If some that lives in VAUBAN that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $40,000 along with a home. The vauban people completed this in 2006 ,they said that this an example of a growing trend in Europe,The untile states and some where else ar

In [42]:
len(X["full_text"][1].split('\n\n'))

5

In [59]:
X["full_text"][0]

'Many people have car where they live. The thing they don\'t know is that when you use a car alot of thing can happen\xa0like you can get in accidet or\xa0the smoke that the car has is bad to breath\xa0on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban\'s families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden\xa0on the outskirts of freiburd that near the French and Swiss borders. You probaly won\'t see a car in Vauban\'s streets because they are completely "car free" but\xa0If some that lives in VAUBAN that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $40,000 along with a home. The vauban people completed this in 2006 ,they said that this an example of a growing trend in Europe,The untile states an

In [61]:
doc = nlp(X["full_text"][0])

In [6]:
tag_cols = [col for col in X_linguistic.columns if col.startswith('tag')]
ent_cols = [col for col in X_linguistic.columns if col.startswith('NER')]
pos_cols = [col for col in X_linguistic.columns if col.startswith('POS')]

In [None]:
#pendiente de agregar...
entity_cnt = pd.Series([len(nlp(essay).ents) for essay in X["full_text"]])

In [7]:
for col in tag_cols:
    X_linguistic[f"{col}_ratio"] = X_linguistic[col] / X_linguistic['word_count']
  
#for col in pos_cols:
#    X_linguistic[f"{col}_ratio"] = X_linguistic[col] / X_linguistic['word_count']  

In [9]:
X_linguistic.head()

Unnamed: 0,past_tense_ratio,present_tense_ratio,word_count,sentence_count,words_per_sentence,std_words_per_sentence,unique_words,lexical_diversity,paragraph_count,avg_chars_by_paragraph,...,tag_Features.RBR_ratio,tag_Features.WDT_ratio,tag_Features.EX_ratio,tag_Features.MD_ratio,tag_Features._SP_ratio,tag_Features.NNP_ratio,tag_Features.CD_ratio,tag_Features.VBG_ratio,tag_Features.NNS_ratio,tag_Features.PRP_ratio
0,0.275362,0.724638,552,13,42.461538,34.078225,248,0.449275,1,2677.0,...,0.001812,0.007246,0.003623,0.018116,0.01087,0.047101,0.021739,0.01087,0.063406,0.052536
1,0.160714,0.839286,377,20,18.85,11.127781,169,0.448276,5,332.2,...,0.0,0.015915,0.013263,0.026525,0.01061,0.026525,0.005305,0.026525,0.039788,0.074271
2,0.15873,0.84127,611,25,24.44,8.168623,246,0.402619,4,767.75,...,0.003273,0.01473,0.006547,0.031097,0.006547,0.0,0.003273,0.018003,0.06383,0.032733
3,0.090909,0.909091,516,21,24.571429,10.135141,242,0.468992,5,538.6,...,0.001938,0.003876,0.003876,0.021318,0.007752,0.03876,0.011628,0.027132,0.052326,0.036822
4,0.183673,0.816326,428,16,26.75,18.122845,159,0.371495,6,366.333333,...,0.007009,0.007009,0.0,0.007009,0.016355,0.07243,0.009346,0.023364,0.030374,0.023364


In [8]:
X_merged_features = pd.concat([X_textstat, X_linguistic], axis=1)

In [11]:
X_merged_features.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,...,tag_Features.RBR_ratio,tag_Features.WDT_ratio,tag_Features.EX_ratio,tag_Features.MD_ratio,tag_Features._SP_ratio,tag_Features.NNP_ratio,tag_Features.CD_ratio,tag_Features.VBG_ratio,tag_Features.NNS_ratio,tag_Features.PRP_ratio
0,57.98,14.7,11.7,8.19,18.3,8.74,60,13.0,17.33,9.0,...,0.001812,0.007246,0.003623,0.018116,0.01087,0.047101,0.021739,0.01087,0.063406,0.052536
1,87.55,5.4,6.8,4.99,6.2,6.31,24,6.714286,7.48,7.0,...,0.0,0.015915,0.013263,0.026525,0.01061,0.026525,0.005305,0.026525,0.039788,0.074271
2,65.15,9.9,11.5,8.94,11.6,7.24,67,15.5,11.49,12.0,...,0.003273,0.01473,0.006547,0.031097,0.006547,0.0,0.003273,0.018003,0.06383,0.032733
3,58.32,10.4,13.2,10.97,12.9,8.5,78,15.75,11.91,11.0,...,0.001938,0.003876,0.003876,0.021318,0.007752,0.03876,0.011628,0.027132,0.052326,0.036822
4,54.66,11.8,13.0,10.57,13.9,7.79,55,19.666667,12.64,13.0,...,0.007009,0.007009,0.0,0.007009,0.016355,0.07243,0.009346,0.023364,0.030374,0.023364


In [9]:
#Drop tag Features unnormalized
tag_cols_to_drop = [col for col in X_linguistic.columns if (col.startswith("tag") and not col.endswith("ratio"))]
X_merged_features.drop(tag_cols_to_drop,axis=1,inplace=True)
X_merged_features.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,...,tag_Features.RBR_ratio,tag_Features.WDT_ratio,tag_Features.EX_ratio,tag_Features.MD_ratio,tag_Features._SP_ratio,tag_Features.NNP_ratio,tag_Features.CD_ratio,tag_Features.VBG_ratio,tag_Features.NNS_ratio,tag_Features.PRP_ratio
0,57.98,14.7,11.7,8.19,18.3,8.74,60,13.0,17.33,9.0,...,0.001812,0.007246,0.003623,0.018116,0.01087,0.047101,0.021739,0.01087,0.063406,0.052536
1,87.55,5.4,6.8,4.99,6.2,6.31,24,6.714286,7.48,7.0,...,0.0,0.015915,0.013263,0.026525,0.01061,0.026525,0.005305,0.026525,0.039788,0.074271
2,65.15,9.9,11.5,8.94,11.6,7.24,67,15.5,11.49,12.0,...,0.003273,0.01473,0.006547,0.031097,0.006547,0.0,0.003273,0.018003,0.06383,0.032733
3,58.32,10.4,13.2,10.97,12.9,8.5,78,15.75,11.91,11.0,...,0.001938,0.003876,0.003876,0.021318,0.007752,0.03876,0.011628,0.027132,0.052326,0.036822
4,54.66,11.8,13.0,10.57,13.9,7.79,55,19.666667,12.64,13.0,...,0.007009,0.007009,0.0,0.007009,0.016355,0.07243,0.009346,0.023364,0.030374,0.023364


## Mispelled Words Counter

In [10]:
spell = SpellChecker()

def spell_check(text):
    words = nltk.word_tokenize(text)
    misspelled = spell.unknown(words)

    mispelled_count = len(misspelled)
    misspelled_ratio = mispelled_count / len(words)

    return mispelled_count, misspelled_ratio

X['spell_check_features'] = X['full_text'].progress_apply(spell_check)

X_spell_check_df = pd.DataFrame(X['spell_check_features'].tolist(), columns=['misspelled_count', 'misspelled_ratio'])

X_spell_check_df.head()

  0%|          | 0/17307 [00:00<?, ?it/s]

Unnamed: 0,misspelled_count,misspelled_ratio
0,30,0.055046
1,13,0.03504
2,12,0.019835
3,15,0.029354
4,15,0.035885


In [11]:
X_merged_features = pd.concat([X_merged_features,X_spell_check_df ],axis=1)

In [15]:
X_merged_features.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,...,tag_Features.EX_ratio,tag_Features.MD_ratio,tag_Features._SP_ratio,tag_Features.NNP_ratio,tag_Features.CD_ratio,tag_Features.VBG_ratio,tag_Features.NNS_ratio,tag_Features.PRP_ratio,misspelled_count,misspelled_ratio
0,57.98,14.7,11.7,8.19,18.3,8.74,60,13.0,17.33,9.0,...,0.003623,0.018116,0.01087,0.047101,0.021739,0.01087,0.063406,0.052536,30,0.055046
1,87.55,5.4,6.8,4.99,6.2,6.31,24,6.714286,7.48,7.0,...,0.013263,0.026525,0.01061,0.026525,0.005305,0.026525,0.039788,0.074271,13,0.03504
2,65.15,9.9,11.5,8.94,11.6,7.24,67,15.5,11.49,12.0,...,0.006547,0.031097,0.006547,0.0,0.003273,0.018003,0.06383,0.032733,12,0.019835
3,58.32,10.4,13.2,10.97,12.9,8.5,78,15.75,11.91,11.0,...,0.003876,0.021318,0.007752,0.03876,0.011628,0.027132,0.052326,0.036822,15,0.029354
4,54.66,11.8,13.0,10.57,13.9,7.79,55,19.666667,12.64,13.0,...,0.0,0.007009,0.016355,0.07243,0.009346,0.023364,0.030374,0.023364,15,0.035885


# DataFrame Preparation For Training

Prepare the DataFrame using standard normalization.
Columns to normalize : TextStat Columns, NER_features, POS_features

In [12]:
cols_ner_pos = [col for col in X_merged_features.columns if (col.startswith("NER") or col.startswith("POS"))]
cols_to_revise = cols_ner_pos + X_textstat.columns.tolist() +  ["word_count", "sentence_count", "words_per_sentence", "unique_words", "paragraph_count", "misspelled_count"]
X_merged_features.loc[:,cols_to_revise]

Unnamed: 0,NER_Features.GPE,NER_Features.PERCENT,NER_Features.NORP,NER_Features.ORG,NER_Features.CARDINAL,NER_Features.MONEY,NER_Features.DATE,NER_Features.LOC,NER_Features.PERSON,NER_Features.QUANTITY,...,reading_time,syllable_count,lexicon_count,monosyllabcount,word_count,sentence_count,words_per_sentence,unique_words,paragraph_count,misspelled_count
0,4,5,2,3,2,1,3,2,3,1,...,31.97,634,498,404,552,13,42.461538,248,1,30
1,2,0,1,5,1,0,0,2,0,0,...,19.60,398,332,275,377,20,18.850000,169,5,13
2,0,0,0,0,1,0,1,0,0,0,...,36.96,767,550,417,611,25,24.440000,246,4,12
3,0,1,3,2,2,0,0,15,1,3,...,33.01,685,448,291,516,21,24.571429,242,5,15
4,0,0,2,14,2,0,1,0,0,0,...,26.71,562,373,241,428,16,26.750000,159,6,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17302,0,0,0,0,0,0,0,1,0,0,...,10.25,212,155,115,170,9,18.888889,99,3,14
17303,0,5,0,2,2,0,4,0,1,0,...,40.57,845,579,386,650,26,25.000000,263,6,22
17304,0,0,0,1,0,0,1,0,1,0,...,13.31,262,215,177,237,15,15.800000,115,3,4
17305,0,1,0,0,2,0,1,17,1,0,...,17.58,339,230,149,270,11,24.545455,143,1,12


In [13]:
X_merged_features[cols_to_revise] = preprocessing.StandardScaler().fit_transform(X_merged_features[cols_to_revise].to_numpy())
X_merged_features.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,...,tag_Features.EX_ratio,tag_Features.MD_ratio,tag_Features._SP_ratio,tag_Features.NNP_ratio,tag_Features.CD_ratio,tag_Features.VBG_ratio,tag_Features.NNS_ratio,tag_Features.PRP_ratio,misspelled_count,misspelled_ratio
0,-0.783087,1.161382,0.852808,-0.065458,1.201302,1.127179,0.590953,0.552361,1.322129,-0.05055,...,0.003623,0.018116,0.01087,0.047101,0.021739,0.01087,0.063406,0.052536,1.77199,0.055046
1,1.123514,-0.607084,-1.793394,-1.845826,-0.61141,-1.171193,-0.925567,-0.707276,-0.516059,-0.493072,...,0.013263,0.026525,0.01061,0.026525,0.005305,0.026525,0.039788,0.074271,-0.264466,0.03504
2,-0.320783,0.248625,0.7448,0.351816,0.197569,-0.291569,0.885832,1.053353,0.23228,0.613233,...,0.006547,0.031097,0.006547,0.0,0.003273,0.018003,0.06383,0.032733,-0.384258,0.019835
3,-0.761165,0.343704,1.66287,1.481237,0.392323,0.900179,1.349213,1.103452,0.310659,0.391972,...,0.003876,0.021318,0.007752,0.03876,0.011628,0.027132,0.052326,0.036822,-0.024883,0.029354
4,-0.997153,0.609925,1.554861,1.258691,0.542134,0.228639,0.380325,1.888339,0.44689,0.834494,...,0.0,0.007009,0.016355,0.07243,0.009346,0.023364,0.030374,0.023364,-0.024883,0.035885


# Train simple Linear Regression varying the number of relevant features to keep 

In [38]:
X_merged_features["score"] = y #+ 1

In [175]:
def quadratic_weighted_kappa(y_true, y_pred):
    y_pred = (y_pred).clip(0, 5).round()
    qwk = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return {'QWK' : qwk}

In [45]:
corr = X_merged_features.corr()["score"].drop("score",axis=0)

In [46]:
corr

syllable_count                  0.706578
reading_time                    0.705700
word_count                      0.692587
lexicon_count                   0.691308
POS_Features.DET                0.652708
                                  ...   
dale_chall_readability_score   -0.149772
tag_Features.VBD_ratio         -0.173703
tag_Features.PRP_ratio         -0.237266
misspelled_ratio               -0.369662
lexical_diversity              -0.550713
Name: score, Length: 119, dtype: float64

In [75]:
corr_df = pd.DataFrame({
    "OG" : corr,
    "ABS" : corr.abs()    
}).sort_values(["ABS"],ascending=False)

In [79]:
corr_df.head()

Unnamed: 0,OG,ABS
syllable_count,0.706578,0.706578
reading_time,0.7057,0.7057
word_count,0.692587,0.692587
lexicon_count,0.691308,0.691308
POS_Features.DET,0.652708,0.652708


In [51]:
seed = 42

## Train ML models with features: with correlation >= 0.5 | <= -0.5

In [19]:
kappa_scorer = make_scorer(cohen_kappa_score,labels=np.arange(1,7) ,weights="quadratic")

In [99]:
features_1 = corr_df[corr_df["ABS"] >= 0.5]["OG"].index.to_list()
rf_clf = RandomForestClassifier(random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_1], y, cv=10,scoring = kappa_scorer)
print(f"Random Forest scores : {scores}")
print(f"Random Forest score mean: {scores.mean()}, std dev : {scores.std()}")


svm_clf = SVC(kernel='poly', C=3, random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_1], y, cv=10,scoring = kappa_scorer)
print(f"SVM Forest scores : {scores}")
print(f"SVM Forest score mean: {scores.mean()}, std dev : {scores.std()}")

Random Forest scores : [0.6771532  0.71765826 0.70629679 0.70423197 0.6801317  0.68346955
 0.6978672  0.72475472 0.71194403 0.69336201]
Random Forest score mean: 0.6996869429059611, std dev : 0.015370238386737956
SVM Forest scores : [0.6771532  0.71765826 0.70629679 0.70423197 0.6801317  0.68346955
 0.6978672  0.72475472 0.71194403 0.69336201]
SVM Forest score mean: 0.6996869429059611, std dev : 0.015370238386737956


## Train ML models with features: with correlation >= 0.3 | <= -0.3

In [100]:
features_2 = corr_df[corr_df["ABS"] >= 0.3]["OG"].index.to_list()
rf_clf = RandomForestClassifier(random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_2], y, cv=10,scoring = kappa_scorer)
print(f"Random Forest scores : {scores}")
print(f"Random Forest score mean: {scores.mean()}, std dev : {scores.std()}")


svm_clf = SVC(kernel='poly', C=3, random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_2], y, cv=10,scoring = kappa_scorer)
print(f"SVM Forest scores : {scores}")
print(f"SVM Forest score mean: {scores.mean()}, std dev : {scores.std()}")

Random Forest scores : [0.69282582 0.7245832  0.73052568 0.71716993 0.69156261 0.69029627
 0.71480986 0.73164391 0.71355912 0.697185  ]
Random Forest score mean: 0.7104161421145841, std dev : 0.015411883008933161
SVM Forest scores : [0.69282582 0.7245832  0.73052568 0.71716993 0.69156261 0.69029627
 0.71480986 0.73164391 0.71355912 0.697185  ]
SVM Forest score mean: 0.7104161421145841, std dev : 0.015411883008933161


## Train ML models with features: with correlation >= 0.1 | <= -0.1

In [101]:
features_3 = corr_df[corr_df["ABS"] >= 0.1]["OG"].index.to_list()
rf_clf = RandomForestClassifier(random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_3], y, cv=10,scoring = kappa_scorer)
print(f"Random Forest scores : {scores}")
print(f"Random Forest score mean: {scores.mean()}, std dev : {scores.std()}")


svm_clf = SVC(kernel='poly', C=3, random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_3], y, cv=10,scoring = kappa_scorer)
print(f"SVM Forest scores : {scores}")
print(f"SVM Forest score mean: {scores.mean()}, std dev : {scores.std()}")

Random Forest scores : [0.70278127 0.74299517 0.73761263 0.73063994 0.71292133 0.70348175
 0.7318437  0.73512836 0.72886109 0.71531836]
Random Forest score mean: 0.7241583624130724, std dev : 0.013668551485825705
SVM Forest scores : [0.70278127 0.74299517 0.73761263 0.73063994 0.71292133 0.70348175
 0.7318437  0.73512836 0.72886109 0.71531836]
SVM Forest score mean: 0.7241583624130724, std dev : 0.013668551485825705


## Train ML models with features: with correlation >= 0.0 | <= -0.0

In [102]:
features_4 = corr_df[corr_df["ABS"] >= 0.0]["OG"].index.to_list()
rf_clf = RandomForestClassifier(random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_4], y, cv=10,scoring = kappa_scorer)
print(f"Random Forest scores : {scores}")
print(f"Random Forest score mean: {scores.mean()}, std dev : {scores.std()}")


svm_clf = SVC(kernel='poly', C=3, random_state=seed)
scores = cross_val_score(rf_clf, X_merged_features[features_4], y, cv=10,scoring = kappa_scorer)
print(f"SVM Forest scores : {scores}")
print(f"SVM Forest score mean: {scores.mean()}, std dev : {scores.std()}")

Random Forest scores : [0.69195473 0.74074923 0.73493334 0.72286607 0.71283003 0.70064653
 0.73517278 0.74015081 0.73272443 0.71631946]
Random Forest score mean: 0.7228347404578817, std dev : 0.01614193399821272
SVM Forest scores : [0.69195473 0.74074923 0.73493334 0.72286607 0.71283003 0.70064653
 0.73517278 0.74015081 0.73272443 0.71631946]
SVM Forest score mean: 0.7228347404578817, std dev : 0.01614193399821272


In [196]:
def quadratic_weighted_kappa(y_true, y_pred):
    y_true = (y_true + a).clip(0, 5).round()
    y_pred = (y_pred + a).clip(0, 5).round()
    qwk = cohen_kappa_score(y_true, y_pred,labels= np.arange(6), weights="quadratic")
    return 'QWK', qwk, True


# metric and objective based on public notebooks

def qwk_obj(y_true, y_pred):
    labels = y_true + a
    preds = y_pred + a
    preds = preds.clip(1, 6)
    f = 1/2*np.sum((preds-labels)**2)
    g = 1/2*np.sum((preds-a)**2+b)
    df = preds - labels
    dg = preds - a
    grad = (df/g - f*dg/g**2)*len(labels)
    hess = np.ones(len(labels))
    return grad, hess
a = 2.998
b = 1.092

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

scores = []


for fold, (train_idx, valid_idx) in enumerate(skf.split(X['full_text'], y)):
    print(f"Fold: {fold}")
    print(f"Train size: {len(train_idx)}")
    print(f"Valid size: {len(valid_idx)}")
    print()

    X_train = X_merged_features[features_1].iloc[train_idx].values
    X_valid = X_merged_features[features_1].iloc[valid_idx].values


    y_train = X_merged_features['score'].values[train_idx]
    y_valid = X_merged_features['score'].values[valid_idx]


    y_train = y_train -a
    y_valid = y_valid -a


 

    model = lgb.LGBMRegressor(
                objective = qwk_obj,
                metrics = 'None',
                learning_rate = 0.01,
                n_estimators=10000,
                random_state=42,
                extra_trees=True,
                class_weight='balanced',
                verbosity = - 1)
    
    callbacks = [lgb.early_stopping(500, verbose=True, first_metric_only=True), lgb.log_evaluation(period=500)]

    
    predictor = model.fit(X_train,
                                  y_train,
                                  eval_names=['train', 'valid'],
                                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                                  eval_metric=quadratic_weighted_kappa,
                                  callbacks=callbacks,)

    valid_preds = predictor.predict(X_valid)

    X.loc[valid_idx, 'valid_1'] = valid_preds + a

    score = quadratic_weighted_kappa(y_valid, valid_preds)
    scores.append(score[1])
    
    print(f"Train QWK: {score}")

print(f"Mean QWK: {np.mean(scores)}")

Fold: 0
Train size: 15576
Valid size: 1731

[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 500 rounds
[500]	train's QWK: 1	valid's QWK: 1
Early stopping, best iteration is:
[285]	train's QWK: 1	valid's QWK: 1
Evaluated only: QWK
Train QWK: ('QWK', 1.0, True)
Fold: 1
Train size: 15576
Valid size: 1731

[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 500 rounds
[500]	train's QWK: 1	valid's QWK: 1
Early stopping, best iteration is:
[299]	train's QWK: 1	valid's QWK: 1
Evaluated only: QWK
Train QWK: ('QWK', 1.0, True)
Fold: 2
Train size: 15576
Valid size: 1731

[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 500 rounds
[500]	train's QWK: 1	valid's QWK: 1
Early stopping, best iteration is:
[267]	train's QWK: 0.999724	valid's QWK: 1
Evaluated only: QWK
Train QWK: ('QWK', 1.0, True)
Fold: 3
Train size: 15576
Valid size