In [3]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string
from nltk import pos_tag, ne_chunk
from gensim import corpora
from gensim.models import LdaModel
import textstat
from language_tool_python import LanguageTool
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Load the dataset that every feature-extraction cell below extends in place.
# NOTE(review): the final cell of this notebook writes `df` back to this same
# path, so a later run reads the already-augmented file rather than the raw one.
df = pd.read_csv('DATA_SET/final_test_v2.csv')

### Basic NLP Features

In [78]:
cleaned_docs = df['cleaned_text']

# Characters per cleaned document.
df['char_count'] = cleaned_docs.map(len)

# NLTK tokens per cleaned document.
df['word_count'] = cleaned_docs.map(lambda doc: len(word_tokenize(doc)))

# Tokens per character (word_count divided by char_count).
df['word_density'] = df['word_count'].div(df['char_count'])

# punctuation count
def punctuation_count(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    total = 0
    for symbol in text:
        if symbol in string.punctuation:
            total += 1
    return total

# Counted on the raw `text` column, not on `cleaned_text`.
df['punctuation_count'] = df['text'].map(punctuation_count)

# Upper case count
def upper_case_count(text):
    """Return the number of upper-case characters in ``text``."""
    return len([letter for letter in text if letter.isupper()])

# Counted on the raw `text` column, not on `cleaned_text`.
df['upper_case_count'] = df['text'].map(upper_case_count)

def title_word_count(text):
    """Return how many whitespace-separated words of ``text`` are title-cased.

    A word counts when ``str.istitle()`` is true for it.
    """
    title_cased = [token for token in text.split() if token.istitle()]
    return len(title_cased)

# Counted on the raw `text` column, not on `cleaned_text`.
df['title_word_count'] = df['text'].map(title_word_count)

# parts of speech
def parts_of_speech(text):
    """Count nouns, adverbs, verbs, adjectives and pronouns in ``text``.

    The text is tokenised with ``word_tokenize`` and tagged with ``pos_tag``;
    each tagged token is counted in the group whose tag list contains its tag.

    Returns:
        pd.Series: the five counts, indexed by ``noun_count``, ``adv_count``,
        ``verb_count``, ``adj_count`` and ``pro_count`` (in that order).
    """
    tag_groups = {
        'noun_count': ('NN', 'NNS', 'NNP', 'NNPS'),
        'adv_count': ('RB', 'RBR', 'RBS'),
        'verb_count': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adj_count': ('JJ', 'JJR', 'JJS'),
        'pro_count': ('PRP', 'PRP$', 'WP', 'WP$'),
    }
    tagged_tokens = pos_tag(word_tokenize(text))

    counts = []
    for accepted_tags in tag_groups.values():
        counts.append(len([pair for pair in tagged_tokens if pair[1] in accepted_tags]))
    return pd.Series(counts, index=list(tag_groups))

# One output column per part-of-speech group, computed from the cleaned text.
pos_columns = ['noun_count', 'adv_count', 'verb_count', 'adj_count', 'pro_count']
df[pos_columns] = df['cleaned_text'].apply(parts_of_speech)

### Topic Modeling

In [80]:
# Whitespace-tokenise each cleaned document for gensim.
corpus = [text.split() for text in df['cleaned_text']]

dictionary = corpora.Dictionary(corpus)

corpus_bow = [dictionary.doc2bow(text) for text in corpus]

# Training
num_topics = 20
# Fixed seed so the fitted topics (and the scores derived from them) are
# reproducible across notebook runs.
lda_model = LdaModel(corpus_bow, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# get_document_topics() on a whole corpus returns a lazy view that re-runs
# inference every time it is iterated. Materialise it once so that all topic
# columns come from the same inference pass instead of one pass per topic.
topic_distribution = list(lda_model.get_document_topics(corpus_bow))

# Per-document {topic_id: probability}; topics gensim filtered out as too
# unlikely are simply absent and are scored 0 below.
topic_lookup = [dict(doc_topics) for doc_topics in topic_distribution]

for topic in range(num_topics):
    df[f'topic_{topic + 1}_score'] = [doc_scores.get(topic, 0) for doc_scores in topic_lookup]

In [81]:
# Preview the frame with the topic columns added. A negative `n` makes head()
# return every row except the last one; the notebook display truncates it.
df.head(-1)

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_11_score,topic_12_score,topic_13_score,topic_14_score,topic_15_score,topic_16_score,topic_17_score,topic_18_score,topic_19_score,topic_20_score


### Readability Scores

In [82]:
# Output column -> textstat metric, in the order the columns are added.
# NOTE(review): the metrics are computed on `cleaned_text`; the previews in this
# notebook show very large grade levels (e.g. flesch_kincaid_score above 90),
# presumably because the cleaned text has no sentence punctuation — confirm
# whether the raw `text` column was intended.
readability_metrics = {
    'flesch_kincaid_score': textstat.flesch_kincaid_grade,
    'flesch_score': textstat.flesch_reading_ease,
    'gunning_fog_score': textstat.gunning_fog,
    'coleman_liau_score': textstat.coleman_liau_index,
    'dale_chall_score': textstat.dale_chall_readability_score,
    'ari_score': textstat.automated_readability_index,
    'linsear_write_score': textstat.linsear_write_formula,
    'spache_score': textstat.spache_readability,
}

for score_column, metric in readability_metrics.items():
    df[score_column] = df['cleaned_text'].apply(metric)

In [83]:
# Preview the frame with the readability columns added. A negative `n` makes
# head() return every row except the last one; the display truncates it.
df.head(-1)

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_19_score,topic_20_score,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score


### Named Entity Recognition

In [84]:
def ner_count(text):
    """Count the named-entity chunks NLTK finds in ``text``.

    The text is tokenised, POS-tagged and passed to ``ne_chunk``; every
    top-level element of the result that has a ``label`` attribute is counted.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    entities = [node for node in chunked if hasattr(node, 'label')]
    return len(entities)

# Computed on the raw `text` column, not on `cleaned_text`.
df['ner_count'] = df['text'].map(ner_count)

### Text error count (LanguageTool matches)

In [85]:
# One LanguageTool checker for US English, reused by every error_length() call.
tool = LanguageTool('en-US')

def error_length(text):
    """Return the number of matches LanguageTool reports for ``text``."""
    return len(tool.check(text))

# Checked on the raw `text` column, not on `cleaned_text`.
df['error_length'] = df['text'].map(error_length)

In [6]:
# Preview the frame with all scalar features. A negative `n` makes head()
# return every row except the last one; the display truncates it.
df.head(-1)

Unnamed: 0,text,label,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,...,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score,ner_count,error_length
0,The Face on Mars is nothing but a natural occu...,0,the face on mars is nothing but a natural occu...,1303,241,0.184958,64,42,31,65,...,94.9,-156.23,99.22,9.78,19.46,119.8,63.0,36.28,12,31
1,Students have a higher chance of catching a vi...,0,students have a higher chance of catching a vi...,3456,605,0.175058,62,40,39,149,...,236.9,-525.69,243.26,11.52,35.60,303.3,58.0,86.91,4,60
2,Driverless cars have good and bad things that ...,0,driverless cars have good and bad things that ...,1629,305,0.187231,26,22,22,55,...,118.7,-212.73,123.44,9.37,20.78,151.5,53.0,44.69,5,21
3,Some people might think that traveling in a gr...,1,some people might think that traveling in a gr...,328,67,0.204268,6,5,5,11,...,24.7,37.31,28.59,6.58,8.61,30.5,38.5,10.93,0,6
4,How many of us students want to be forced to d...,0,how many of us students want to be forced to d...,3431,612,0.178374,62,27,25,133,...,240.8,-541.25,247.35,10.94,36.26,306.3,64.0,88.21,4,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86581,"Dear ( Senator of Florida ) , In my opinion I ...",0,dear senator of florida in my opinion i believ...,2550,477,0.187059,31,30,30,94,...,185.8,-387.31,192.56,9.43,29.58,237.6,57.0,69.07,11,34
86582,Dear Principal : I think we should have cell p...,0,dear principal i think we should have cell pho...,1718,334,0.194412,35,17,15,77,...,128.8,-233.70,135.04,8.27,22.00,165.1,59.0,48.50,1,34
86583,Dear Teacher_NAME I think that if you try to s...,0,dear teachername i think that if you try to st...,1516,267,0.176121,27,36,20,60,...,103.9,-174.16,108.45,11.34,18.89,134.1,51.0,39.26,1,27
86584,Venus is sometimes called the `` meaning Star ...,0,venus is sometimes called the meaning star its...,878,166,0.189066,21,17,17,41,...,64.5,-71.64,68.33,8.84,15.39,81.8,56.0,25.75,8,17


### Count Vectorizer

In [None]:
count_vect = df['cleaned_text'].tolist()

# Vocabulary is capped at 5000 terms.
count_vectorizer = CountVectorizer(max_features=5000)

count_matrix = count_vectorizer.fit_transform(count_vect)

# Build the feature frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and add NaN rows) as
# soon as df's index is anything other than the default range.
count_df = pd.DataFrame(
    count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=df.index,
)

# NOTE(review): vocabulary terms become column names verbatim, so a term such
# as "text" or "label" would duplicate an existing df column — confirm this is
# acceptable for downstream consumers.
df = pd.concat([df, count_df], axis=1)

In [None]:
# Persist the fitted CountVectorizer for reuse outside this notebook.
# NOTE(review): the file name says "50k" while the vectorizer is created with
# max_features=5000 — confirm the name is intentional.
joblib.dump(count_vectorizer, 'tools/count_vectorizer_50k.pkl')

### N-Grams

In [None]:
# Bigram Vectorizer

bigram_vect = df['cleaned_text'].tolist()

# TF-IDF over word bigrams only, vocabulary capped at 5000 terms.
bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=5000)

bigram_matrix = bigram_vectorizer.fit_transform(bigram_vect)

# Build the feature frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and add NaN rows) as
# soon as df's index is anything other than the default range.
bigram_df = pd.DataFrame(
    bigram_matrix.toarray(),
    columns=bigram_vectorizer.get_feature_names_out(),
    index=df.index,
)

df = pd.concat([df, bigram_df], axis=1)

In [None]:
# Persist the fitted bigram TF-IDF vectorizer for reuse outside this notebook.
# NOTE(review): the file name says "50k" while the vectorizer is created with
# max_features=5000 — confirm the name is intentional.
joblib.dump(bigram_vectorizer, 'tools/bigram_vectorizer_50k.pkl')

In [None]:
# Trigram Vectorizer

trigram_vect = df['cleaned_text'].tolist()

# TF-IDF over word trigrams only, vocabulary capped at 5000 terms.
trigram_vectorizer = TfidfVectorizer(ngram_range=(3, 3), max_features=5000)

trigram_matrix = trigram_vectorizer.fit_transform(trigram_vect)

# Build the feature frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and add NaN rows) as
# soon as df's index is anything other than the default range.
trigram_df = pd.DataFrame(
    trigram_matrix.toarray(),
    columns=trigram_vectorizer.get_feature_names_out(),
    index=df.index,
)

df = pd.concat([df, trigram_df], axis=1)

In [None]:
# Persist the fitted trigram TF-IDF vectorizer for reuse outside this notebook.
# NOTE(review): the file name says "50k" while the vectorizer is created with
# max_features=5000 — confirm the name is intentional.
joblib.dump(trigram_vectorizer, 'tools/trigram_vectorizer_50k.pkl')

In [None]:
# Character-level bi-/tri-gram TF-IDF vectorizer

bitri_vect = df['cleaned_text'].str.strip().tolist()

# TF-IDF over character 2- and 3-grams, vocabulary capped at 5000 terms.
bitri_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)

bichar_matrix = bitri_vectorizer.fit_transform(bitri_vect)

# Build the feature frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and add NaN rows) as
# soon as df's index is anything other than the default range.
bichar_df = pd.DataFrame(
    bichar_matrix.toarray(),
    columns=bitri_vectorizer.get_feature_names_out(),
    index=df.index,
)

# Stripping whitespace from the n-gram names makes some of them identical
# (e.g. " a" and "a "); only the first column of each resulting name is kept.
bichar_df.columns = bichar_df.columns.str.strip()

bichar_df = bichar_df.loc[:, ~bichar_df.columns.duplicated()]

# NOTE(review): the de-duplication above is local to bichar_df; a stripped
# n-gram can still share its name with a column already present in df.
df = pd.concat([df, bichar_df], axis=1)

In [None]:
# Persist the fitted character n-gram TF-IDF vectorizer for reuse outside this notebook.
joblib.dump(bitri_vectorizer, 'tools/bitri_vectorizer_v2.pkl')

In [None]:
# Lexical Diversity

def _type_token_ratio(text):
    """Return unique tokens / total tokens for ``text`` (0.0 when it has no tokens)."""
    # Tokenise once and reuse the result for both the numerator and denominator.
    tokens = word_tokenize(text.lower())
    if not tokens:
        # An empty document would otherwise raise ZeroDivisionError.
        return 0.0
    return len(set(tokens)) / len(tokens)

ttr_list = [_type_token_ratio(text) for text in df['cleaned_text']]

# Carry df's index so the column assignment below lines up row-for-row even
# when df's index is not the default 0..n-1 range.
ttr_df = pd.DataFrame({'lexical_diversity': ttr_list}, index=df.index)

df['lexical_diversity'] = ttr_df['lexical_diversity']

### Save dataset

In [98]:
# Write the augmented frame to disk without the index column.
# NOTE(review): this is the same path the notebook loads `df` from at the top,
# so the input file is overwritten — confirm that is intended.
df.to_csv('DATA_SET/final_test_v2.csv', index=False)