In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk import pos_tag, ne_chunk
from gensim import corpora
from gensim.models import LdaModel
import textstat
from language_tool_python import LanguageTool
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# NLTK RESOURCES

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
df = pd.read_csv('input.csv')
df.head(-1)

Unnamed: 0,text,cleaned_text


### Basic NLP Features

In [4]:
df['char_count'] = df['cleaned_text'].apply(len) # char count

df['word_count'] = df['cleaned_text'].apply(lambda x: len(word_tokenize(x))) # word count

df['word_density'] = df['word_count'] / df['char_count'] # word density

# punctuation count
def punctuation_count(text):
    return sum(1 for char in text if char in string.punctuation)

df['punctuation_count'] = df['text'].apply(punctuation_count)

# Upper case count
def upper_case_count(text):
    return sum(1 for char in text if char.isupper())

df['upper_case_count'] = df['text'].apply(upper_case_count)

def title_word_count(text):
    return sum(1 for word in text.split() if word.istitle())

df['title_word_count'] = df['text'].apply(title_word_count)

# parts of speech
def parts_of_speech(text):
    pos_tags = pos_tag(word_tokenize(text))
    
    noun_count = sum(1 for tag in pos_tags if tag[1] in ['NN', 'NNS', 'NNP', 'NNPS'])
    adv_count = sum(1 for tag in pos_tags if tag[1] in ['RB', 'RBR', 'RBS'])
    verb_count = sum(1 for tag in pos_tags if tag[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
    adj_count = sum(1 for tag in pos_tags if tag[1] in ['JJ', 'JJR', 'JJS'])
    pro_count = sum(1 for tag in pos_tags if tag[1] in ['PRP', 'PRP$', 'WP', 'WP$'])
    return pd.Series([noun_count, adv_count, verb_count, adj_count, pro_count], index=['noun_count','adv_count','verb_count','adj_count','pro_count'])

df[['noun_count','adv_count','verb_count','adj_count','pro_count']] = df['cleaned_text'].apply(lambda x: parts_of_speech(x))

In [5]:
df.head()

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,verb_count,adj_count,pro_count
0,"In December 2016 , Google Duo replaced Hangout...",in december google duo replaced hangouts withi...,2295,391,0.17037,111,101,79,133,14,75,29,3


### Topic Modeling

In [6]:
corpus = [text.split() for text in df['cleaned_text']]

dictionary = corpora.Dictionary(corpus)

corpus_bow = [dictionary.doc2bow(text) for text in corpus]

# Training
num_topics = 20
lda_model = LdaModel(corpus_bow, num_topics=num_topics, id2word=dictionary, passes=15)

topic_distribution = lda_model.get_document_topics(corpus_bow)

for topic in range(num_topics):
    df[f'topic_{topic + 1}_score'] = [next((t[1] for t in topic_dist if t[0] == topic), 0) for topic_dist in topic_distribution]

In [7]:
df.head(-1)

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_11_score,topic_12_score,topic_13_score,topic_14_score,topic_15_score,topic_16_score,topic_17_score,topic_18_score,topic_19_score,topic_20_score


### Readability Scores

In [8]:
df['flesch_kincaid_score'] = df['cleaned_text'].apply(lambda x: textstat.flesch_kincaid_grade(x))

df['flesch_score'] = df['cleaned_text'].apply(lambda x: textstat.flesch_reading_ease(x))

df['gunning_fog_score'] = df['cleaned_text'].apply(lambda x: textstat.gunning_fog(x))

df['coleman_liau_score'] = df['cleaned_text'].apply(lambda x: textstat.coleman_liau_index(x))

df['dale_chall_score'] = df['cleaned_text'].apply(lambda x: textstat.dale_chall_readability_score(x))

df['ari_score'] = df['cleaned_text'].apply(lambda x: textstat.automated_readability_index(x))

df['linsear_write_score'] = df['cleaned_text'].apply(lambda x: textstat.linsear_write_formula(x))

df['spache_score'] = df['cleaned_text'].apply(lambda x: textstat.spache_readability(x))

In [9]:
df.head(-1)

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_19_score,topic_20_score,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score


### Named Entity Recognition

In [10]:
def ner_count(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    ner_tags = ne_chunk(pos_tags)
    ner_count = sum(1 for chunk in ner_tags if hasattr(chunk, 'label'))
    return ner_count

df['ner_count'] = df['text'].apply(ner_count)

In [11]:
df.head()

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_20_score,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score,ner_count
0,"In December 2016 , Google Duo replaced Hangout...",in december google duo replaced hangouts withi...,2295,391,0.17037,111,101,79,133,14,...,0,153.4,-308.48,160.29,12.45,27.47,197.0,62.0,57.73,43


### Text error length

In [12]:
tool = LanguageTool('en-US')

def error_length(text):
    matches = tool.check(text)
    return len(matches)

df['error_length'] = df['text'].apply(error_length)

In [13]:
df.head()

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score,ner_count,error_length
0,"In December 2016 , Google Duo replaced Hangout...",in december google duo replaced hangouts withi...,2295,391,0.17037,111,101,79,133,14,...,153.4,-308.48,160.29,12.45,27.47,197.0,62.0,57.73,43,87


### Count Vectorization

In [14]:
# tf = df

# count_vect = df['cleaned_text'].tolist()

# count_vectorizer = CountVectorizer(max_features=10000, stop_words='english')

# count_matrix = count_vectorizer.fit_transform(count_vect)

# count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

# df = pd.concat([df, count_df], axis=1)

In [15]:
# tf.head(-1)

### N - Grams

In [16]:
# # Bigram Vectorizer

# bigram_vect = df['cleaned_text'].tolist()

# bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5000)

# bigram_matrix = bigram_vectorizer.fit_transform(bigram_vect)

# bigram_df = pd.DataFrame(bigram_matrix.toarray(), columns=bigram_vectorizer.get_feature_names_out())

# df = pd.concat([df, bigram_df], axis=1)

In [17]:
# # Trigram Vectorizer

# trigram_vect = df['cleaned_text'].tolist()

# trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=5000)

# trigram_matrix = trigram_vectorizer.fit_transform(trigram_vect)

# trigram_df = pd.DataFrame(trigram_matrix.toarray(), columns=trigram_vectorizer.get_feature_names_out())

# df = pd.concat([df, trigram_df], axis=1)

In [18]:
# Bi-Trigram Vectorizer

# bitri_vect = df['cleaned_text'].tolist()

# bitri_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)

# bichar_matrix = bitri_vectorizer.fit_transform(bitri_vect)

# bichar_df = pd.DataFrame(bichar_matrix.toarray(), columns=bitri_vectorizer.get_feature_names_out())

# df = pd.concat([df, bichar_df], axis=1)

In [19]:
df.head()

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score,ner_count,error_length
0,"In December 2016 , Google Duo replaced Hangout...",in december google duo replaced hangouts withi...,2295,391,0.17037,111,101,79,133,14,...,153.4,-308.48,160.29,12.45,27.47,197.0,62.0,57.73,43,87


### Save dataset

In [20]:
df.to_csv('input.csv', index=False)