# Imports

In [55]:
import pandas as pd
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
from language_tool_python import LanguageTool
from sklearn.feature_extraction.text import CountVectorizer
import textstat
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim import models
from gensim import similarities
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [14]:
# NLTK RESOURCES

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\habee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

# Accessing dataset

In [9]:
ex = pd.read_csv('DATA_SET/final_test.csv')
ex.head(-1)

ex.to_csv('DATA_SET/test.csv', index=False)

In [3]:
df = pd.read_csv('DATA_SET/pre_test.csv')
df.head(-1)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count,noun_count,adv_count,verb_count,adj_count,pro_count
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15,40,15,43,22,27
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89,103,18,82,17,46
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9,60,7,23,19,4
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63,46,16,37,15,22
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.208780,27,16,48,15,33,15,12
...,...,...,...,...,...,...,...,...,...,...,...,...
1994,Limiting car usage has numerous advantages tha...,1,3304,569,0.172215,67,44,186,18,86,45,10
1995,Should distant learning to be offered for stud...,0,2708,511,0.188700,52,19,126,34,87,29,34
1996,Clearly with all tie evidence that I AVK prove...,0,1660,339,0.204217,43,78,98,28,48,36,11
1997,"Throughout history, different types of explora...",0,2383,446,0.187159,51,40,110,9,67,33,16


# Data preprocessing

In [11]:
# Handling missing values

print(df.isna().sum())

df = df.dropna()

text     0
label    0
dtype: int64


In [15]:
# Data Cleaning

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization and removing stop words
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    
    # Stemming (optional)
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words]
    
    return ' '.join(words)

df['cleaned_text'] = df['text'].apply(clean_text)

# Feature engineering

In [16]:
# Basic NLP Features

df['char_count'] = df['text'].apply(len) # char count

df['word_count'] = df['text'].apply(lambda x: len(word_tokenize(x))) # word count

df['word_density'] = df['word_count'] / df['char_count'] # word density

# punctuation count
def punctuation_count(text):
    return sum(1 for char in text if char in string.punctuation)

df['punctuation_count'] = df['text'].apply(punctuation_count)

# Upper case count
def upper_case_count(text):
    return sum(1 for char in text if char.isupper())

df['upper_case_count'] = df['text'].apply(upper_case_count)

# parts of speech
def parts_of_speech(text):
    pos_tags = pos_tag(word_tokenize(text))
    
    noun_count = sum(1 for tag in pos_tags if tag[1] in ['NN', 'NNS', 'NNP', 'NNPS'])
    adv_count = sum(1 for tag in pos_tags if tag[1] in ['RB', 'RBR', 'RBS'])
    verb_count = sum(1 for tag in pos_tags if tag[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
    adj_count = sum(1 for tag in pos_tags if tag[1] in ['JJ', 'JJR', 'JJS'])
    pro_count = sum(1 for tag in pos_tags if tag[1] in ['PRP', 'PRP$', 'WP', 'WP$'])
    return pd.Series([noun_count, adv_count, verb_count, adj_count, pro_count], index=['noun_count','adv_count','verb_count','adj_count','pro_count'])

df[['noun_count','adv_count','verb_count','adj_count','pro_count']] = df['text'].apply(lambda x: parts_of_speech(x))

In [16]:
df.head(-1)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count,noun_count,adv_count,verb_count,adj_count,pro_count,error_length,ner_count
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15,40,15,43,22,27,1,1
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89,103,18,82,17,46,9,16
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9,60,7,23,19,4,1,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63,46,16,37,15,22,4,9
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.208780,27,16,48,15,33,15,12,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,Limiting car usage has numerous advantages tha...,1,3304,569,0.172215,67,44,186,18,86,45,10,0,17
1995,Should distant learning to be offered for stud...,0,2708,511,0.188700,52,19,126,34,87,29,34,0,0
1996,Clearly with all tie evidence that I AVK prove...,0,1660,339,0.204217,43,78,98,28,48,36,11,7,24
1997,"Throughout history, different types of explora...",0,2383,446,0.187159,51,40,110,9,67,33,16,4,12


In [4]:
# Text error length - grammer mistakes

tool = LanguageTool('en-US') 

def error_length(text):
    matches = tool.check(text)
    return len(matches)

df['error_length'] = df['text'].apply(error_length)

In [15]:
# Named Entity Recognition (NER)

def ner_count(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    ner_tags = ne_chunk(pos_tags)
    ner_count = sum(1 for chunk in ner_tags if hasattr(chunk, 'label'))
    return ner_count

df['ner_count'] = df['text'].apply(ner_count)

In [22]:
# Readability score

df['flesch_kincaid_score'] = df['text'].apply(lambda x: textstat.flesch_kincaid_grade(x))
df['flesch_score'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['gunning_fog_score'] = df['text'].apply(lambda x: textstat.gunning_fog(x))
df['coleman_liau_score'] = df['text'].apply(lambda x: textstat.coleman_liau_index(x))
df['dale_chall_score'] = df['text'].apply(lambda x: textstat.dale_chall_readability_score(x))
df['ari_score'] = df['text'].apply(lambda x: textstat.automated_readability_index(x))
df['linsear_write_score'] = df['text'].apply(lambda x: textstat.linsear_write_formula(x))
df['spache_score'] = df['text'].apply(lambda x: textstat.spache_readability(x))

In [32]:
# Topic modeling - LDA

corpus = [text.split() for text in df['text']]

dictionary = corpora.Dictionary(corpus)

corpus_bow = [dictionary.doc2bow(text) for text in corpus]

# Train LDA model
num_topics = 20
lda_model = LdaModel(corpus_bow, num_topics=num_topics, id2word=dictionary, passes=15)

# Extract topic
topic_distribution = lda_model.get_document_topics(corpus_bow)

for topic in range(num_topics):
    df[f'topic_{topic + 1}_score'] = [t[1] if t[0] == topic else 0 for t in topic_distribution]

In [34]:
df.head(-1)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count,noun_count,adv_count,verb_count,...,topic_11_score,topic_12_score,topic_13_score,topic_14_score,topic_15_score,topic_16_score,topic_17_score,topic_18_score,topic_19_score,topic_20_score
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15,40,15,43,...,0,0,0,0,0,0,0,0,0,0
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89,103,18,82,...,0,0,0,0,0,0,0,0,0,0
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9,60,7,23,...,0,0,0,0,0,0,0,0,0,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63,46,16,37,...,0,0,0,0,0,0,0,0,0,0
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.208780,27,16,48,15,33,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,Limiting car usage has numerous advantages tha...,1,3304,569,0.172215,67,44,186,18,86,...,0,0,0,0,0,0,0,0,0,0
1995,Should distant learning to be offered for stud...,0,2708,511,0.188700,52,19,126,34,87,...,0,0,0,0,0,0,0,0,0,0
1996,Clearly with all tie evidence that I AVK prove...,0,1660,339,0.204217,43,78,98,28,48,...,0,0,0,0,0,0,0,0,0,0
1997,"Throughout history, different types of explora...",0,2383,446,0.187159,51,40,110,9,67,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Count Vectorizer

count_vect = df['text'].tolist()

count_vectorizer = CountVectorizer()

count_matrix = count_vectorizer.fit_transform(count_vect)

count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

df = pd.concat([df, count_df], axis=1)


In [36]:
df.head(-1)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count,noun_count,adv_count,verb_count,...,zse,zt,zuckerberg,zukor,zur,zurthermore,zveryone,zygotic,zzut,âºâ
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15,40,15,43,...,0,0,0,0,0,0,0,0,0,0
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89,103,18,82,...,0,0,0,0,0,0,0,0,0,0
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9,60,7,23,...,0,0,0,0,0,0,0,0,0,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63,46,16,37,...,0,0,0,0,0,0,0,0,0,0
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.208780,27,16,48,15,33,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,Limiting car usage has numerous advantages tha...,1,3304,569,0.172215,67,44,186,18,86,...,0,0,0,0,0,0,0,0,0,0
1995,Should distant learning to be offered for stud...,0,2708,511,0.188700,52,19,126,34,87,...,0,0,0,0,0,0,0,0,0,0
1996,Clearly with all tie evidence that I AVK prove...,0,1660,339,0.204217,43,78,98,28,48,...,0,0,0,0,0,0,0,0,0,0
1997,"Throughout history, different types of explora...",0,2383,446,0.187159,51,40,110,9,67,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Bigram Vectorizer

bigram_vect = count_vect

bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5000)

bigram_matrix = bigram_vectorizer.fit_transform(bigram_vect)

bigram_df = pd.DataFrame(bigram_matrix.toarray(), columns=bigram_vectorizer.get_feature_names_out())

df = pd.concat([df, bigram_df], axis=1)

In [42]:
# Trigram Vectorizer

trigram_vect = count_vect

trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=5000)

trigram_matrix = trigram_vectorizer.fit_transform(trigram_vect)

trigram_df = pd.DataFrame(trigram_matrix.toarray(), columns=trigram_vectorizer.get_feature_names_out())

df = pd.concat([df, trigram_df], axis=1)

In [43]:
# Bi-Trigram Vectorizer

bitri_vect = count_vect

bitri_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)

bichar_matrix = bitri_vectorizer.fit_transform(bitri_vect)

bichar_df = pd.DataFrame(bichar_matrix.toarray(), columns=bitri_vectorizer.get_feature_names_out())

df = pd.concat([df, bichar_df], axis=1)


In [45]:
df.head(-1)

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,upper_case_count,noun_count,adv_count,verb_count,...,t,th,to,w,á,"á,","á,.1",’s,’s.1,”
0,I think that animals should not be used in sci...,1,1241,241,0.194198,26,15,40,15,43,...,0,0,0,0,0,0,0,0,0,0
1,Focus On The ROAS\n\nDo you think you should b...,0,2180,435,0.199541,44,89,103,18,82,...,0,0,0,0,0,0,0,0,0,0
2,Taking online or watching video conferencing c...,1,1162,192,0.165232,17,9,60,7,23,...,0,0,0,0,0,0,0,0,0,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,1114,242,0.217235,39,63,46,16,37,...,0,0,0,0,0,0,0,0,0,0
4,Driverless cars would be a Wood idea. It would...,0,1025,214,0.208780,27,16,48,15,33,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,Limiting car usage has numerous advantages tha...,1,3304,569,0.172215,67,44,186,18,86,...,0,0,0,0,0,1,1,0,0,0
1995,Should distant learning to be offered for stud...,0,2708,511,0.188700,52,19,126,34,87,...,0,0,0,0,0,0,0,0,0,0
1996,Clearly with all tie evidence that I AVK prove...,0,1660,339,0.204217,43,78,98,28,48,...,0,0,0,0,0,0,0,0,0,0
1997,"Throughout history, different types of explora...",0,2383,446,0.187159,51,40,110,9,67,...,0,0,0,0,0,0,0,0,0,0


# Saving the dataset

In [50]:
df.to_csv('DATA_SET/pre_test.csv', mode='a', header=False, index=False)

# Training the model

In [48]:
# Support Vector Machine (SVM)

y = df['label']

# X = df.drop(['text', 'label', 'cleaned_text'], axis=1)
X = df.drop(['text', 'label'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       251
           1       1.00      0.13      0.23       149

    accuracy                           0.68       400
   macro avg       0.83      0.56      0.51       400
weighted avg       0.79      0.68      0.58       400



In [56]:
# Random Forest Classifier

y = df['label']

# Features (excluding columns not used for training)
X = df.drop(['label', 'text'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       251
           1       0.99      0.89      0.94       149

    accuracy                           0.96       400
   macro avg       0.97      0.94      0.95       400
weighted avg       0.96      0.96      0.96       400



# Saving the model

In [52]:
joblib.dump(model, 'svm_model.joblib')

['svm_model.joblib']