# Imports

In [1]:
import pandas as pd
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
from language_tool_python import LanguageTool
from sklearn.feature_extraction.text import CountVectorizer
import textstat
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim import models
from gensim import similarities
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [12]:
# NLTK RESOURCES

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

# Accessing dataset

In [13]:
# original_dataset = pd.read_csv('DATA_SET/final_test.csv')

# subset_dataset = original_dataset.sample(n=10000, random_state=42)

# subset_dataset.to_csv('subset_dataset.csv', index=False)

In [14]:
df = pd.read_csv('subset_dataset.csv')
df.head(-1)

Unnamed: 0,text,label
0,I think that animals should not be used in sci...,1
1,Focus On The ROAS\n\nDo you think you should b...,0
2,Taking online or watching video conferencing c...,1
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0
4,Driverless cars would be a Wood idea. It would...,0
...,...,...
9994,"Throughout Americas history, people have alway...",0
9995,Some schools other distance learning as an opt...,0
9996,"Hey there! So, you know how people always say...",1
9997,"I agree that in twenty years, there will be fe...",1


# Data preprocessing

In [15]:
# Handling missing values

print(df.isna().sum())

df = df.dropna()

text     0
label    0
dtype: int64


In [16]:
# Data Cleaning

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization and removing stop words
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    
    # Stemming (optional)
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words]
    
    return ' '.join(words)

df['cleaned_text'] = df['text'].apply(clean_text)

# Feature engineering

In [17]:
# Basic NLP Features

df['char_count'] = df['cleaned_text'].apply(len) # char count

df['word_count'] = df['cleaned_text'].apply(lambda x: len(word_tokenize(x))) # word count

df['word_density'] = df['word_count'] / df['char_count'] # word density

# punctuation count
def punctuation_count(text):
    return sum(1 for char in text if char in string.punctuation)

df['punctuation_count'] = df['text'].apply(punctuation_count)

# Upper case count
def upper_case_count(text):
    return sum(1 for char in text if char.isupper())

df['upper_case_count'] = df['text'].apply(upper_case_count)

def title_word_count(text):
    return sum(1 for word in text.split() if word.istitle())

df['title_word_count'] = df['text'].apply(title_word_count)

# parts of speech
def parts_of_speech(text):
    pos_tags = pos_tag(word_tokenize(text))
    
    noun_count = sum(1 for tag in pos_tags if tag[1] in ['NN', 'NNS', 'NNP', 'NNPS'])
    adv_count = sum(1 for tag in pos_tags if tag[1] in ['RB', 'RBR', 'RBS'])
    verb_count = sum(1 for tag in pos_tags if tag[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
    adj_count = sum(1 for tag in pos_tags if tag[1] in ['JJ', 'JJR', 'JJS'])
    pro_count = sum(1 for tag in pos_tags if tag[1] in ['PRP', 'PRP$', 'WP', 'WP$'])
    return pd.Series([noun_count, adv_count, verb_count, adj_count, pro_count], index=['noun_count','adv_count','verb_count','adj_count','pro_count'])

df[['noun_count','adv_count','verb_count','adj_count','pro_count']] = df['cleaned_text'].apply(lambda x: parts_of_speech(x))

In [18]:
df.head(-1)

Unnamed: 0,text,label,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,verb_count,adj_count,pro_count
0,I think that animals should not be used in sci...,1,think anim use scientif research wrong use ben...,622,102,0.163987,26,15,15,59,10,11,16,1
1,Focus On The ROAS\n\nDo you think you should b...,0,focu roa think servic drive vehicl danger an p...,1120,196,0.175000,44,89,55,127,7,30,17,0
2,Taking online or watching video conferencing c...,1,take onlin watch video conferenc class number ...,742,104,0.140162,17,9,9,70,3,9,16,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,communityservic ye feel kid commun servic feel...,573,98,0.171030,39,63,22,38,8,26,21,1
4,Driverless cars would be a Wood idea. It would...,0,driverless car would wood idea would use less ...,619,104,0.168013,27,16,16,49,7,17,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,"Throughout Americas history, people have alway...",0,throughout america histori peopl alway own car...,1536,258,0.167969,62,67,42,143,9,37,38,0
9995,Some schools other distance learning as an opt...,0,school distanc learn option student attend cla...,674,105,0.155786,20,21,21,55,5,13,25,0
9996,"Hey there! So, you know how people always say...",1,hey know peopl alway say posit attitud super i...,847,146,0.172373,85,26,22,64,4,23,22,8
9997,"I agree that in twenty years, there will be fe...",1,agre twenti year fewer cab use abe today abe m...,964,161,0.167012,32,23,22,102,5,12,30,0


In [19]:
tool = LanguageTool('en-US')

In [20]:
# Text error length - grammer mistakes

def error_length(text):
    matches = tool.check(text)
    return len(matches)

df['error_length'] = df['cleaned_text'].apply(error_length)

KeyboardInterrupt: 

In [22]:
df.head()

Unnamed: 0,text,label,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,verb_count,adj_count,pro_count
0,I think that animals should not be used in sci...,1,think anim use scientif research wrong use ben...,622,102,0.163987,26,15,15,59,10,11,16,1
1,Focus On The ROAS\n\nDo you think you should b...,0,focu roa think servic drive vehicl danger an p...,1120,196,0.175,44,89,55,127,7,30,17,0
2,Taking online or watching video conferencing c...,1,take onlin watch video conferenc class number ...,742,104,0.140162,17,9,9,70,3,9,16,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,communityservic ye feel kid commun servic feel...,573,98,0.17103,39,63,22,38,8,26,21,1
4,Driverless cars would be a Wood idea. It would...,0,driverless car would wood idea would use less ...,619,104,0.168013,27,16,16,49,7,17,20,0


In [23]:
# Named Entity Recognition (NER)

def ner_count(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    ner_tags = ne_chunk(pos_tags)
    ner_count = sum(1 for chunk in ner_tags if hasattr(chunk, 'label'))
    return ner_count

df['ner_count'] = df['cleaned_text'].apply(ner_count)

In [24]:
# Readability score

df['flesch_kincaid_score'] = df['cleaned_text'].apply(lambda x: textstat.flesch_kincaid_grade(x))
df['flesch_score'] = df['cleaned_text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['gunning_fog_score'] = df['cleaned_text'].apply(lambda x: textstat.gunning_fog(x))
df['coleman_liau_score'] = df['cleaned_text'].apply(lambda x: textstat.coleman_liau_index(x))
df['dale_chall_score'] = df['cleaned_text'].apply(lambda x: textstat.dale_chall_readability_score(x))
df['ari_score'] = df['cleaned_text'].apply(lambda x: textstat.automated_readability_index(x))
df['linsear_write_score'] = df['cleaned_text'].apply(lambda x: textstat.linsear_write_formula(x))
df['spache_score'] = df['cleaned_text'].apply(lambda x: textstat.spache_readability(x))

In [25]:
# Topic modeling - LDA

corpus = [text.split() for text in df['cleaned_text']]

dictionary = corpora.Dictionary(corpus)

corpus_bow = [dictionary.doc2bow(text) for text in corpus]

# Train LDA model
num_topics = 20
lda_model = LdaModel(corpus_bow, num_topics=num_topics, id2word=dictionary, passes=15)

# Extract topic
topic_distribution = lda_model.get_document_topics(corpus_bow)

for topic in range(num_topics):
    df[f'topic_{topic + 1}_score'] = [t[1] if t[0] == topic else 0 for t in topic_distribution]

In [26]:
df.head(-1)

Unnamed: 0,text,label,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,...,topic_11_score,topic_12_score,topic_13_score,topic_14_score,topic_15_score,topic_16_score,topic_17_score,topic_18_score,topic_19_score,topic_20_score
0,I think that animals should not be used in sci...,1,think anim use scientif research wrong use ben...,622,102,0.163987,26,15,15,59,...,0,0,0,0,0,0,0,0,0,0
1,Focus On The ROAS\n\nDo you think you should b...,0,focu roa think servic drive vehicl danger an p...,1120,196,0.175000,44,89,55,127,...,0,0,0,0,0,0,0,0,0,0
2,Taking online or watching video conferencing c...,1,take onlin watch video conferenc class number ...,742,104,0.140162,17,9,9,70,...,0,0,0,0,0,0,0,0,0,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,communityservic ye feel kid commun servic feel...,573,98,0.171030,39,63,22,38,...,0,0,0,0,0,0,0,0,0,0
4,Driverless cars would be a Wood idea. It would...,0,driverless car would wood idea would use less ...,619,104,0.168013,27,16,16,49,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,"Throughout Americas history, people have alway...",0,throughout america histori peopl alway own car...,1536,258,0.167969,62,67,42,143,...,0,0,0,0,0,0,0,0,0,0
9995,Some schools other distance learning as an opt...,0,school distanc learn option student attend cla...,674,105,0.155786,20,21,21,55,...,0,0,0,0,0,0,0,0,0,0
9996,"Hey there! So, you know how people always say...",1,hey know peopl alway say posit attitud super i...,847,146,0.172373,85,26,22,64,...,0,0,0,0,0,0,0,0,0,0
9997,"I agree that in twenty years, there will be fe...",1,agre twenti year fewer cab use abe today abe m...,964,161,0.167012,32,23,22,102,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Count Vectorizer

count_vect = df['cleaned_text'].tolist()

count_vectorizer = CountVectorizer()

count_matrix = count_vectorizer.fit_transform(count_vect)

count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

df = pd.concat([df, count_df], axis=1)


In [29]:
df.head(-1)

Unnamed: 0,text,label,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,...,zxp,zxpirzmznt,zxpzzizncz,zxpzziznczd,zy,zygot,zzsouzcz,zzszazch,zzut,zzz
0,I think that animals should not be used in sci...,1,think anim use scientif research wrong use ben...,622,102,0.163987,26,15,15,59,...,0,0,0,0,0,0,0,0,0,0
1,Focus On The ROAS\n\nDo you think you should b...,0,focu roa think servic drive vehicl danger an p...,1120,196,0.175000,44,89,55,127,...,0,0,0,0,0,0,0,0,0,0
2,Taking online or watching video conferencing c...,1,take onlin watch video conferenc class number ...,742,104,0.140162,17,9,9,70,...,0,0,0,0,0,0,0,0,0,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,communityservic ye feel kid commun servic feel...,573,98,0.171030,39,63,22,38,...,0,0,0,0,0,0,0,0,0,0
4,Driverless cars would be a Wood idea. It would...,0,driverless car would wood idea would use less ...,619,104,0.168013,27,16,16,49,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,"Throughout Americas history, people have alway...",0,throughout america histori peopl alway own car...,1536,258,0.167969,62,67,42,143,...,0,0,0,0,0,0,0,0,0,0
9995,Some schools other distance learning as an opt...,0,school distanc learn option student attend cla...,674,105,0.155786,20,21,21,55,...,0,0,0,0,0,0,0,0,0,0
9996,"Hey there! So, you know how people always say...",1,hey know peopl alway say posit attitud super i...,847,146,0.172373,85,26,22,64,...,0,0,0,0,0,0,0,0,0,0
9997,"I agree that in twenty years, there will be fe...",1,agre twenti year fewer cab use abe today abe m...,964,161,0.167012,32,23,22,102,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Bigram Vectorizer

bigram_vect = count_vect

bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5000)

bigram_matrix = bigram_vectorizer.fit_transform(bigram_vect)

bigram_df = pd.DataFrame(bigram_matrix.toarray(), columns=bigram_vectorizer.get_feature_names_out())

df = pd.concat([df, bigram_df], axis=1)

In [31]:
# Trigram Vectorizer

trigram_vect = count_vect

trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=5000)

trigram_matrix = trigram_vectorizer.fit_transform(trigram_vect)

trigram_df = pd.DataFrame(trigram_matrix.toarray(), columns=trigram_vectorizer.get_feature_names_out())

df = pd.concat([df, trigram_df], axis=1)

In [32]:
# Bi-Trigram Vectorizer

bitri_vect = count_vect

bitri_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)

bichar_matrix = bitri_vectorizer.fit_transform(bitri_vect)

bichar_df = pd.DataFrame(bichar_matrix.toarray(), columns=bitri_vectorizer.get_feature_names_out())

df = pd.concat([df, bichar_df], axis=1)


In [33]:
df.head(-1)

Unnamed: 0,text,label,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,...,zr,zs,zs.1,zt,zt.1,zy,zyg,zz,zz.1,zzl
0,I think that animals should not be used in sci...,1,think anim use scientif research wrong use ben...,622,102,0.163987,26,15,15,59,...,0,0,0,0,0,0,0,0,0,0
1,Focus On The ROAS\n\nDo you think you should b...,0,focu roa think servic drive vehicl danger an p...,1120,196,0.175000,44,89,55,127,...,0,0,0,0,0,0,0,0,0,0
2,Taking online or watching video conferencing c...,1,take onlin watch video conferenc class number ...,742,104,0.140162,17,9,9,70,...,0,0,0,0,0,0,0,0,0,0
3,COMMUNITY_SERVICE\n\nYES.! I feel that all kid...,0,communityservic ye feel kid commun servic feel...,573,98,0.171030,39,63,22,38,...,0,0,0,0,0,0,0,0,0,0
4,Driverless cars would be a Wood idea. It would...,0,driverless car would wood idea would use less ...,619,104,0.168013,27,16,16,49,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,"Throughout Americas history, people have alway...",0,throughout america histori peopl alway own car...,1536,258,0.167969,62,67,42,143,...,0,0,0,0,0,0,0,0,0,0
9995,Some schools other distance learning as an opt...,0,school distanc learn option student attend cla...,674,105,0.155786,20,21,21,55,...,0,0,0,0,0,0,0,0,0,0
9996,"Hey there! So, you know how people always say...",1,hey know peopl alway say posit attitud super i...,847,146,0.172373,85,26,22,64,...,0,0,0,0,0,0,0,0,0,0
9997,"I agree that in twenty years, there will be fe...",1,agre twenti year fewer cab use abe today abe m...,964,161,0.167012,32,23,22,102,...,0,0,0,0,0,0,0,0,0,0


# Saving the dataset

In [34]:
df.to_csv('DATA_SET/pre_final_test.csv', index=False)

KeyboardInterrupt: 

# Training the model

In [2]:
df = pd.read_csv('DATA_SET/pre_final_test.csv')

In [3]:
# Support Vector Machine (SVM)

y = df['label']

X = df.drop(['text', 'label', 'cleaned_text'], axis=1)
# X = df.drop(['text', 'label'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.80
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86       583
           1       0.97      0.45      0.62       325

    accuracy                           0.80       908
   macro avg       0.87      0.72      0.74       908
weighted avg       0.84      0.80      0.78       908



In [4]:
# Random Forest Classifier

y = df['label']

# Features (excluding columns not used for training)
X = df.drop(['text', 'label', 'cleaned_text'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       583
           1       0.99      0.89      0.94       325

    accuracy                           0.96       908
   macro avg       0.96      0.94      0.95       908
weighted avg       0.96      0.96      0.96       908



# Saving the model

In [None]:
# joblib.dump(model, 'svm_model.joblib')

['svm_model.joblib']