# LLM Text Classification - Data Cleaning

In [1]:
# Silence every Python warning for the rest of the session so that the
# notebook output stays uncluttered.
# NOTE(review): a blanket 'ignore' also hides warnings that may point at real
# problems (e.g. pandas copy/assignment warnings) - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [13]:
# required modules
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on; nltk.download() reports
# "already up-to-date" and returns quickly when a package is present.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases, while 'punkt' is kept for older ones, so the tokenize cell works
# on a fresh environment with either.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the raw dataset from its CSV file (path is relative to the notebook)
# and preview the first rows.
raw_csv_path = '../data/data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,_id,text,generated
0,65759d3853d009e5e5a2cb96,Cars. Cars have been around since they became ...,0
1,65759d3853d009e5e5a2cb97,Transportation is a large necessity in most co...,0
2,65759d3853d009e5e5a2cb98,"""America's love affair with it's vehicles seem...",0
3,65759d3853d009e5e5a2cb99,How often do you ride in a car? Do you drive a...,0
4,65759d3853d009e5e5a2cb9a,Cars are a wonderful thing. They are perhaps o...,0


In [5]:
# Keep only the columns needed for classification: the essay text and its
# 'generated' label.
# .copy() makes the result an independent DataFrame; without it the later
# `df['text'] = ...` assignments write into a slice of the loaded frame, which
# is the situation pandas flags with SettingWithCopyWarning.
df = df[['text', 'generated']].copy()
df.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0


In [7]:
# Remove symbols, digits and punctuation from the text, collapse extra
# whitespace and convert everything to lower case.

# Compiled once here rather than on every call of clean_text (one call per row).
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalize one document to lowercase letters separated by single spaces.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the ``None``/``NaN`` value of a missing cell) is treated as empty.

    Returns:
        The lowercased text with every character other than ASCII letters and
        whitespace deleted, each run of whitespace (spaces, tabs, newlines)
        collapsed into one space, and no leading or trailing space.
    """
    # Missing values have no .lower(); return an empty document instead of
    # failing the whole column with an AttributeError.
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    # Unwanted characters are deleted, not replaced by a space, so "don't"
    # becomes "dont" rather than being split into two words.
    letters_only = _NON_LETTER_PATTERN.sub('', lowered)
    # Collapse newlines, tabs and repeated spaces, then trim both ends.
    return _WHITESPACE_PATTERN.sub(' ', letters_only).strip()

# Clean every document and store the result back in the 'text' column.
cleaned_texts = df['text'].apply(clean_text)
df['text'] = cleaned_texts

In [8]:
# Split each text into word tokens so that stopwords can be filtered out in
# the next step.
def tokenize_text(text):
    """Return the list of tokens that ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

# Replace each cleaned string in 'text' with its list of tokens.
# NOTE(review): the column now holds lists instead of strings, so this cell
# looks unsafe to re-run without reloading `df` - confirm before re-executing.
token_lists = df['text'].apply(tokenize_text)
df['text'] = token_lists

In [9]:
# Remove stopwords from tokenized text.

# Stopword sets already loaded from the NLTK corpus, keyed by language, so the
# corpus is read once instead of once per row.
_STOP_WORDS_CACHE = {}


def remove_stopwords(tokens, stop_words=None, language='english'):
    """Drop stopwords from a list of tokens, keeping the original order.

    Args:
        tokens: Iterable of word tokens.
        stop_words: Optional container of words to drop (a set is recommended
            for fast lookups). When omitted, the NLTK stopword list for
            ``language`` is used.
        language: Name of the NLTK stopword list to load when ``stop_words``
            is not given. Defaults to English.

    Returns:
        A new list containing the tokens that are not stopwords.
    """
    if stop_words is None:
        if language not in _STOP_WORDS_CACHE:
            # Loaded lazily on first use and reused for every later call.
            _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
        stop_words = _STOP_WORDS_CACHE[language]
    return [word for word in tokens if word not in stop_words]

# Filter the stopwords out of every token list in the 'text' column.
tokens_without_stopwords = df['text'].apply(remove_stopwords)
df['text'] = tokens_without_stopwords

In [10]:
# Reduce every token to its stem.
def stem_tokens(tokens):
    """Return the Porter stem of each token, in the same order as the input."""
    # A fresh PorterStemmer is created per call; only its stem method is needed.
    stem = PorterStemmer().stem
    return [stem(token) for token in tokens]

# Stem every token list in the 'text' column.
stemmed_token_lists = df['text'].apply(stem_tokens)
df['text'] = stemmed_token_lists

In [11]:
# Rebuild a single text string from the tokens left after the cleaning steps.
def filtered_text(tokens_list):
    """Join the tokens into one space-separated string.

    An empty token list gives an empty string.
    """
    separator = ' '
    joined = separator.join(tokens_list)
    return joined

# Turn every token list in the 'text' column back into a plain string.
joined_texts = df['text'].apply(filtered_text)
df['text'] = joined_texts

In [12]:
# Preview the first ten rows of the cleaned dataset.
df.head(n=10)

Unnamed: 0,text,generated
0,car car around sinc becam famou henri ford cre...,0
1,transport larg necess countri worldwid doubt c...,0
2,america love affair vehicl seem cool say elisa...,0
3,often ride car drive one motor vehicl work sto...,0
4,car wonder thing perhap one world greatest adv...,0
5,electrol colleg system unfair system peopl don...,0
6,dear state senat utmost respect ask method pre...,0
7,fellow citizen car becom major role daili live...,0
8,offici elector colleg unfair outdat irrat plum...,0
9,elector colleg kept centuri establish found fa...,0


In [14]:
# save the cleaned data to disk

# Write the file into the data directory through a path relative to the
# notebook, the same way the raw CSV is read from '../data/data.csv'.
# os.chdir() is deliberately avoided: changing the working directory is a
# process-wide side effect that re-bases every other relative path in the
# notebook.
# index=False keeps the row index out of the file, so reloading it does not
# produce an extra 'Unnamed: 0' column.
df.to_csv('../data/filtered_data.csv', index=False)