# Text Preprocessing

In [1]:
import kagglehub

# Download the latest version of the dataset through kagglehub; `path` holds
# the value returned by dataset_download (printed below as the dataset folder).
path = kagglehub.dataset_download("saurabhshahane/twitter-sentiment-dataset")

# Show where the dataset files are located
print("Path to dataset files:", path)

Path to dataset files: C:\Users\rahil\.cache\kagglehub\datasets\saurabhshahane\twitter-sentiment-dataset\versions\1


In [2]:
import pandas as pd
import os

# Reuse the folder returned by kagglehub.dataset_download() in the first cell
# instead of a hard-coded, user-specific absolute path, so the notebook runs
# unchanged on another machine or when the dataset version changes.
dataset_path = path

# List available files in the dataset folder
print("Files in dataset folder:", os.listdir(dataset_path))

# Load the dataset (update the filename based on the actual dataset)
file_name = "Twitter_Data.csv"  # Change this if needed
data = pd.read_csv(os.path.join(dataset_path, file_name))

  from pandas.core import (


Files in dataset folder: ['Twitter_Data.csv']


In [3]:
# Rename the dataset's "clean_text" column to "text"; a fresh "clean_text"
# working column is created from it in a later cell.
data = data.rename(columns={"clean_text": "text"})

In [4]:
# Display the first few rows of the loaded frame (text and category columns)
data.head()

Unnamed: 0,text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [5]:
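# (Disabled) Drop the 'category' column -- left commented out, so 'category' stays in the frame and is reordered in a later cell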
#data.drop(columns=["category"], inplace=True)

In [6]:
# Add a 'clean_text' working column that starts out equal to 'text';
# the cleaning steps below rewrite 'clean_text' and leave 'text' untouched.
data = data.assign(clean_text=data["text"])

In [7]:
# Put the label last: original text, working copy, then 'category'
column_order = ["text", "clean_text", "category"]
data = data.loc[:, column_order]

# Preview the reordered frame
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,answer who among these the most powerful world...,1.0


## Removal of punctuations

In [12]:
import string
import re
import unicodedata

# string.punctuation is a fixed ASCII-only set; non-ASCII marks such as the
# curly quotes seen in the data preview are not part of it.
print(string.punctuation)  # Displays the ASCII punctuation characters

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
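# (Disabled) Regex alternative for punctuation removal; not executed -- the remove_punctuation() function defined below is applied instead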
# data["clean_text"] = data["clean_text"].apply(lambda x: re.sub(r"[^\w\s]", "", str(x)))

### 🔹 Why do we need both ASCII & Unicode removal?
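- `string.punctuation` covers **only the ASCII symbols** (like `! , . ?`). Some of them, such as `$ + < = > ^ | ~`, are classified by Unicode as *symbols* (category `S`), not punctuation.
- `unicodedata.category(char).startswith("P")` **removes every character in a Unicode punctuation category**, including curly quotes and non-ASCII marks, but on its own it would leave those ASCII symbols behind.
- Using both checks together removes the full ASCII set *and* all non-ASCII punctuation.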


In [14]:
# Define function to remove all punctuation, including special Unicode characters

# Precomputed once so the per-character membership test is a set lookup.
_ASCII_PUNCTUATION = frozenset(string.punctuation)


def remove_punctuation(text):
    """Return `text` with ASCII and Unicode punctuation characters removed.

    A character is dropped when it is in ``string.punctuation`` or when its
    Unicode general category starts with "P". Both checks are needed: several
    ASCII entries (e.g. ``$``, ``+``, ``~``) are Unicode symbols (category "S"),
    while curly quotes and similar marks are not in ``string.punctuation``.

    Parameters
    ----------
    text : any
        Value to clean. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) yield an empty string rather
        than the literal tokens "None" / "nan".

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)  # Ensure text is a string
    return "".join(
        char
        for char in text
        if char not in _ASCII_PUNCTUATION
        and not unicodedata.category(char).startswith("P")
    )

In [15]:
# Run remove_punctuation on every entry of clean_text and store the result back
data["clean_text"] = data["clean_text"].map(remove_punctuation)

In [17]:
# Preview the frame after punctuation removal (compare 'text' with 'clean_text')
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,answer who among these the most powerful world...,1.0


## Removal of Stopwords

In [18]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already installed
# (fetches the NLTK 'stopwords' package; the recorded output shows it being
# unzipped into the local nltk_data folder).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
# Checking the stopwords
# Displays the full English stopword list; entries are lowercase and include
# contractions with apostrophes (e.g. "aren't"), as the output shows.

stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [21]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _stopword_set(language):
    """Return the NLTK stopword list for `language` as a frozenset, loaded once."""
    return frozenset(stopwords.words(language))


def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is split on whitespace, each word is compared case-insensitively
    against the NLTK English stopword list, and the surviving words are
    re-joined with single spaces (so runs of whitespace are collapsed).

    The stopword set is cached by `_stopword_set`, so applying this function
    to every row does not re-read and re-build the list on each call.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The text without stopwords; an empty string if nothing remains.
    """
    stop_words = _stopword_set('english')
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

In [22]:
# Filter stopwords out of every clean_text entry
data["clean_text"] = data["clean_text"].map(remove_stopwords)

In [23]:
# Display the frame after stopword removal
data

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,modi promised minimum government maximum gover...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsense continue drama vote modi,0.0
2,what did just say vote for modi welcome bjp t...,say vote modi welcome bjp told rahul main camp...,1.0
3,asking his supporters prefix chowkidar their n...,asking supporters prefix chowkidar names modi ...,1.0
4,answer who among these the most powerful world...,answer among powerful world leader today trump...,1.0
...,...,...,...
162975,why these 456 crores paid neerav modi not reco...,456 crores paid neerav modi recovered congress...,-1.0
162976,dear rss terrorist payal gawar what about modi...,dear rss terrorist payal gawar modi killing 10...,-1.0
162977,did you cover her interaction forum where she ...,cover interaction forum left,0.0
162978,there big project came into india modi dream p...,big project came india modi dream project happ...,0.0


## Removal of Frequent words

In [29]:
from collections import Counter

def remove_frequent_words(data, column, top_n=10):
    """Strip the `top_n` most common words from `data[column]`, in place.

    Words are whitespace-separated tokens. Frequencies are counted over the
    whole column before anything is removed, and the column is then rewritten
    with those words dropped and the remaining words joined by single spaces.

    Parameters
    ----------
    data : DataFrame-like
        Frame whose `column` holds strings; it is modified in place.
    column : str
        Name of the text column.
    top_n : int, default 10
        Number of most frequent words to remove.

    Returns
    -------
    list of (word, count)
        The removed words with their pre-removal counts, most frequent first.
    """
    corpus_tokens = " ".join(data[column]).split()
    counts = Counter(corpus_tokens)
    top_pairs = counts.most_common(top_n)
    banned = {token for token, _count in top_pairs}

    def drop_banned(sentence):
        # Keep only the tokens that are not among the most frequent ones.
        return " ".join(token for token in sentence.split() if token not in banned)

    data[column] = data[column].apply(drop_banned)
    return top_pairs

In [41]:
# Check the top 10 most frequent words after removal
# NOTE(review): no visible cell calls remove_frequent_words(), although this
# comment and the recorded outputs imply frequent words were already removed.
# Confirm the call exists; on a fresh Restart & Run All this cell would
# otherwise report the pre-removal top words.
all_words = " ".join(data["clean_text"]).split()
word_freq = Counter(all_words)
word_freq.most_common(10)  # Display new top 10 words

[('elections', 4176),
 ('chowkidar', 4167),
 ('never', 4129),
 ('2019', 4084),
 ('work', 4012),
 ('opposition', 3989),
 ('support', 3977),
 ('new', 3823),
 ('many', 3802),
 ('today', 3785)]

In [43]:
# Preview clean_text after the frequent-word removal step
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promised minimum maximum governance expected b...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsense continue drama,0.0
2,what did just say vote for modi welcome bjp t...,welcome told main campaigner relax,1.0
3,asking his supporters prefix chowkidar their n...,asking supporters prefix chowkidar names great...,1.0
4,answer who among these the most powerful world...,answer among powerful world leader today trump...,1.0


## Removal of Rare words

🔹 **Why Remove Rare Words?**  
- They may be **typos or misspellings** (e.g., `"modiij"`, `"democrcy"`).  
- They could be **random noise or outliers** (e.g., `"zxxqv"`, `"aaiwlfja"`).  
- They might be **too specific** (e.g., `"parliament_session_1998"`).  
- They could be **foreign words** in an English dataset (e.g., `"पॉलिटिक्स"`, `"政党"`).  

🔹 **How to Identify?**  
- Words that appear **very few times** in a large dataset (e.g., 1-2 times in 100,000 words).

In [49]:
def remove_rare_words(data, column, min_freq=2):
    """Drop words occurring fewer than `min_freq` times in `data[column]`.

    Words are whitespace-separated tokens counted across the whole column.
    The column is overwritten in place with the filtered texts, with the
    remaining words joined by single spaces.

    Parameters
    ----------
    data : DataFrame-like
        Frame whose `column` holds strings; it is modified in place.
    column : str
        Name of the text column.
    min_freq : int, default 2
        Minimum corpus-wide count a word needs in order to be kept.

    Returns
    -------
    The same `data` object, for convenient reassignment.
    """
    from collections import Counter

    # Corpus-wide token counts
    counts = Counter(" ".join(data[column]).split())

    def keep_common(sentence):
        # A Counter returns 0 for unseen tokens, so the lookup never fails.
        return " ".join(tok for tok in sentence.split() if counts[tok] >= min_freq)

    # Write the filtered texts back, row by row in the original order
    data[column] = [keep_common(sentence) for sentence in data[column]]
    return data

# Remove words that appear only once (rare words)
# min_freq=2 keeps every word seen at least twice across clean_text; the
# function also rewrites the column in place on the frame passed in.
data = remove_rare_words(data, "clean_text", min_freq=2)



In [50]:
# Display the frame after rare-word removal
data

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promised minimum maximum governance expected b...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsense continue drama,0.0
2,what did just say vote for modi welcome bjp t...,welcome told main campaigner relax,1.0
3,asking his supporters prefix chowkidar their n...,asking supporters prefix chowkidar names great...,1.0
4,answer who among these the most powerful world...,answer among powerful world leader today trump...,1.0
...,...,...,...
162975,why these 456 crores paid neerav modi not reco...,456 crores paid neerav recovered leaders hard ...,-1.0
162976,dear rss terrorist payal gawar what about modi...,dear rss terrorist payal gawar killing 1000 pl...,-1.0
162977,did you cover her interaction forum where she ...,cover interaction forum left,0.0
162978,there big project came into india modi dream p...,big project came dream project happened reality,0.0


### Re-checking whether all rare words were removed

In [51]:
from collections import Counter

# Recalculate word frequencies after removal
all_words_after = " ".join(data["clean_text"]).split()
word_freq_after = Counter(all_words_after)

# Find words that appear less than `min_freq` times
# (the literal 2 mirrors the min_freq=2 passed to remove_rare_words above)
rare_words_remaining = {word: count for word, count in word_freq_after.items() if count < 2}

# Display rare words (if any); an empty dict means none are left
print(rare_words_remaining)


{}


In [52]:
# Smallest count among the remaining words; min() raises ValueError if
# word_freq_after is empty.
min_count = min(word_freq_after.values())  # Get the lowest word count
print(f"Minimum word frequency after removal: {min_count}")

Minimum word frequency after removal: 2


## Remove Special Characters

In [53]:
import re

def remove_special_characters(data, column):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    The pattern keeps only ``a-z``, ``A-Z``, ``0-9`` and whitespace, so
    underscores and non-ASCII letters are removed as well. `data[column]` is
    overwritten in place and the same `data` object is returned.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9\s]")

    def scrub(value):
        return disallowed.sub("", value)

    data[column] = data[column].apply(scrub)
    return data

# Apply the function to clean_text (the column is rewritten in place and the
# same frame is returned, so the reassignment keeps `data` pointing at it)
data = remove_special_characters(data, "clean_text")


In [55]:
# Preview clean_text after special-character removal
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promised minimum maximum governance expected b...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsense continue drama,0.0
2,what did just say vote for modi welcome bjp t...,welcome told main campaigner relax,1.0
3,asking his supporters prefix chowkidar their n...,asking supporters prefix chowkidar names great...,1.0
4,answer who among these the most powerful world...,answer among powerful world leader today trump...,1.0


## Stemming

In [56]:
from nltk.stem import PorterStemmer

def apply_stemming(data, column):
    """Replace each word in `data[column]` with its Porter stem.

    Every entry is split on whitespace, each word is stemmed with a single
    shared PorterStemmer, and the stems are re-joined with single spaces.
    The column is overwritten in place and the same `data` object is returned.
    """
    porter = PorterStemmer()

    def stem_sentence(sentence):
        return " ".join(map(porter.stem, sentence.split()))

    data[column] = data[column].apply(stem_sentence)
    return data

# Apply Stemming to clean_text (column is rewritten in place; see the preview
# below for the resulting stems, e.g. "promised" -> "promis")
data = apply_stemming(data, "clean_text")


In [57]:
# Preview clean_text after stemming
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promis minimum maximum govern expect begin dif...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsens continu drama,0.0
2,what did just say vote for modi welcome bjp t...,welcom told main campaign relax,1.0
3,asking his supporters prefix chowkidar their n...,ask support prefix chowkidar name great servic...,1.0
4,answer who among these the most powerful world...,answer among power world leader today trump pu...,1.0


In [58]:
from nltk.stem import WordNetLemmatizer

def apply_lemmatization(data, column):
    """Lemmatize each word in `data[column]` with NLTK's WordNetLemmatizer.

    Every entry is split on whitespace, each word is passed to
    ``lemmatize`` with its default arguments, and the results are re-joined
    with single spaces. The column is overwritten in place and the same
    `data` object is returned.
    """
    wordnet = WordNetLemmatizer()

    def lemmatize_sentence(sentence):
        return " ".join(map(wordnet.lemmatize, sentence.split()))

    data[column] = data[column].apply(lemmatize_sentence)
    return data

# Apply Lemmatization
# NOTE(review): this runs on text that was already stemmed in the previous
# step; the recorded preview after this cell is identical to the one after
# stemming. Also, no cell visible here downloads the WordNet data -- confirm
# it is available on a fresh kernel.
data = apply_lemmatization(data, "clean_text")

In [59]:
# Preview clean_text after lemmatization
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promis minimum maximum govern expect begin dif...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsens continu drama,0.0
2,what did just say vote for modi welcome bjp t...,welcom told main campaign relax,1.0
3,asking his supporters prefix chowkidar their n...,ask support prefix chowkidar name great servic...,1.0
4,answer who among these the most powerful world...,answer among power world leader today trump pu...,1.0


## Remove URLs

In [60]:
import re

def remove_urls(data, column):
    """Delete URLs from every entry of `data[column]`.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace. Matches are replaced by an empty
    string, so the surrounding spaces are left as they are. The column is
    overwritten in place and the same `data` object is returned.
    """
    url_regex = re.compile(r"https?://\S+|www\.\S+")

    def strip_urls(value):
        return url_regex.sub("", value)

    data[column] = data[column].apply(strip_urls)
    return data

# Apply URL Removal
# NOTE(review): by this point remove_punctuation and remove_special_characters
# have already stripped ':', '/' and '.' from clean_text, so the URL pattern
# can no longer match; consider running URL removal before punctuation removal.
data = remove_urls(data, "clean_text")

In [61]:
# Preview clean_text after URL removal
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promis minimum maximum govern expect begin dif...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsens continu drama,0.0
2,what did just say vote for modi welcome bjp t...,welcom told main campaign relax,1.0
3,asking his supporters prefix chowkidar their n...,ask support prefix chowkidar name great servic...,1.0
4,answer who among these the most powerful world...,answer among power world leader today trump pu...,1.0


## Removal of HTML Tags

In [62]:
import re

def remove_html_tags(data, column):
    """Delete ``<...>`` spans from every entry of `data[column]`.

    The pattern matches a ``<``, the shortest run of characters other than a
    newline, and a closing ``>``; each match is replaced by an empty string.
    The column is overwritten in place and the same `data` object is returned.
    """
    tag_regex = re.compile(r"<.*?>")

    def strip_tags(value):
        return tag_regex.sub("", value)

    data[column] = data[column].apply(strip_tags)
    return data

# Apply HTML Tag Removal
# NOTE(review): '<' and '>' are in string.punctuation and were already removed
# from clean_text by the punctuation step, so this pattern can no longer
# match; consider running HTML tag removal before punctuation removal.
data = remove_html_tags(data, "clean_text")


In [72]:
# Preview clean_text after HTML tag removal
data.head()

Unnamed: 0,text,clean_text,category
0,when modi promised “minimum government maximum...,promis minimum maximum govern expect begin dif...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsens continu drama,0.0
2,what did just say vote for modi welcome bjp t...,welcom told main campaign relax,1.0
3,asking his supporters prefix chowkidar their n...,ask support prefix chowkidar name great servic...,1.0
4,answer who among these the most powerful world...,answer among power world leader today trump pu...,1.0


## Spell checker