In [1]:
import pandas as pd
import spacy
from nltk.corpus import stopwords
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [2]:
# Read the IPC sections training split from its parquet file and preview the first rows
dataset_uri = "hf://datasets/karan842/ipc-sections/data/train-00000-of-00001.parquet"
df = pd.read_parquet(dataset_uri)
df.head(n=5)

Unnamed: 0,Description,Offense,Punishment,Section
0,Description of IPC Section 140 According to se...,Wearing the dress or carrying any token used b...,3 Months or Fine or Both,IPC_140
1,Description of IPC Section 127 According to se...,Receiving property taken by war or depredation...,7 Years + Fine + forfeiture of property,IPC_127
2,Description of IPC Section 128 According to se...,Public servant voluntarily allowing prisoner o...,Imprisonment for Life or 10 Years + Fine,IPC_128
3,Description of IPC Section 129 According to se...,Public servant negligently suffering prisoner ...,Simple Imprisonment 3 Years + Fine,IPC_129
4,Description of IPC Section 130 According to se...,"Aiding escape of, rescuing or harbouring, such...",Imprisonment for Life or 10 Years + Fine,IPC_130


In [3]:
# Load spaCy's small English pipeline and collect NLTK's English stopwords into a set
nlp = spacy.load("en_core_web_sm")
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)

In [4]:
# Step 1: Clean the description and section columns
def clean_description(text):
    """Strip the "Description of IPC Section <number>" boilerplate from a description.

    Args:
        text: Raw description string. Non-string values (e.g. ``None`` or
            ``NaN`` standing in for a missing description) are returned
            unchanged instead of raising.

    Returns:
        The description without the boilerplate heading and with every run of
        whitespace collapsed to a single space (leading/trailing whitespace
        removed).
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash re.sub / str.split.
        return text

    # Section numbers are not always exactly three digits: accept any digit
    # count plus an optional one- or two-letter suffix (e.g. "34", "120B").
    # The suffix only counts when it ends at a word boundary, so the capital
    # letter of a directly following word is never swallowed.
    text = re.sub(r"Description of IPC Section \d+(?:[A-Z]{1,2}\b)?", "", text)

    # Collapse whitespace runs and trim both ends.
    return " ".join(text.split())

# Strip the boilerplate heading from every description
df["Description"] = df["Description"].map(clean_description)

# Drop the literal "IPC_" text so only the section identifier remains
df["Section"] = df["Section"].str.replace(pat="IPC_", repl="", regex=False)

In [5]:
# Step 2: Normalize text
def normalize_text(text):
    """Lowercase a text value and collapse its whitespace.

    Args:
        text: Text to normalize. ``None``, ``NaN`` and the empty string are
            treated as missing.

    Returns:
        The lowercased text with whitespace runs collapsed to single spaces
        and both ends trimmed, or ``None`` when the input is missing.
    """
    # NaN is a truthy float, so the falsy test alone would let it through and
    # crash on the string methods below; NaN is the only float that is not
    # equal to itself.
    if not text or (isinstance(text, float) and text != text):
        return None
    # Lowercasing already standardizes phrases such as "Life imprisonment",
    # so no separate phrase replacement is needed.
    return " ".join(text.lower().split())

# Run normalize_text over each free-text column, overwriting the column in df
text_columns = ["Description", "Offense", "Punishment"]  # Add other relevant columns if needed
for col in text_columns:
    normalized_column = df[col].map(normalize_text)
    df[col] = normalized_column

In [6]:
# Step 3: Keep only these four columns, in their final order
column_order = ["Section", "Description", "Offense", "Punishment"]
df_cleaned = df[column_order]

In [7]:
# Step 4: Write the cleaned frame to CSV, leaving out the index column
output_path = "final_dataset.csv"
df_cleaned.to_csv(output_path, index=False)

In [8]:
# Print a label, then show the first rows of the cleaned frame as the cell output
preview_label = "Cleaned Dataset Preview:"
print(preview_label)
df_cleaned.head(n=5)

Cleaned Dataset Preview:


Unnamed: 0,Section,Description,Offense,Punishment
0,140,"according to section 140 of indian penal code,...",wearing the dress or carrying any token used b...,3 months or fine or both
1,127,"according to section 127 of indian penal code,...",receiving property taken by war or depredation...,7 years + fine + forfeiture of property
2,128,"according to section 128 of indian penal code,...",public servant voluntarily allowing prisoner o...,imprisonment for life or 10 years + fine
3,129,"according to section 129 of indian penal code,...",public servant negligently suffering prisoner ...,simple imprisonment 3 years + fine
4,130,"according to section 130 of indian penal code,...","aiding escape of, rescuing or harbouring, such...",imprisonment for life or 10 years + fine
