In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
import os

# Path to your locally saved file
file_path = 'spam.csv'  # Adjust if the file is not in the current directory

# Check if the file exists
# Check if the file exists
if not os.path.exists(file_path):
    print(f"Error: File {file_path} not found. Please check the path.")
else:
    print(f"File found: {file_path}")

    # Load the CSV. Strict UTF-8 is attempted first because it fails loudly
    # on bytes that are not valid UTF-8. latin-1 (ISO-8859-1 is an alias of
    # the same codec) maps every possible byte to a character, so it can
    # never raise UnicodeDecodeError and must be the last resort: trying it
    # first would make any further fallback unreachable and would silently
    # mis-decode files that are really UTF-8.
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='latin-1')

    # Display dataset info
    print("\nLoaded dataset successfully!")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("\nFirst 5 rows:")
    print(df.head())

    # Check types of columns
    print("\nColumn data types:")
    print(df.dtypes)
    
    # Identify and keep only the label and message columns.
    # Known layouts are checked first; otherwise a content-based heuristic
    # is used. Column names are compared case-insensitively via str() so a
    # non-string header cannot break the lookup.
    lowered_names = [str(name).lower() for name in df.columns]
    if 'v1' in df.columns and 'v2' in df.columns:
        # Standard layout: 'v1' holds the label and 'v2' the message
        label_col, message_col = 'v1', 'v2'
        print("\nUsing standard columns: v1 (label) and v2 (message)")
    elif 'type' in lowered_names and 'text' in lowered_names:
        # Some versions use 'type' and 'text' (in any letter case)
        label_col = df.columns[lowered_names.index('type')]
        message_col = df.columns[lowered_names.index('text')]
        print(f"\nUsing columns: {label_col} (label) and {message_col} (message)")
    else:
        # Try to identify label and message columns from their contents
        label_col = None
        message_col = None

        for col in df.columns:
            column = df[col]
            # Only text-like columns are candidates. Both the classic object
            # dtype and pandas' dedicated string dtype are accepted.
            if not (pd.api.types.is_object_dtype(column)
                    or pd.api.types.is_string_dtype(column)):
                continue

            distinct_values = column.dropna().astype(str).unique()

            # A column containing the literal values 'spam' / 'ham' is the label
            if any(v.lower() in ('spam', 'ham') for v in distinct_values):
                label_col = col
            # A column of longer text (mean length > 20 chars) is the message
            elif column.astype(str).str.len().mean() > 20:
                message_col = col

        # Compare against None explicitly: a falsy column name is still valid.
        if label_col is None or message_col is None:
            print("\nCould not automatically identify label and message columns.")
            print("Available columns:", df.columns.tolist())
            print("Please specify the columns manually in the code.")
            # Stop here: carrying on would only fail later with an opaque
            # KeyError when the script looks up df['label'].
            raise ValueError(
                "Could not identify label and message columns among "
                f"{df.columns.tolist()}"
            )

        print(f"\nAutomatically identified columns: {label_col} (label) and {message_col} (message)")

    # Work on an independent copy so that later column assignments do not
    # target a slice of the originally loaded frame.
    df = df[[label_col, message_col]].copy()
    df.columns = ['label', 'message']
    
    # Check for missing values
    print("\nMissing values in each column:")
    print(df.isnull().sum())

    # Drop rows with missing values if any. The explicit copy keeps the
    # column assignments below from targeting a slice of another frame.
    df = df.dropna().copy()
    print(f"Dataset shape after dropping nulls: {df.shape}")

    # Check unique values in label column to confirm format
    print("\nUnique values in label column:")
    print(df['label'].value_counts())

    # Convert labels to binary: ham = 0, spam = 1.
    # Conversion is attempted for every non-numeric label column. Testing
    # for "not numeric" instead of dtype == 'object' also covers pandas'
    # dedicated string dtype, which does not compare equal to 'object'.
    if not pd.api.types.is_numeric_dtype(df['label']):
        distinct_labels = df['label'].unique()
        if set(distinct_labels) == {0, 1}:
            # Object column that already holds the integers 0 and 1
            print("\nLabels are already in binary format (0, 1)")
        else:
            # Normalise through str() so non-string values cannot raise, and
            # strip whitespace so values such as ' spam' are still recognised.
            normalised = {val: str(val).strip().lower() for val in distinct_labels}
            # 'spam' -> 1; everything else is assumed to be ham -> 0
            label_mapping = {
                val: 1 if norm == 'spam' else 0
                for val, norm in normalised.items()
            }

            # Make the "everything else is ham" assumption visible instead
            # of silently mislabelling unexpected values.
            unexpected = [
                val for val, norm in normalised.items()
                if norm not in ('spam', 'ham')
            ]
            if unexpected:
                print(f"\nWarning: unexpected label values treated as ham (0): {unexpected}")

            df['label'] = df['label'].map(label_mapping)
            print("\nConverted labels to binary format:")
            print(f"Mapping used: {label_mapping}")

    print("\nLabel distribution after conversion:")
    print(df['label'].value_counts())
    
    # Download stopwords if needed.
    # NOTE(review): nothing in the visible part of this cell uses the
    # stopword corpus - presumably it is needed by a later step; confirm.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        print("\nDownloading NLTK stopwords...")
        # nltk.download() reports failure through its boolean return value
        # (and quiet=True hides the error output), so only announce success
        # when the download actually succeeded.
        if nltk.download('stopwords', quiet=True):
            print("Download complete.")
        else:
            print("Warning: NLTK stopwords could not be downloaded.")
    
    # Text cleaning function
    def clean_text(text):
        """
        Normalise a raw message for feature extraction.

        The input is coerced to ``str`` and lowercased; every character in
        ``string.punctuation`` and every run of digits is deleted; finally
        each run of whitespace is collapsed to one space and the result is
        trimmed at both ends.
        """
        # Deleting (rather than replacing) punctuation means "don't" -> "dont"
        punctuation_table = str.maketrans('', '', string.punctuation)
        lowered = str(text).lower()
        without_digits = re.sub(r'\d+', '', lowered.translate(punctuation_table))

        # Digit removal can leave double spaces behind, so collapse them last
        return re.sub(r'\s+', ' ', without_digits).strip()
    
    # Derive the cleaned text column from the raw messages
    print("\nCleaning message text...")
    df['clean_message'] = df['message'].apply(clean_text)

    # Show the first few messages before and after cleaning
    print("\nCleaned message examples:")
    preview = df.head(5)
    for raw_message, cleaned_message in zip(preview['message'], preview['clean_message']):
        print(f"Original: {raw_message}")
        print(f"Cleaned: {cleaned_message}")
        print("-" * 50)

    # Features are the cleaned messages, the target is the binary label
    X = df['clean_message']
    y = df['label']

    print(f"\nFeatures shape: {X.shape}")
    print(f"Target shape: {y.shape}")

    # Persist the cleaned dataset (original message, label and cleaned text)
    cleaned_path = 'cleaned_spam_dataset.csv'
    df.to_csv(cleaned_path, index=False)
    print(f"Cleaned dataset saved to '{cleaned_path}'")

    # Report how many messages fall into each class and their share
    total_messages = len(y)
    ham_count = (y == 0).sum()
    spam_count = (y == 1).sum()
    print("\nClass distribution:")
    print(f"Ham (0): {ham_count} messages ({ham_count / total_messages:.2%})")
    print(f"Spam (1): {spam_count} messages ({spam_count / total_messages:.2%})")

File found: spam.csv

Loaded dataset successfully!
Dataset shape: (5572, 5)
Columns: ['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

First 5 rows:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Column data types:
v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object

Using standard columns: v1 (label) and v2 (message)

Missing values in each column:
label      0
message    0


# SMS Spam Detection: Data Preparation Methodology

## 1. File Loading & Validation
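- **Multiple encoding support**: Text data often contains special characters that require a specific encoding. The loader tries UTF-8 and latin-1 (ISO-8859-1 is simply another name for latin-1). Because latin-1 can decode any byte sequence, the file always loads, although some characters may be decoded incorrectly if the source used a different encoding.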
- **File existence check**: Validates data availability before processing, preventing mid-process failures.

## 2. Dataset Structure Analysis
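- **Column identification**: SMS datasets use inconsistent column names (v1/v2, type/text). The code checks for these known layouts first and otherwise falls back to a heuristic: a text column containing 'spam'/'ham' values is taken as the label, and a text column averaging more than 20 characters is taken as the message. This handles the common variants of the dataset.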
- **Data type verification**: Confirms appropriate data types for each column, preventing type-related errors during processing.

## 3. Data Cleaning
- **Missing value handling**: Removes incomplete records to ensure model training on complete data points only, improving reliability.
- **Label standardization**: Converting text labels ('spam'/'ham') to binary format (1/0) is required for mathematical operations in machine learning algorithms.
  
## 4. Text Preprocessing
- **Case normalization**: Converting all text to lowercase eliminates duplicate features from case variations (e.g., "Free" vs "free").
- **Punctuation removal**: Punctuation rarely contributes to spam classification while increasing feature dimensionality.
- **Number removal**: Numbers in SMS messages are typically contextual and don't strongly indicate spam status independently.
- **Whitespace normalization**: Standardizes spacing to improve token extraction consistency.

## 5. Data Export
- **Creating cleaned dataset file**: Preserves the processed data for direct use in subsequent modeling steps without repeating preprocessing.
- **Class distribution analysis**: Documents the dataset balance, essential for selecting appropriate evaluation metrics and sampling techniques.

## Technical Justification
Each preprocessing step reduces noise while preserving signal, lowering dimensionality while maintaining classification-relevant information. This improves both model training efficiency and generalization performance on unseen data.

The approach follows established NLP preprocessing best practices while being specifically tailored to the spam detection domain, where linguistic patterns rather than individual numbers or special characters typically indicate spam content.