In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
from sklearn.model_selection import train_test_split


In [None]:
from datetime import datetime
def save_plot(filename, folder='../reports/figures', fmt='png'):
    """Save the current matplotlib figure to ``<folder>/<filename>.<fmt>``.

    Parameters
    ----------
    filename : str
        File name without extension.
    folder : str
        Target directory; it is created if it does not exist yet.
    fmt : str
        File extension appended to ``filename``.

    The figure is whatever ``plt.gcf()`` returns at call time, written at
    300 dpi. The saved path is printed as confirmation.
    """
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, f"{filename}.{fmt}")
    plt.gcf().savefig(target, dpi=300)
    print(f"Saved: {target}")

### Set style 

In [None]:
# Apply matplotlib's 'ggplot' style sheet to the figures created below.
plt.style.use('ggplot')
# Use seaborn's "Set2" palette as the default color cycle (set after the
# style sheet so the palette is not overridden by it).
sns.set_palette("Set2")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


### preprocessing data


- The data is read with encoding `iso-8859-1` because the file cannot be decoded as UTF-8.
- The original data has 5 columns in total, but only `v1` and `v2` contain data, so we drop the empty columns by keeping just `v1` and `v2`.
- The columns are renamed (`v1` → `label`, `v2` → `message`) for clarity.


### split data to train and test data

In [None]:
# Load the raw SMS data; the file is read as ISO-8859-1.
df = pd.read_csv('../data/raw/spam-sms.csv', encoding='iso-8859-1')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Keep only the two populated columns and give them descriptive names.
# `rename` returns a new, independent DataFrame, so the feature columns
# added to `train_df` in later cells do not write into a slice of `df`.
column_names = {'v1': 'label', 'v2': 'message'}
train_df = train_df[['v1', 'v2']].rename(columns=column_names)
test_df = test_df[['v1', 'v2']].rename(columns=column_names)

# Make sure the output folder exists before writing (to_csv does not
# create missing directories).
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# index=True keeps the original row index so rows can be traced back to
# the raw file.
test_df.to_csv(os.path.join(processed_dir, 'test_set.csv'), index=True)
train_df.to_csv(os.path.join(processed_dir, 'train_set.csv'), index=True)

# Preview the first rows of the training split.
train_df.head()

In [None]:
# Quick sanity report on the training split: size, null counts per column,
# and how many messages fall into each class.
summary_sections = [
    ("=== shape ===", train_df.shape),
    ("\n=== Missing values ===", train_df.isnull().sum()),
    ("\n=== LABEL DISTRIBUTION ===", train_df['label'].value_counts()),
]

for heading, value in summary_sections:
    print(heading)
    print(value)

In [None]:
# Bar chart of the number of messages per class in the training split.
sns.set_style("whitegrid")
sns.countplot(data=train_df, x='label', hue='label', palette='pastel')
plt.title("Class distribution: Spam vs Ham")
plt.xlabel("label")
plt.ylabel("Count")

# Save while this is still the current figure (save_plot uses plt.gcf()).
save_plot("class_distribution")

# Show first, then close: closing the figure before plt.show() discards it,
# leaving nothing to render in the notebook output.
plt.show()
plt.close()

In [None]:
# Character count of every message, stored as a new feature column.
train_df['length'] = train_df['message'].map(len)

# Per-class histograms (with KDE curves) of the message length.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train_df, x='length', hue='label', kde=True, palette='Set1', ax=ax)
ax.set_title("Message Length Distribution by Class")
ax.set_xlabel("Message Length (characters)")
ax.set_ylabel("Frequency")

# Save before showing, while the figure is still current.
save_plot('message_length_distrubtion')
plt.show()
plt.close()

# Optional (disabled): mean message length per class.
# print("\n=== Average Length ===")
# print(train_df.groupby('label')['length'].mean())

In [None]:
def _uppercase_ratio(text):
    """Return the share of characters in `text` that are uppercase (0 for empty text)."""
    if len(text) == 0:
        return 0
    return sum(1 for ch in text if ch.isupper()) / len(text)


# Two simple surface features: number of '!' characters and the fraction of
# uppercase characters in each message.
messages = train_df['message']
train_df['exclamation_count'] = messages.apply(lambda msg: msg.count('!'))
train_df['uppercase_ratio'] = messages.apply(_uppercase_ratio)

# Compare the class-wise means of both features.
by_label = train_df.groupby('label')

print("\n=== Average Exclamation Marks ===")
print(by_label['exclamation_count'].mean())

print("\n=== Average Uppercase Ratio ===")
print(by_label['uppercase_ratio'].mean())

### EDA 

In [None]:
from collections import Counter 
import re 

def get_words(text):
    """Split a message into lowercase, letters-only words.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is removed, and the result is split on whitespace.

    Whitespace must survive the cleanup: stripping it (as the previous
    pattern ``[^a-zA-Z]`` did) glues the whole message into a single token,
    so word counts would be counts of entire messages.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The words of the message; empty if no letters remain.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return letters_only.split()


# Split the training messages by class; everything not labelled 'spam'
# is counted as ham.
is_spam = train_df['label'] == 'spam'

# Flatten the per-message word lists into one list per class, keeping the
# row order of the training frame.
spam_words = [
    word
    for message in train_df.loc[is_spam, 'message']
    for word in get_words(message)
]
ham_words = [
    word
    for message in train_df.loc[~is_spam, 'message']
    for word in get_words(message)
]

# Word -> number of occurrences, per class.
spam_counter = Counter(spam_words)
ham_counter = Counter(ham_words)

print("\n=== TOP 10 SPAM WORDS ===")
print(spam_counter.most_common(10))

print("\n=== TOP 10 HAM WORDS ===")
print(ham_counter.most_common(10))


In [None]:
from wordcloud import WordCloud

# Concatenate all messages of each class into one space-separated string.
# (Compare against 1 / 0 instead if the labels have been numerically encoded.)
spam_messages = train_df.loc[train_df['label'] == 'spam', 'message']
ham_messages = train_df.loc[train_df['label'] == 'ham', 'message']

spam_text = ' '.join(spam_messages)
ham_text = ' '.join(ham_messages)
# Text normalisation applied before building the word clouds.
def clean_text(text):
    """Lowercase `text` and drop every character that is not an ASCII letter or whitespace."""
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Normalise both corpora before feeding them to the word-cloud generator.
spam_text_clean, ham_text_clean = (clean_text(raw) for raw in (spam_text, ham_text))

# One row, two panels: spam on the left, ham on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))

# Both clouds share the same rendering options.
cloud_options = dict(width=800, height=400, background_color='white')
wc_spam = WordCloud(**cloud_options).generate(spam_text_clean)
wc_ham = WordCloud(**cloud_options).generate(ham_text_clean)

panels = zip(ax, (wc_spam, wc_ham), ("Spam Word Cloud", "Ham Word Cloud"))
for panel, cloud, panel_title in panels:
    panel.imshow(cloud, interpolation='bilinear')
    panel.set_title(panel_title, fontsize=20)
    panel.axis("off")

# Save while the figure is still current, then display it.
save_plot("spam_ham_word_cloud")

plt.show()

In [None]:
# Print five example messages per class. A fixed random_state makes the
# sampled examples identical on every run (same seed as the train/test split).
print("\n== Sample spam messages ==")
print(train_df[train_df['label'] == 'spam']['message'].sample(5, random_state=42).values)
print("\n== Sample ham messages ==")
print(train_df[train_df['label'] == 'ham']['message'].sample(5, random_state=42).values)


In [None]:
# Class counts and their share of the training split.
# The counts column is named explicitly: only pandas >= 2.0 calls the result
# of value_counts() 'count'; older versions reuse the source column name,
# which would make stats['count'] raise a KeyError.
stats = train_df['label'].value_counts().rename('count').to_frame()
stats['percentage'] = (stats['count'] / len(train_df) * 100).round(2)
stats

#### EDA insights - SMS spam Dataset 
- Total messages: 5,572
- Spam: 747 (13.41%) | Ham: 4,825 (86.59%)
- Avg Spam Length: 138 chars | Avg Ham Length: 71 chars → Spam is longer
- Top Spam Words: free, win, prize, call, urgent, mobile, text, now, claim, reply
- Top Ham Words: ok, love, go, time, know, got, day, good, sorry, need
- Spam uses 3x more "!" and 2x more uppercase letters 