In [1]:
# importing modules

# data manupulation
import numpy as np
import pandas as pd
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# text cleaning
import string
import re
# nlp preprosessing
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import chardet
with open('spam.csv', 'rb') as f:
    result = chardet.detect(f.read())
print(result)
df = pd.read_csv('spam.csv', encoding=result['encoding'])


{'encoding': 'Windows-1252', 'confidence': 0.7257971165545478, 'language': ''}


In [3]:
df = pd.read_csv('spam.csv', encoding='Windows-1252')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
email = df['v2']
email

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

#### Text Cleaning

Steps to be followed

1. **Completeness** - check at missing values and handle them
2. **Consistency** - check for duplicate values
3. **Uniformity** - we will rename our columnsas well us cleaning our texts by removing punctuations capital letters 


In [5]:
import re  # Importing the re module for regular expressions

class DataCleaning:
    def __init__(self, df):
        self.df = df

    def missing_values(self):  # Checking for percentage of missing values in each column
        missing_values = self.df.isnull().sum()
        missing_percentage = self.df.isnull().mean() * 100
        return missing_percentage
    
    def drop_columns(self, columns):
        self.df = self.df.drop(columns=columns, axis=1)
        return self.df

    def change_col_name(self):  # Changing column names for uniformity
      self.df = self.df.rename(columns={'v1': 'email_type', 'v2': 'email'})
      return self.df
    
    def check_duplicates(self):
        duplicates = self.df.duplicated().sum()
        print(f"Number of duplicate rows: {duplicates}")


    @staticmethod
    def clean_email(email):  # Removing punctuations, uppercase, white spaces, 
        if isinstance(email, str):
            email = email.lower()
            email = re.sub(r'[^\w\s]', '', email)
            email = re.sub(r'\s+', ' ', email).strip()
            email = re.sub(r'@\w+', '', email)
        return email

   

**Completeness**

In [6]:
clean_df = DataCleaning(df)
missing_percentages = clean_df.missing_values()
missing_percentages

v1             0.000000
v2             0.000000
Unnamed: 2    99.102656
Unnamed: 3    99.784637
Unnamed: 4    99.892319
dtype: float64

In [7]:
clean_df = DataCleaning(df)
df = clean_df.drop_columns(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])


**Consistency**

In [8]:
# Now call the check_duplicates method
clean_df.check_duplicates()

Number of duplicate rows: 403


In [9]:
def print_duplicate_values_and_columns(dataframe):
    for col in df.columns:
      duplicates = df[df.duplicated(subset=[col], keep=False)][col]
      if not duplicates.empty:
          print(f"Column: {col}")
          print(duplicates.value_counts())
          print()


duplicate = print_duplicate_values_and_columns(df)

duplicate

Column: v1
v1
ham     4825
spam     747
Name: count, dtype: int64

Column: v2
v2
Sorry, I'll call later                                                                                                                                                                                                                                                                                                                       30
I cant pick the phone right now. Pls send a message                                                                                                                                                                                                                                                                                          12
Ok...                                                                                                                                                                                                                                                  

In [10]:
#dropping the duplicate values
df.drop_duplicates()
#confirming duplicates dropped
df.shape

(5572, 2)

**Uniformity**

In [11]:
# # changing columns names by calling the class method
df = clean_df.change_col_name()


In [12]:

df['email_type'] = df['email_type'].replace({'ham':
                         'Not spam'})

In [13]:
# doing text cleaning  by removing @, punctuation ,uppercase by hep of our class method
df['cleaned_email'] = df['email'].apply(DataCleaning.clean_email)
df['cleaned_email'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: cleaned_email, dtype: object

**Text processing**

In [14]:

class TextProcessing:
    def __init__(self, df, column_name):
        self.df = df
        self.column_name = column_name
        self.sw = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    
    def tokenize_text(self):
        # Tokenize the text in the specified column
        self.df['tokenized'] = self.df[self.column_name].apply(word_tokenize)
        return self.df

    def remove_stopwords(self):
        # Remove stopwords from the tokenized text
        self.df['no_stopwords'] = self.df['tokenized'].apply(
            lambda x: [word for word in x if word not in self.sw]
        )
        return self.df

    def lemmatize_text(self):
        # Lemmatize the text
        self.df['lemmatized'] = self.df['no_stopwords'].apply(
            lambda x: [self.lemmatizer.lemmatize(word) for word in x]
        )
        return self.df



text_processor = TextProcessing(df, 'cleaned_email')







**Tokenization**

In [15]:
# Tokenize the text
df = text_processor.tokenize_text()
df['tokenized'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: tokenized, dtype: object

**Removing stop words**

In [16]:
df = text_processor.remove_stopwords()
df['no_stopwords'].head()


0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: no_stopwords, dtype: object

**Normallization**

Using **Lemmatization** 

In [17]:
df = text_processor.lemmatize_text()
df['lemmatized']

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, go, usf, life, around, though]
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, å750, pou...
5568                  [ì_, b, going, esplanade, fr, home]
5569                      [pity, mood, soany, suggestion]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: lemmatized, Length: 5572, dtype: object

In [18]:
df

Unnamed: 0,email_type,email,cleaned_email,tokenized,no_stopwords,lemmatized
0,Not spam,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,Not spam,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,Not spam,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,Not spam,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]"
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,...","[2nd, time, tried, 2, contact, u, u, å750, pou...","[2nd, time, tried, 2, contact, u, u, å750, pou..."
5568,Not spam,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home,"[will, ì_, b, going, to, esplanade, fr, home]","[ì_, b, going, esplanade, fr, home]","[ì_, b, going, esplanade, fr, home]"
5569,Not spam,"Pity, * was in mood for that. So...any other s...",pity was in mood for that soany other suggestions,"[pity, was, in, mood, for, that, soany, other,...","[pity, mood, soany, suggestions]","[pity, mood, soany, suggestion]"
5570,Not spam,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like id ...,"[the, guy, did, some, bitching, but, i, acted,...","[guy, bitching, acted, like, id, interested, b...","[guy, bitching, acted, like, id, interested, b..."


In [19]:
#joinging the lemmatized email
df['lemmatized'] = df['lemmatized'].apply(lambda x: ' '.join(x))

In [20]:
processed_df = df