# 1. Data Cleaning


### 1.1 Importing the libraries

In [None]:
import numpy as np
import pandas as pd

### 1.2 Importing datasets

In [None]:
# Load the raw spam CSV; the file is decoded as ISO-8859-1 (Latin-1).
dataset = pd.read_csv(
    './spam.csv',
    encoding='ISO-8859-1',
)
# Preview five randomly chosen rows.
dataset.sample(n=5)

### 1.3 Data cleaning

#### Delete the last 3 columns

In [None]:
# The three 'Unnamed' columns are removed from the frame in place.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
dataset.drop(columns=unused_columns, inplace=True)
# Preview five randomly chosen rows of the trimmed frame.
dataset.sample(n=5)

#### Rename column

In [None]:
# Give the raw columns descriptive names: v1 -> Output (label), v2 -> Text.
column_names = {'v1': 'Output', 'v2': 'Text'}
dataset.rename(columns=column_names, inplace=True)
# Preview five randomly chosen rows with the new headers.
dataset.sample(n=5)

### 1.4 Encoding categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the text labels in 'Output' with integer codes.
# Later cells treat 1 as spam and 0 as ham.
lb = LabelEncoder()
lb.fit(dataset['Output'])
dataset['Output'] = lb.transform(dataset['Output'])
# Show the first rows to verify the encoded column.
dataset.head(5)


### 1.5 Handling missing and duplicate data


#### Checking missing data


In [None]:
# Number of missing values in each column.
dataset.isna().sum()

#### Checking duplicate data

In [None]:
# Number of rows that repeat an earlier row (the first occurrence is not counted).
dataset.duplicated(keep='first').sum()

#### Removing duplicates


In [None]:
# Keep the first occurrence of every repeated row and discard the rest
# (keeping the first occurrence is the drop_duplicates default).
dataset = dataset.drop_duplicates()
# Re-count the duplicates to confirm that none remain.
dataset.duplicated().sum()

# 2. EDA

#### 2.1 Checking category distribution

In [None]:
# Number of messages per encoded class in 'Output'.
class_counts = dataset['Output'].value_counts()
class_counts


In [None]:
import matplotlib.pyplot as plt
# Pie chart of the class distribution.
# The slice labels are derived from the index of the counts, so every label
# always matches its own slice; a fixed ['ham', 'spam'] list is only correct
# while ham happens to be the most frequent class.
class_counts = dataset['Output'].value_counts()
# LabelEncoder assigns codes in sorted label order: 'ham' -> 0, 'spam' -> 1.
class_names = {0: 'ham', 1: 'spam'}
class_labels = [class_names[code] for code in class_counts.index]
plt.pie(class_counts, labels=class_labels, autopct="%0.2f")
plt.show()

#### 2.2 Checking data imbalance

Count the number of characters, words and sentences in each text (NLTK provides the tokenizers)

In [None]:
import nltk

Count the number of characters and words in each text

In [None]:
# Length of every message in characters.
text_lengths = dataset['Text'].map(len)
dataset['num_characters'] = text_lengths
# Show the first rows with the new column.
dataset.head(5)

In [None]:
def _count_words(message):
    """Return the number of NLTK word tokens in *message*."""
    return len(nltk.word_tokenize(message))


# Word-token count of every message.
dataset['num_words'] = dataset['Text'].apply(_count_words)
# Show the first rows with the new column.
dataset.head(5)

Count the number of sentences in each text

In [None]:
def _count_sentences(message):
    """Return the number of NLTK sentence tokens in *message*."""
    return len(nltk.sent_tokenize(message))


# Sentence count of every message.
dataset['num_sentance'] = dataset['Text'].apply(_count_sentences)
# Show the first rows with the new column.
dataset.head(5)

Overall description of spam and ham SMS messages

In [None]:
# Summary statistics of the three length features over all messages.
length_columns = ['num_characters', 'num_words', 'num_sentance']
dataset[length_columns].describe()

Overall description of spam messages

In [None]:
# Spam: summary statistics of the length features for rows with Output == 1.
is_spam = dataset['Output'] == 1
dataset.loc[is_spam, ['num_characters', 'num_words', 'num_sentance']].describe()

Overall description of ham messages

In [None]:
# Ham: summary statistics of the length features for rows with Output == 0.
is_ham = dataset['Output'] == 0
dataset.loc[is_ham, ['num_characters', 'num_words', 'num_sentance']].describe()

In [None]:
import seaborn as sb
# Only numeric columns can be correlated, so select those first.
numeric_dataset = dataset.select_dtypes(include='number')
# Pairwise correlation matrix, drawn as an annotated heatmap.
correlation_matrix = numeric_dataset.corr()
sb.heatmap(correlation_matrix, annot=True)

# 3. Data Preprocessing
        1. Lowercasing
        2. Tokenization
        3. Removing special characters
        4. Removing stop words
        5. Stemming

In [None]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Stop-word sets keyed by language. Each set is filled on first use, so the
# NLTK corpus is read once rather than once for every token.
_STOP_WORDS_CACHE = {}


def _stop_words(language='english'):
    """Return the NLTK stop words for *language* as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def Transform_text(text):
    """Normalise one message into a space-separated string of word stems.

    Steps, applied in order:
        1. lower-case the text,
        2. split it into tokens with ``nltk.word_tokenize``,
        3. keep only purely alphanumeric tokens (``str.isalnum``),
        4. drop English stop words,
        5. stem every remaining token with the Porter stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    stop_words = _stop_words()
    tokens = nltk.word_tokenize(text.lower())
    # A non-empty alphanumeric token can never be a piece of
    # string.punctuation, so the isalnum() filter already removes all
    # punctuation and no separate punctuation test is needed.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [None]:
# Run the preprocessing function over every message and store the result.
dataset['Transformed_Text'] = dataset['Text'].map(Transform_text)
# Show the first rows with the new column.
dataset.head(5)

#### 3.1 Word cloud generation


In [None]:
from wordcloud import WordCloud
# Word-cloud generator used by the following cells: 500x500 canvas,
# white background, minimum font size 10.
wc = WordCloud(
    background_color='white',
    width=500,
    height=500,
    min_font_size=10,
)

In [None]:
# Word cloud of the preprocessed spam messages (Output == 1).
# The figure is created BEFORE drawing so that figsize applies to the image;
# calling plt.figure() after imshow() only opens a second, empty figure.
plt.figure(figsize=(15,8))
# NOTE(review): wc.generate() appears to return the `wc` object itself, so
# spam_wc would change if `wc` generates another cloud later - confirm.
spam_wc = wc.generate(dataset[dataset['Output'] == 1]['Transformed_Text'].str.cat(sep=" "))
plt.imshow(spam_wc)
plt.show()

In [None]:
# Word cloud of the preprocessed ham messages (Output == 0).
# The figure is created BEFORE drawing so that figsize applies to the image;
# calling plt.figure() after imshow() only opens a second, empty figure.
plt.figure(figsize=(15,8))
# NOTE(review): wc.generate() appears to return the `wc` object itself, so
# ham_wc may be the same object as any earlier cloud made with `wc` - confirm.
ham_wc = wc.generate(dataset[dataset['Output'] == 0]['Transformed_Text'].str.cat(sep=" "))
plt.imshow(ham_wc)
plt.show()

# 4. Model building