### Problem Statement
- Implement a logistic regression model in Python using the SMS Spam Collection Dataset.

In [None]:
# importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
warnings.filterwarnings('ignore') # for ignoring the warnings

In [None]:
# Load the raw SMS dataset from 'dataset.csv' into a DataFrame.
# The file is decoded as latin-1 (presumably because it contains bytes that are not valid UTF-8 — TODO confirm).
df = pd.read_csv('dataset.csv' , encoding='latin-1')
# Preview the first rows to check which columns were read
df.head()

- IMPROVING THE DATA

In [None]:
# Remove the three unused 'Unnamed' columns and give the remaining ones
# descriptive names (v1 -> label, v2 -> message).
# df is re-bound instead of mutated with inplace=True, and errors='ignore' is
# passed to drop(), so re-running this cell after the columns are already gone
# no longer raises a KeyError (rename() silently skips missing keys by default).
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
      .rename(columns={'v1': 'label', 'v2': 'message'})
)
df.head()

In [None]:
# Numeric encoding of the text label: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_codes)
# Number of characters in each message
df['Length'] = df['message'].map(len)
df.head()

- VISUALISING THE DATA

In [None]:
# Class balance: how many messages carry each value of the 'label' column (ham vs. spam)
df['label'].value_counts()

In [None]:
# Count plot of the label column (one bar per class).
# The column is passed through the x=/data= keywords: seaborn >= 0.12 accepts
# only `data` positionally, so a positional Series is no longer read as x.
sns.countplot(x='label', data=df, palette=sns.color_palette('Set2'))

In [None]:
# Count plot of the Length column (one bar per distinct message length).
plt.figure(figsize=(15,8))
# The column is passed through the x=/data= keywords: seaborn >= 0.12 accepts
# only `data` positionally, so a positional Series is no longer read as x.
sns.countplot(x='Length', data=df, palette=sns.color_palette('Set2'))

In [None]:
# Mean and standard deviation of message length over the whole dataset,
# rounded to the nearest integer for display
overall_len_mean = df['Length'].mean()
overall_len_std = df['Length'].std()
print("Average Length of a message is", round(overall_len_mean))
print("Standard deviation of length is", round(overall_len_std))

In [None]:
# Length distribution of spam messages.
# Select the Length values of the rows labelled spam (label_num == 1).
spam_len = df.loc[df["label_num"] == 1, "Length"]
plt.figure(figsize=(14,8))
# Pass the Series as x=: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer read as the variable to count.
sns.countplot(x=spam_len)

In [None]:
# Mean and standard deviation of spam message length, rounded for display
spam_len_mean = spam_len.mean()
spam_len_std = spam_len.std()
print("Average Length of a text of spam message is", round(spam_len_mean))
print("Standard deviation of length of spam message is", round(spam_len_std))

In [None]:
# Length distribution of ham messages.
# Select the Length values of the rows labelled ham (label_num == 0).
ham_len = df.loc[df["label_num"] == 0, "Length"]
plt.figure(figsize=(14,8))
# Pass the Series as x=: seaborn >= 0.12 accepts only `data` positionally,
# so a positional Series is no longer read as the variable to count.
sns.countplot(x=ham_len)

In [None]:
# Mean and standard deviation of ham message length, rounded for display
ham_len_mean = ham_len.mean()
ham_len_std = ham_len.std()
print("Average Length of a text of ham message is", round(ham_len_mean))
print("Standard deviation of length of ham message is", round(ham_len_std))

* The visualisations and statistics above support the observation that spam messages tend to be long and that their lengths differ very little from one another.
For the ham messages, the average length is comparatively small and the standard deviation is huge, since some ham messages are long and others are short.

- NOW PREPROCESSING THE TEXT

In [None]:
# Removing Punctuation
def rem_punc(text):
    """Return `text` with every character found in string.punctuation removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order; nothing is inserted in place of a removed character.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept_chars)
# Strip punctuation from every message, overwriting the 'message' column
df['message'] = df['message'].map(rem_punc)
df.head()

In [None]:
# Convert every message to lower case, overwriting the 'message' column
df['message'] = df['message'].map(str.lower)
df.head()

- SPLITTING THE TRAIN AND TEST DATA

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on the label — confirm that is intended,
# given the ham/spam counts shown earlier.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=24)
for split_df in (train_df, test_df):
    print(split_df.shape)
# Save both splits to CSV files
for split_df, csv_path in ((train_df, 'train_data.csv'), (test_df, 'test_data.csv')):
    split_df.to_csv(csv_path)

In [None]:
# Pull the message text (features) and the 0/1 label (target) out of each
# split as NumPy arrays
xtrain = np.asanyarray(train_df['message'])
ytrain = np.asanyarray(train_df['label_num'])
xtest = np.asanyarray(test_df['message'])
ytest = np.asanyarray(test_df['label_num'])
# Number of training and test messages
xtrain.shape[0], xtest.shape[0]

In [None]:
# CountVectorizer turns each message into a vector of word counts: the vocabulary
# is learned from the training messages only, and every message is then
# represented by how many times each vocabulary word occurs in it.
# The vectorizer itself does not know about spam/ham; it is the classifier fitted
# afterwards that learns how strongly each word count points towards spam.
from sklearn.feature_extraction.text import CountVectorizer
counter_vec = CountVectorizer()
counter_vec.fit(xtrain)
# Encode both splits with the vocabulary learned from the training split
xtrain_vec = counter_vec.transform(xtrain)
xtest_vec = counter_vec.transform(xtest)

- MODEL CREATION

In [None]:
from sklearn.linear_model import LogisticRegression
# Create an unfitted logistic-regression classifier; no arguments are passed,
# so scikit-learn's default hyperparameters are used
LR_model = LogisticRegression()

- MODEL TRAINING

In [None]:
# Fit the classifier on the vectorized training messages and their 0/1 (ham/spam) labels
LR_model.fit(xtrain_vec,ytrain)

- MODEL TESTING

In [None]:
# Predict a 0/1 label for every vectorized test message
ypred = LR_model.predict(xtest_vec)
# Show the first 100 predicted labels followed by the first 100 true labels
# so they can be compared by eye
for label_array in (ypred, ytest):
    print(label_array[:100])

- ACCURACY

In [None]:
from sklearn.metrics import accuracy_score
# Number of test messages whose predicted label differs from the true label
excep_count = sum(1 for actual, predicted in zip(ytest, ypred) if actual != predicted)
print("Only",excep_count, "wrong predictions among", len(ytest),"test data")
# Accuracy as a percentage, rounded to two decimals
score = round(accuracy_score(ytest,ypred) * 100 , 2)
print("Accuracy of the Model is : ", score, "%")

- CLASSIFICATION REPORT

In [None]:
from sklearn.metrics import classification_report
# Print scikit-learn's per-class metrics report for the test set;
# the classes are the numeric labels 0 (ham) and 1 (spam)
print(classification_report(ytest, ypred))

- PREDICTING BY GIVING A CUSTOM INPUT MESSAGE

In [None]:
# Read one message from the user and classify it with the trained model.
inmessage = input()
# Clean the message the same way the training messages were cleaned
# (punctuation stripped with rem_punc, then lower-cased) before vectorizing.
# Without this, a word such as "don't" is tokenized differently from the
# "dont" the vectorizer saw during fitting and its count is lost.
clean_message = rem_punc(inmessage).lower()
# transform() expects an iterable of documents, so wrap the single message in a list
intest = [clean_message]
intest_vec = counter_vec.transform(intest)
predlab = LR_model.predict(intest_vec)
# label_num encoding: 1 = spam, 0 = ham
if predlab[0] == 1:
    print("this is a spam message !!!")
else:
    print("this is a ham message.")