In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import string
import os

In [None]:
# Fetch the NLTK resources this notebook relies on (a no-op when they are
# already present locally, as the captured output below shows).
# 'stopwords' backs the stop-word filter in process_text.
nltk.download('stopwords')
# NOTE(review): no tokenizer call is visible in this notebook, so 'punkt'
# may be unnecessary - confirm before removing.
nltk.download('punkt')
# 'wordnet' is presumably the data behind WordNetLemmatizer - TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the tab-separated SMS corpus; the file has no header row, so the two
# columns are named explicitly.
dataset=pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['LABEL','TEXT'])
print(dataset.shape)
# Drop exact duplicate rows. The original row index is kept (not reset).
dataset.drop_duplicates(inplace=True)
print(dataset.shape)
# Encode the string labels as integers. The dict is deliberately NOT called
# `map`, which would shadow the builtin map() for every later cell.
label_map={"ham":0,"spam":1}
# Labels missing from label_map are passed through unchanged.
dataset['LABEL']=dataset['LABEL'].apply(lambda a:label_map.get(a,a))
print(dataset.head(10))

(5572, 2)
(5169, 2)
   LABEL                                               TEXT
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
5      1  FreeMsg Hey there darling it's been 3 week's n...
6      0  Even my brother is not like to speak with me. ...
7      0  As per your request 'Melle Melle (Oru Minnamin...
8      1  WINNER!! As a valued network customer you have...
9      1  Had your mobile 11 months or more? U R entitle...


In [None]:
lemmatizer=WordNetLemmatizer()
# Single-letter keys mapped to WordNet part-of-speech constants. Not used by
# process_text below (presumably the keys are POS-tag prefixes - TODO confirm).
wordnet_map={"N":wordnet.NOUN,"V":wordnet.VERB,"J":wordnet.ADJ,"R":wordnet.ADV}
# Load the English stop-word list once; a frozenset gives O(1) membership
# tests instead of re-reading the corpus list for every token.
_ENGLISH_STOPWORDS=frozenset(stopwords.words('english'))

def process_text(text):
    """Turn one raw message into a list of lemmatized tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character found in ``string.punctuation``;
      3. split on whitespace and keep only tokens that are longer than two
         characters, purely alphabetic, and not English stop words;
      4. lemmatize each surviving token (no part-of-speech is passed to the
         lemmatizer).

    Args:
        text: the message as a ``str``.

    Returns:
        list[str]: the lemmatized tokens, in their original order.
    """
    text=text.lower()
    # Punctuation is removed outright (not replaced by a space), so "it's"
    # becomes "its" before the split.
    stripped=''.join(ch for ch in text if ch not in string.punctuation)
    words=[
        word for word in stripped.split()
        if len(word)>2 and word.isalpha() and word not in _ENGLISH_STOPWORDS
    ]
    lemmatized_words=[lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words




# Bag-of-words features: passing a callable as `analyzer` hands each raw
# message to process_text, whose returned tokens become the counted terms.
text_count_transform= CountVectorizer(analyzer=process_text)
# NOTE(review): the vocabulary is fitted on the whole dataset here, before the
# train/test split performed in a later cell.
text_count=text_count_transform.fit_transform(dataset['TEXT'])
# Number of distinct tokens in the fitted vocabulary.
print(len(text_count_transform.vocabulary_))

7033


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so
# the split is reproducible. .toarray() converts the count matrix to a dense
# array before splitting.
X_train, X_test, y_train, y_test = train_test_split(text_count.toarray(), dataset['LABEL'], test_size = 0.20, random_state = 0)

In [None]:
# Materialise both feature splits as NumPy arrays.
X_train, X_test = np.array(X_train), np.array(X_test)
# Training labels become a column vector with one row per sample.
y_train = np.array(y_train).reshape((len(y_train), 1))
# Test labels are converted to an array without reshaping.
y_test = np.array(y_test)


In [None]:
def add_ones(x):
    """Return a copy of the 2-D array ``x`` with a leading column of ones.

    The result has shape ``(x.shape[0], x.shape[1] + 1)``; column 0 holds
    1.0 in every row and the remaining columns are the values of ``x``.
    """
    bias_column = np.ones((x.shape[0], 1))
    return np.hstack((bias_column, x))

class LogisticRegressionUsingGD:
    """Binary logistic regression trained with batch gradient descent.

    Every method uses the design matrix exactly as given: no intercept column
    is added here, so callers that want a bias term must supply it themselves.
    After ``fit`` the learned weights are available as ``self.w_``.
    """

    def _align_labels(self, y, like):
        """Reshape ``y`` to the shape of ``like`` when both hold the same number of elements.

        Prevents ``(n, 1)`` vs ``(n,)`` operands from silently broadcasting
        into an ``(n, n)`` matrix. Inputs of a different size are returned
        as an array, unchanged in shape.
        """
        y = np.asarray(y)
        if y.size == np.size(like):
            return y.reshape(np.shape(like))
        return y

    def sigmoid(self,x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so
        that large-magnitude inputs do not overflow inside ``np.exp``.
        """
        return np.exp(-np.logaddexp(0, -x))

    def net_input(self,theta, x):
        """Return the linear scores ``np.dot(x, theta)``."""
        return np.dot(x, theta)

    def probability(self, theta, x):
        """Return the sigmoid of the linear scores for weights ``theta``."""
        return self.sigmoid(self.net_input(theta, x))

    def cost_function(self, theta, x, y):
        """Return the mean binary cross-entropy of ``theta`` on ``(x, y)``.

        Uses the identities ``-log(sigmoid(z)) = logaddexp(0, -z)`` and
        ``-log(1 - sigmoid(z)) = logaddexp(0, z)`` so the result stays finite
        even when the predicted probabilities saturate at 0 or 1.

        Args:
            theta: weight vector.
            x: design matrix with one row per sample.
            y: 0/1 labels, one per sample (row or column vector).
        """
        m = x.shape[0]
        z = self.net_input(theta, x)
        y = self._align_labels(y, z)
        per_sample = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)
        return np.sum(per_sample) / m

    def gradient(self, theta, x, y):
        """Return the gradient of the mean cross-entropy with respect to ``theta``."""
        m = x.shape[0]
        predicted = self.probability(theta, x)
        residual = predicted - self._align_labels(y, predicted)
        return (1 / m) * np.dot(x.T, residual)

    def fit(self, x, y, w,rate,num_iter, tol=None):
        """Run batch gradient descent and store the result in ``self.w_``.

        Args:
            x: design matrix with one row per sample.
            y: 0/1 labels, one per sample (row or column vector).
            w: initial weights. They are copied as floats, so the caller's
                array is left untouched; read the trained weights from
                ``self.w_``.
            rate: learning rate applied to every gradient step.
            num_iter: maximum number of gradient steps.
            tol: optional early-stopping threshold. When given, iteration
                stops as soon as every gradient component is smaller than
                ``tol`` in absolute value. ``None`` (default) always runs
                ``num_iter`` steps.

        Returns:
            ``self``, to allow call chaining.
        """
        weights = np.array(w, dtype=float)  # private float copy of the start point
        for _ in range(num_iter):
            step = self.gradient(weights, x, y)
            weights -= rate * step
            if tol is not None and np.all(np.abs(step) < tol):
                break
        self.w_ = weights
        print(weights)
        return self

    def predict(self, x):
        """Return the predicted probability of class 1 for each row of ``x``.

        Note that this returns probabilities, not thresholded class labels.
        """
        theta = self.w_
        return self.probability(theta, x)

    def accuracy(self, x, actual_classes, probab_threshold=0.5):
        """Print a confusion summary and return the accuracy as a percentage.

        Prints, in order: the predicted classes, the actual classes, then
        ``<#actual 1> <#predicted 1> <#predicted 0>`` for the samples whose
        actual class is 1, and the same triple for actual class 0.

        Args:
            x: design matrix with one row per sample.
            actual_classes: true 0/1 labels; any array-like is accepted and
                flattened, so column vectors and pandas Series work too.
            probab_threshold: probabilities at or above this value are
                classified as 1.

        Returns:
            The fraction of correct predictions multiplied by 100.
        """
        predicted_classes = (self.predict(x) >= probab_threshold).astype(int)
        predicted_classes = predicted_classes.flatten()
        # Flatten the labels as well so the comparison below is element-wise
        # rather than an accidental (n, n) broadcast.
        actual_classes = np.asarray(actual_classes).ravel()
        accuracy = np.mean(predicted_classes == actual_classes)
        print(predicted_classes)
        print(actual_classes)
        actual_pos = actual_classes == 1
        actual_neg = actual_classes == 0
        predicted_pos = predicted_classes == 1
        predicted_neg = predicted_classes == 0
        true_actual = int(np.sum(actual_pos))
        false_actual = int(np.sum(actual_neg))
        true_actual_predicted_true = int(np.sum(actual_pos & predicted_pos))
        true_actual_predicted_false = int(np.sum(actual_pos & predicted_neg))
        false_actual_predicted_true = int(np.sum(actual_neg & predicted_pos))
        false_actual_predicted_false = int(np.sum(actual_neg & predicted_neg))
        print(true_actual,true_actual_predicted_true,true_actual_predicted_false)
        print(false_actual,false_actual_predicted_true,false_actual_predicted_false)
        return accuracy * 100

# Gradient-descent hyper-parameters: learning rate and number of steps.
rate=0.1
num_iter=10000
# Prepend a column of ones to each split so the first weight acts as the bias.
# NOTE(review): these reassign X_train / X_test, so re-running this cell
# without re-running the split prepends another ones column each time.
X_train=add_ones(X_train)
X_test=add_ones(X_test)
# Start from all-zero weights: one row per feature column, as a column vector.
w=np.zeros((X_train.shape[1],1))
model=LogisticRegressionUsingGD()
# fit prints the final weight vector (first block of the output below).
model.fit(X_train,y_train,w,rate,num_iter)
y_test=np.array(y_test)
print("X_train",X_train.shape)
print("X_test",X_test.shape)
# accuracy() prints the predicted/actual classes and per-class counts, then
# returns a percentage. y_train is a column vector here, so it is flattened
# to 1-D before being compared with the predictions.
accuracy=model.accuracy(X_train,y_train.flatten())
print(accuracy)
# Same evaluation on the held-out test split.
accuracy=model.accuracy(X_test,y_test)
print(accuracy)

[[-3.70555097]
 [-0.01358107]
 [ 0.        ]
 ...
 [ 0.21335868]
 [-0.00907212]
 [-0.01445396]]
X_train (4135, 7034)
X_test (1034, 7034)
[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]
515 455 60
3620 3 3617
98.4764207980653
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
138 116 22
896 2 894
97.678916827853
