In [1]:
# Importing necessary libraries
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords


# Downloading NLTK data
nltk.download('stopwords')   # Downloading stopwords data
nltk.download('punkt')       # Downloading tokenizer data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alok_kumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alok_kumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df=pd.read_csv('spam.csv')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)
df.rename(columns = {'v1': 'target', 'v2': 'text'}, inplace = True)

In [5]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['target']=le.fit_transform(df['target'])
df

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [6]:
#remove Duplicate
df = df.drop_duplicates(keep = 'first')
len(df)

5169

# feature engg

In [7]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Creating an instance of the Porter Stemmer
ps = PorterStemmer()

In [8]:
# Lowercase transformation and text preprocessing function
def transform_text(text):
    # Transform the text to lowercase
    text = text.lower()
    
    # Tokenization using NLTK
    text = nltk.word_tokenize(text)
    
    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
            
    # Removing stop words and punctuation
    text = y[:]
    y.clear()
    
    # Loop through the tokens and remove stopwords and punctuation
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    
    # Join the processed tokens back into a single string
    return " ".join(y)

In [9]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['transformed_text'] = df['text'].apply(transform_text)


Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfid = TfidfVectorizer(max_features = 500)

X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [30]:
y

array([0, 0, 1, ..., 0, 0, 0], shape=(5169,))

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

x_train, x_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)    
xgb  = XGBClassifier(n_estimators = 50, random_state = 2)

clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
    
}

In [15]:
from sklearn.metrics import accuracy_score,precision_score
def train_models(model,x_train,y_train,x_test,y_test):
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

# training the indivisual models

In [19]:
acc_score=[]
precision=[]

for name,model in clfs.items():
    model_acc,model_precision=train_models(model,x_train,y_train,x_test,y_test)
    print()
    print("for:",name)
    print("Accuracy: ", model_acc)
    print("Precision: ", model_precision)

    acc_score.append(model_acc)
    precision.append(model_precision)


for: SVC
Accuracy:  0.9671179883945842
Precision:  0.9333333333333333

for: KNN
Accuracy:  0.9274661508704062
Precision:  1.0

for: NB
Accuracy:  0.9709864603481625
Precision:  0.9655172413793104

for: DT
Accuracy:  0.937137330754352
Precision:  0.9010989010989011

for: LR
Accuracy:  0.9632495164410058
Precision:  0.9629629629629629

for: RF
Accuracy:  0.9700193423597679
Precision:  0.9421487603305785

for: Adaboost
Accuracy:  0.9235976789168279
Precision:  0.8734177215189873

for: Bgc
Accuracy:  0.9622823984526112
Precision:  0.9024390243902439

for: ETC
Accuracy:  0.9709864603481625
Precision:  0.921875

for: GBDT
Accuracy:  0.9497098646034816
Precision:  0.93

for: xgb
Accuracy:  0.9690522243713733
Precision:  0.9568965517241379


In [27]:
import pandas as pd

results = pd.DataFrame({
    "Model": list(clfs.keys()),
    "Accuracy": acc_score,
    "Precision": precision
})
results

Unnamed: 0,Model,Accuracy,Precision
0,SVC,0.967118,0.933333
1,KNN,0.927466,1.0
2,NB,0.970986,0.965517
3,DT,0.937137,0.901099
4,LR,0.96325,0.962963
5,RF,0.970019,0.942149
6,Adaboost,0.923598,0.873418
7,Bgc,0.962282,0.902439
8,ETC,0.970986,0.921875
9,GBDT,0.94971,0.93


# voting classifier

In [20]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('knc', knc),
        ('mnb', mnb),
        ('dtc', dtc),
        ('lrc', lrc),
        ('rfc', rfc),
        ('abc', abc),
        ('bc', bc),
        ('etc', etc),
        ('gbdt', gbdt),
        ('xgb', xgb)
    ],
    voting='hard'   # try 'soft' if all models support predict_proba
)

voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)
print("Voting Accuracy:", accuracy_score(y_test, y_pred))


Voting Accuracy: 0.9709864603481625


In [22]:
df['target'].value_counts()

target
0    4516
1     653
Name: count, dtype: int64

# working with stratafied data

In [32]:
from sklearn.model_selection import train_test_split

x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(
    X, y,
    test_size=0.20,
    random_state=2,
    stratify=y
)

acc_score_st=[]
precision_st=[]

for name,model in clfs.items():
    model_acc,model_precision=train_models(model,x_train_st, y_train_st, x_test_st, y_test_st)
    print()
    print("for:",name)
    print("Accuracy: ", model_acc)
    print("Precision: ", model_precision)

    acc_score_st.append(model_acc)
    precision_st.append(model_precision)

results_st = pd.DataFrame({
    "Model": list(clfs.keys()),
    "Accuracy": acc_score_st,
    "Precision": precision_st
})

results_st



for: SVC
Accuracy:  0.9796905222437138
Precision:  0.9583333333333334

for: KNN
Accuracy:  0.9313346228239845
Precision:  0.9838709677419355

for: NB
Accuracy:  0.9806576402321083
Precision:  0.9663865546218487

for: DT
Accuracy:  0.9342359767891683
Precision:  0.8315789473684211

for: LR
Accuracy:  0.9729206963249516
Precision:  0.9401709401709402

for: RF
Accuracy:  0.9816247582205029
Precision:  0.9745762711864406

for: Adaboost
Accuracy:  0.9187620889748549
Precision:  0.7640449438202247

for: Bgc
Accuracy:  0.9642166344294004
Precision:  0.8507462686567164

for: ETC
Accuracy:  0.9835589941972921
Precision:  0.9453125

for: GBDT
Accuracy:  0.9593810444874274
Precision:  0.9587628865979382

for: xgb
Accuracy:  0.971953578336557
Precision:  0.9322033898305084


Unnamed: 0,Model,Accuracy,Precision
0,SVC,0.979691,0.958333
1,KNN,0.931335,0.983871
2,NB,0.980658,0.966387
3,DT,0.934236,0.831579
4,LR,0.972921,0.940171
5,RF,0.981625,0.974576
6,Adaboost,0.918762,0.764045
7,Bgc,0.964217,0.850746
8,ETC,0.983559,0.945312
9,GBDT,0.959381,0.958763


In [40]:
results_st.rename(columns={'Accuracy':'accuracy_st','Precision':'precision_st'},inplace=True)

In [44]:
comparision_df=pd.concat([results,results_st[['accuracy_st','precision_st']]],axis=1)

# voting classifier with stratafied data

In [35]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Voting Classifier
voting_clf_st = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('knc', knc),
        ('mnb', mnb),
        ('dtc', dtc),
        ('lrc', lrc),
        ('rfc', rfc),
        ('abc', abc),
        ('bc', bc),
        ('etc', etc),
        ('gbdt', gbdt),
        ('xgb', xgb)
    ],
    voting='hard'   # try 'soft' if all models support predict_proba
)

voting_clf_st.fit(x_train_st, y_train_st)
y_pred_st = voting_clf_st.predict(x_test_st)
print("Voting Accuracy:", accuracy_score(y_test_st, y_pred_st))


Voting Accuracy: 0.9816247582205029


In [36]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stacking_clf = StackingClassifier(
    estimators=[
        ('svc', svc),
        ('knc', knc),
        ('mnb', mnb),
        ('dtc', dtc),
        ('lrc', lrc),
        ('rfc', rfc),
        ('abc', abc),
        ('bc', bc),
        ('etc', etc),
        ('gbdt', gbdt),
        ('xgb', xgb)
    ],
    final_estimator=LogisticRegression(),
    passthrough=False
)

stacking_clf.fit(x_train_st, y_train_st)
y_pred_st = stacking_clf.predict(x_test_st)
print("Stacking Accuracy:", accuracy_score(y_test_st, y_pred_st))


Stacking Accuracy: 0.9864603481624759


In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models import KeyedVectors
import numpy as np

# Load Google pre-trained Word2Vec model (once at the start, not inside the function!)
# Download from: https://code.google.com/archive/p/word2vec/
word2vec_path = "GoogleNews-vectors-negative300.bin"
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def transform_text_with_embedding(text):
    """
    Transforms text: lowercase, tokenize, remove stopwords/punctuations, stemming,
    then creates a sentence embedding using Google Word2Vec (average of word vectors).
    """
    ps = PorterStemmer()
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Keep only alphanumeric
    tokens = [word for word in tokens if word.isalnum()]

    # Remove stopwords and punctuation
    tokens = [word for word in tokens if word not in stopwords.words('english') 
              and word not in string.punctuation]

    # Apply stemming
    tokens = [ps.stem(word) for word in tokens]

    # Convert tokens to embeddings (only if present in Word2Vec vocab)
    embeddings = []
    for word in tokens:
        if word in w2v_model.key_to_index:  # check if word exists in Word2Vec vocab
            embeddings.append(w2v_model[word])

    if len(embeddings) == 0:
        # If no valid words, return a zero vector (300 dims in Google Word2Vec)
        return np.zeros(300)

    # Average of word vectors (common way to get sentence embedding)
    sentence_embedding = np.mean(embeddings, axis=0)

    return sentence_embedding
