<a href="https://colab.research.google.com/github/hikmat690/AI-programming/blob/main/labtask11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
# Read the Kaggle API token file and copy its credentials into the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables.
with open('/content/kaggle.json') as f:
    kaggle_json = json.load(f)

for env_var, token_field in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_var] = kaggle_json[token_field]

In [None]:
# Download the SMS spam collection archive with the Kaggle command-line client.
!kaggle datasets download  uciml/sms-spam-collection-dataset

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 78.6MB/s]


In [None]:
!unzip /content/sms-spam-collection-dataset.zip

Archive:  /content/sms-spam-collection-dataset.zip
  inflating: spam.csv                


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Decode the CSV as latin1 (ISO-8859-1). That codec maps every byte value to a
# character, so it cannot fail on bytes that are not valid UTF-8.
# NOTE(review): presumably the default UTF-8 decoding failed on this file - confirm.
data = pd.read_csv(
    '/content/spam.csv',
    encoding='latin1',
)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(5572, 5)

In [None]:
# Preview the first rows to see what each raw column contains.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# List the column labels so the ones worth keeping can be picked out.
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [None]:
# Keep only the label and text columns under descriptive names.
# Rename first, then select by the new names: rename() ignores labels that are
# absent, so re-running this cell after the columns were already renamed is
# harmless instead of raising KeyError on the missing 'v1'/'v2' columns.
data = data.rename(columns={'v1': 'Target', 'v2': 'Message'})[['Target', 'Message']]
data.head()

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Show the full, untruncated text of the row labelled 2.
data.loc[2, 'Message']

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [None]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0,0
Target,0
Message,0


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
# English stopwords held in a set, which clean_text uses for its
# per-word membership test.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalize a message for bag-of-words modelling.

    The text is lowercased, every character that is not an ASCII letter or
    digit is replaced by a space, and words found in the stopword collection
    are dropped. The surviving words are joined with single spaces.

    Args:
        text: The raw message string.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Lowercase before filtering so stopword matching is case-insensitive.
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text.lower())
    # split() with no arguments already collapses runs of whitespace, so a
    # separate "remove extra spaces" substitution is unnecessary.
    return " ".join(word for word in text.split() if word not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.
    """
    return re.sub('<.*?>', '', text)
# Read from the raw 'Message' column: 'clean_text' does not exist yet at this
# point, so reading it here raised KeyError on a fresh kernel.
data['clean_text'] = data['Message'].apply(remove_html_tags)

In [None]:
# Strip HTML tags before normalizing. Deriving 'clean_text' from the raw
# 'Message' column with clean_text alone would skip tag removal entirely.
data['clean_text'] = data['Message'].apply(remove_html_tags).apply(clean_text)
data.head()

Unnamed: 0,Target,Message,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


# **Tokenization**

In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize,sent_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Quick check of word_tokenize on a short sentence.
sentence = 'I am Qaiser Farooq'
word_tokenize(sentence)

['I', 'am', 'Qaiser', 'Farooq']

In [None]:
# Inspect row 3 of 'clean_text' ahead of the tokenization step, for comparison.
data['clean_text'][3]

'u dun say early hor u c already say'

In [None]:
# Tokenize only values that are still strings, so re-running this cell does
# not pass already-tokenized lists back into word_tokenize.
data['clean_text'] = data['clean_text'].apply(
    lambda text: word_tokenize(text) if isinstance(text, str) else text
)

In [None]:
# Inspect the same row again now that the tokenization step has run.
data['clean_text'][3]

['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say']

In [None]:
# Display the 'clean_text' column as a Series.
data['clean_text']

Unnamed: 0,clean_text
0,"[go, jurong, point, crazy, available, bugis, n..."
1,"[ok, lar, joking, wif, u, oni]"
2,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,"[u, dun, say, early, hor, u, c, already, say]"
4,"[nah, think, goes, usf, lives, around, though]"
...,...
5567,"[2nd, time, tried, 2, contact, u, u, 750, poun..."
5568,"[b, going, esplanade, fr, home]"
5569,"[pity, mood, suggestions]"
5570,"[guy, bitching, acted, like, interested, buyin..."


# **Stemming**

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


def stem_words(text):
    """Stem every whitespace-separated word of ``text`` and rejoin with spaces.

    Splitting is on whitespace only, so punctuation attached to a word is
    passed to the stemmer as part of that word.
    """
    stemmed = map(ps.stem, text.split())
    return " ".join(stemmed)

In [None]:
# Demo sentence for the stemmer. stem_words splits on whitespace only, so the
# final word is handed to the stemmer with its period still attached.
sample1 = "The leaves are falling and the children are running towards the park."
stem_words(sample1)

'the leav are fall and the children are run toward the park.'

# **Lemmatization**

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmitizer = WordNetLemmatizer()


def lemmitize_words(text):
    """Lemmatize each whitespace-separated word of ``text`` as a verb.

    Returns the lemmas joined back into a single space-separated string.
    """
    lemmas = []
    for word in text.split():
        lemmas.append(lemmitizer.lemmatize(word, pos='v'))
    return " ".join(lemmas)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Demo sentence for the lemmatizer. lemmitize_words splits on whitespace only,
# so the last word reaches the lemmatizer as 'ran.' including the period.
sample3 = "The leaves are falling and the children are running towards the park ran."
lemmitize_words(sample3)

'The leave be fall and the children be run towards the park ran.'

In [None]:
def lemmatize_words(tokens):
    """Lemmatize every token as a verb and return the lemmas as a list."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmitizer.lemmatize(token, pos='v'))
    return lemmas

# Lemmatize the values of the 'clean_text' column into 'lemmatized_review'.
data['lemmatized_review'] = data['clean_text'].apply(lemmatize_words)

In [None]:
# Spot-check row 1 of the lemmatized output.
data['lemmatized_review'][1]

['ok', 'lar', 'joke', 'wif', 'u', 'oni']

# **ML Model**

In [None]:
# The vectorizer in classify() needs each document to be a single string, but
# 'clean_text' holds token lists once the tokenization cell has run. Join any
# lists back into strings so the pipeline works with either form of the column.
X = data['clean_text'].apply(
    lambda doc: doc if isinstance(doc, str) else " ".join(doc)
)
y = data['Target']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

def classify(model, X, y, test_size=0.25, random_state=42):
    """Train ``model`` on TF-IDF features and print hold-out metrics.

    The data is split into stratified train/test parts, a pipeline of
    CountVectorizer -> TfidfTransformer -> ``model`` is fitted on the training
    part, and the test accuracy (as a percentage) plus a classification
    report are printed.

    Args:
        model: An unfitted scikit-learn style classifier used as the final
            pipeline step.
        X: The documents to classify, one string per sample.
        y: The class labels aligned with ``X``.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed for the shuffled, stratified split.

    Returns:
        None. Results are printed.
    """
    # Local import keeps this block self-contained; the notebook's import cell
    # only brings in classification_report from sklearn.metrics.
    from sklearn.metrics import accuracy_score

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True, stratify=y
    )

    pipeline_model = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', model)])
    pipeline_model.fit(x_train, y_train)

    # Predict once and reuse the result for both metrics; calling score()
    # would run the whole pipeline over the test set a second time.
    y_pred = pipeline_model.predict(x_test)
    print('Accuracy:', accuracy_score(y_test, y_pred)*100)
    print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression constructed with no arguments, i.e. library defaults.
model = LogisticRegression()
classify(model, X, y)

Accuracy: 96.8413496051687
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are randomized; fix the seed so the reported metrics are
# reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

Accuracy: 97.48743718592965
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1206
        spam       1.00      0.81      0.90       187

    accuracy                           0.97      1393
   macro avg       0.99      0.91      0.94      1393
weighted avg       0.98      0.97      0.97      1393



In [None]:
from sklearn.svm import SVC
# Support vector classifier with C=3; every other parameter is left unset.
model = SVC(C=3)
classify(model, X, y)

Accuracy: 98.27709978463747
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       1.00      0.87      0.93       187

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393

