In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the
# results are written to, paths under this mount point in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import nltk
# Fetch the NLTK resources used by the cleaning cell below.
# 'punkt' is presumably what nltk.word_tokenize relies on -- confirm against the NLTK docs.
nltk.download('punkt')
# Corpus read through nltk.corpus.stopwords.words('english').
nltk.download('stopwords')
# Corpus used by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the tab-separated SMS collection; the file has no header row, so the
# two column names are supplied explicitly.
messages = pd.read_csv(
    '/content/drive/My Drive/Dataset/ML_LAB/SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()

# Build the stopword set once. The original rebuilt it for every single token,
# which is loop-invariant work repeated for each word of each message.
stop_words = set(stopwords.words('english'))

clean_tweets = []
for sentence in messages['message']:
    # Replace everything that is not an ASCII letter or digit with a space.
    # This already strips punctuation and newlines, so the separate
    # newline / punctuation substitutions that used to follow were no-ops.
    sentence = re.sub("[^a-zA-Z0-9]", " ", str(sentence))
    sentence = re.sub(' +', ' ', sentence)  # collapse runs of spaces

    tokens = nltk.word_tokenize(sentence)
    # Compare in lower case: the stopword list is lower-case, so a
    # case-sensitive test lets capitalised stopwords ("I", "So", "Then") through.
    # The surviving tokens keep their original casing.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.lower() not in stop_words
    ]
    clean_tweets.append(' '.join(tokens))

messages['clean_message'] = clean_tweets

In [None]:
from tensorflow.keras.utils import to_categorical

# Features: the cleaned message text.
X = messages.clean_message

# Integer target: 1 when the label is "spam", 0 otherwise.
labels = [int(tag == "spam") for tag in messages['label'].values]
messages['Actual Label'] = labels

# One-hot encode the integer targets for the classifier.
labels = to_categorical(labels)
labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [None]:
# Discard every row that has at least one missing value, then preview the frame.
messages = messages.dropna(axis=0, how="any")
messages.head(5)

Unnamed: 0,label,message,clean_message,Actual Label
0,ham,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy Available bugis n great ...,0
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...,1
3,ham,U dun say so early hor... U c already then say...,U dun say early hor U c already say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I think go usf life around though,0


In [None]:
# Sanity check: number of cleaned messages available as features.
X.shape

(5572,)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    labels,
    test_size=0.25,
    random_state=25,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF features, capped at 5000 terms, fitted on the training text only.
tfidf_v = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 1),
)
X_new = tfidf_v.fit_transform(X_train)
X_new = X_new.toarray()  # dense matrix for the classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

print(X_new.shape)

# Random forests are stochastic (bootstrap samples and random feature subsets),
# so the seed is fixed to make the reported metrics reproducible across runs.
model = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    random_state=25,
)
# Y_train is one-hot encoded, so predictions come back with one column per class.
model.fit(X_new, Y_train)

(4179, 5000)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Vectorise the held-out text with the vocabulary fitted on the training set
# (transform only, no re-fitting), then predict with the trained forest.
test_dataset = tfidf_v.transform(X_test)
predictions = model.predict(test_dataset)

In [None]:
# Plain NumPy array of the held-out cleaned messages; used below to fill the
# "Cleaned Text" column of the results frame.
xyz=X_test.to_numpy()
xyz

array(['sport fan get latest sport news str 2 ur mobile 1 wk FREE PLUS FREE TONE Txt SPORT ON 8007 www getzed co uk 0870141701216 norm 4txt 120p',
       'U reach orchard already U wan 2 go buy ticket first',
       'R u continent', ...,
       'Hey mr I going sea view couple gay I mean game Give bell ya finish',
       'Did u receive msg', 'So pay first lar Then da stock comin'],
      dtype=object)

In [None]:
# Results table: one row per held-out message, with a column for the true
# label and one per n-gram model (filled in by the evaluation cells).
# The index length is derived from the test set instead of a hard-coded 1393,
# so the cell keeps working if the dataset or test_size changes.
df = pd.DataFrame(
    columns=["Cleaned Text", "Actual Labels", "1-gram", "2-gram", "3-gram"],
    index=range(len(xyz)),
)
df["Cleaned Text"] = xyz
# Y_test is still one-hot here; argmax recovers the class index for display.
df["Actual Labels"] = np.argmax(Y_test, axis=1)
df.head()

Unnamed: 0,Cleaned Text,Actual Labels,1-gram,2-gram,3-gram
0,sport fan get latest sport news str 2 ur mobil...,1,,,
1,U reach orchard already U wan 2 go buy ticket ...,0,,,
2,R u continent,0,,,
3,Where download clear movie Dvd copy,0,,,
4,Yes thought Thanks,0,,,


In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Collapse one-hot rows to class indices. The ndim guards keep the cell from
# raising when it is re-run after Y_test / predictions are already 1-D
# (np.argmax(..., axis=1) fails on a 1-D array).
if np.ndim(Y_test) > 1:
    Y_test = np.argmax(Y_test, axis=1)
if np.ndim(predictions) > 1:
    predictions = np.argmax(predictions, axis=1)

# Record the unigram model's predictions next to the true labels.
df["1-gram"] = predictions

matrix = confusion_matrix(Y_test, predictions)
print(matrix)
score = accuracy_score(Y_test, predictions)
print(score)
report = classification_report(Y_test, predictions)
print(report)

[[1194    0]
 [  32  167]]
0.9770279971284996
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1194
           1       1.00      0.84      0.91       199

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram TF-IDF features (replaces the previous vectoriser in tfidf_v),
# capped at 5000 terms and fitted on the training text only.
tfidf_v = TfidfVectorizer(
    max_features=5000,
    ngram_range=(2, 2),
)
X_new1 = tfidf_v.fit_transform(X_train)
X_new1 = X_new1.toarray()  # dense matrix for the classifier

In [None]:
print(X_new1.shape)

# Vectorise the held-out text with the bigram vocabulary, then refit the same
# forest on the bigram features and predict in one chained call
# (fit returns the estimator itself).
test_dataset1 = tfidf_v.transform(X_test)
predictions = model.fit(X_new1, Y_train).predict(test_dataset1)

(4179, 5000)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Collapse one-hot predictions to class indices. The ndim guard keeps the cell
# from raising when it is re-run after predictions are already 1-D.
if np.ndim(predictions) > 1:
    predictions = np.argmax(predictions, axis=1)

# Record the bigram model's predictions next to the true labels.
df["2-gram"] = predictions

matrix = confusion_matrix(Y_test, predictions)
print(matrix)
score = accuracy_score(Y_test, predictions)
print(score)
report = classification_report(Y_test, predictions)
print(report)

[[1193    1]
 [  55  144]]
0.9597989949748744
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1194
           1       0.99      0.72      0.84       199

    accuracy                           0.96      1393
   macro avg       0.97      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Trigram TF-IDF features (replaces the previous vectoriser in tfidf_v),
# capped at 5000 terms and fitted on the training text only.
tfidf_v = TfidfVectorizer(
    max_features=5000,
    ngram_range=(3, 3),
)
X_new2 = tfidf_v.fit_transform(X_train)
X_new2 = X_new2.toarray()  # dense matrix for the classifier

In [None]:
# Vectorise the held-out text with the trigram vocabulary, then refit the same
# forest on the trigram features and predict in one chained call
# (fit returns the estimator itself).
test_dataset2 = tfidf_v.transform(X_test)
predictions = model.fit(X_new2, Y_train).predict(test_dataset2)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Collapse one-hot predictions to class indices. The ndim guard keeps the cell
# from raising when it is re-run after predictions are already 1-D.
if np.ndim(predictions) > 1:
    predictions = np.argmax(predictions, axis=1)

# Record the trigram model's predictions next to the true labels.
df["3-gram"] = predictions

matrix = confusion_matrix(Y_test, predictions)
print(matrix)
score = accuracy_score(Y_test, predictions)
print(score)
report = classification_report(Y_test, predictions)
print(report)

[[1194    0]
 [  93  106]]
0.9332376166547021
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1194
           1       1.00      0.53      0.70       199

    accuracy                           0.93      1393
   macro avg       0.96      0.77      0.83      1393
weighted avg       0.94      0.93      0.92      1393



In [None]:
# Persist the per-message comparison table to Drive.
# NOTE(review): the file is written tab-separated despite the .csv extension;
# readers must pass sep='\t' when loading it back.
df.to_csv('/content/drive/My Drive/Dataset/ML_LAB/output.csv', sep='\t')