In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the
# dataset is read from a path under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import nltk
# Fetch the NLTK resources this notebook uses: the stop-word lists (read via
# nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
# NOTE(review): some NLTK releases also need the 'omw-1.4' resource before
# WordNetLemmatizer works - confirm against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:


from nltk.stem import WordNetLemmatizer
import pandas as pd
import string
import re

In [None]:
# Load and inspect the data.
# The file is tab-separated and has no header row, so the two column names
# ('label' and 'msg_body') are supplied explicitly.
# NOTE(review): read_csv uses its default quote handling here; if raw messages
# contain unbalanced double quotes, neighbouring rows could be merged. Check the
# row count against the dataset's documentation and consider
# quoting=csv.QUOTE_NONE if they disagree.
data = pd.read_csv('/content/gdrive/My Drive/nlp_proj/data/SMSSpamCollection', sep='\t', header=None, names=['label', 'msg_body'])
data.head()


Unnamed: 0,label,msg_body
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#punct and lower
def remove_spec_char(sentence):
    """Blank out punctuation and lower-case the text.

    Every character found in ``string.punctuation`` is replaced by a single
    space (it is not deleted, so the text keeps its length), and the result
    is converted to lower case.
    """
    # One translation table does the work of the per-character replace loop.
    punct_to_space = str.maketrans({symbol: ' ' for symbol in string.punctuation})
    return sentence.translate(punct_to_space).lower()


In [None]:
# Strip punctuation and lower-case every message body.
# The column is addressed by name and mapped with Series.apply: the previous
# row-wise ``row[1]`` lookup used an integer key on a label-indexed row, which
# relies on pandas' deprecated positional fallback.
data['msg_body'] = data['msg_body'].apply(remove_spec_char)
data.head()

Unnamed: 0,label,msg_body
0,ham,go until jurong point crazy available only ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives aro...


In [None]:
sw = nltk.corpus.stopwords.words("english")
# Snapshot of ``sw`` taken when this cell runs, stored as a frozenset so each
# token is checked with a hash lookup instead of a scan through the list.
# Re-run this cell if ``sw`` is modified afterwards.
_sw_lookup = frozenset(sw)


def remove_stop_words(sentence):
    """Return ``sentence`` with English stop words removed.

    The text is split on runs of whitespace, tokens that exactly match an
    entry of the stop-word list (case-sensitive comparison) are dropped, and
    the remaining tokens are re-joined with single spaces. An input made up
    only of stop words, or an empty input, yields an empty string.
    """
    return ' '.join(word for word in sentence.split() if word not in _sw_lookup)


In [None]:
# Drop stop words from every message body.
# The column is addressed by name and mapped with Series.apply: the previous
# row-wise ``row[1]`` lookup used an integer key on a label-indexed row, which
# relies on pandas' deprecated positional fallback.
data['msg_body'] = data['msg_body'].apply(remove_stop_words)
data.head()

Unnamed: 0,label,msg_body
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though


In [None]:
def lemmatize(sentence):
    """Lemmatize each space-separated token of ``sentence``.

    The text is split on single space characters, every piece is passed
    through ``WordNetLemmatizer.lemmatize`` (with its default arguments), and
    the pieces are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    tokens = sentence.split(" ")
    return " ".join(map(wnl.lemmatize, tokens))


In [None]:
# Lemmatize every message body.
# The column is addressed by name and mapped with Series.apply: the previous
# row-wise ``row[1]`` lookup used an integer key on a label-indexed row, which
# relies on pandas' deprecated positional fallback.
data['msg_body'] = data['msg_body'].apply(lemmatize)
data.head()

Unnamed: 0,label,msg_body
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [None]:
# Discard every row that has a missing value in any column before modelling.
# NOTE(review): the cleaning cells above call string methods on each msg_body,
# so a missing body would presumably have raised before reaching this line -
# consider moving this step ahead of the cleaning cells.
data = data.dropna()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# Hold out 25% of the messages for evaluation. The seed is fixed so that the
# split - and every metric reported below - is the same on each run.
xTrain, xTest, yTrain, yTest = train_test_split(
    data.iloc[:, 1],
    data.iloc[:, 0],
    test_size=0.25,
    random_state=42,
)

# Learn the TF-IDF vocabulary and weights from the training messages only;
# the held-out messages are transformed later with this fitted vectorizer.
vectorizer = TfidfVectorizer()

xTrain = vectorizer.fit_transform(xTrain)

### Model 1 - Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the TF-IDF training matrix. A fixed random_state makes the
# fitted tree (and therefore its scores) repeatable across runs.
model1 = DecisionTreeClassifier(random_state=42)
model1.fit(xTrain, yTrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Vectorize the held-out messages with the already-fitted TF-IDF vectorizer,
# then let the decision tree label them.
xTest1 = vectorizer.transform(xTest)
yPred1 = model1.predict(xTest1)

# First argument is the true labels, second the predictions.
print(confusion_matrix(yTest, yPred1))

[[1180   13]
 [  41  159]]


In [None]:
# Decision-tree accuracy on the held-out set, as a percentage rounded to 2 decimals.
print('Accuracy:', round(100 * accuracy_score(yTest, yPred1), 2))

Accuracy: 96.12


### Model 2 - Ensemble - Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF training matrix. A fixed random_state makes the
# bootstrap samples and feature choices (and therefore the scores) repeatable.
model2 = RandomForestClassifier(random_state=42)
model2.fit(xTrain, yTrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Vectorize the held-out messages with the already-fitted TF-IDF vectorizer,
# then let the random forest label them.
xTest1 = vectorizer.transform(xTest)
yPred = model2.predict(xTest1)

# First argument is the true labels, second the predictions.
print(confusion_matrix(yTest, yPred))

[[1193    0]
 [  33  167]]


In [None]:
# Random-forest accuracy on the held-out set, as a percentage rounded to 2 decimals.
print('Accuracy:', round(100 * accuracy_score(yTest, yPred), 2))

Accuracy: 97.63


### Use the trained models on a new message

In [None]:
# Sample message to classify with the two trained models.
text = "VIT wins Golden Globe Award"

In [None]:
# Put the sample through the same cleaning steps as the training messages
# (punctuation/lower-case -> stop words -> lemmatize), then vectorize it with
# the fitted TF-IDF vectorizer. ``text`` ends up as the one-row feature matrix.
text = remove_spec_char(text)
text = remove_stop_words(text)
text = lemmatize(text)
text = vectorizer.transform([text])

In [None]:
# Label predicted by the decision tree for the vectorized sample message.
model1.predict(text)

array(['ham'], dtype=object)

In [None]:
# Label predicted by the random forest for the vectorized sample message.
model2.predict(text)

array(['ham'], dtype=object)