In [4]:
import pandas as pd
import numpy as np

In [5]:
spam_data = pd.read_csv(r"C:\Users\phani\OneDrive\Trainings\spam.csv")
spam_data = spam_data.loc[:,:"Text"]

In [6]:
spam_data.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
spam_data_small_text = list(spam_data.head()["Text"])
spam_data_small_target = list(spam_data.head()["Target"])

In [9]:
spam_data_small_text

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though"]

## Bag of Words Approach

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [11]:
cv = CountVectorizer(stop_words='english', min_df=0.005)
cv.fit(list(spam_data["Text"]))
output = cv.transform(list(spam_data["Text"]))


tfidf = TfidfVectorizer(stop_words='english', min_df=0.005)
tfidf.fit(list(spam_data["Text"]))
output = tfidf.transform(list(spam_data["Text"]))

In [12]:
output = output.toarray()
columns = cv.get_feature_names()

X = pd.DataFrame(output, columns=columns)
y = spam_data["Target"]

### Create New Features
- Count of # Digits

In [13]:
# msg = "This is the 2nd time we have tried 2 contact u"
def fn_count_digits(msg):
    return sum(char.isdigit() for char in msg)

count_digits = spam_data["Text"].apply(lambda x : fn_count_digits(x))

In [14]:
X["count_digits"] = count_digits

### Train Test Split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Model Building

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [19]:
preds = clf.predict(X_test)

In [20]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1587
        spam       0.97      0.93      0.95       252

    accuracy                           0.99      1839
   macro avg       0.98      0.96      0.97      1839
weighted avg       0.99      0.99      0.99      1839



### Testing on sample examples

In [36]:
test_samples = ["WINNER! Congratulations on winning a free gift. Send message to 58585 to redeem your gift",
                "Hey gimme a call me on 78996797 when you are free. Need to talk something important"]

1. Use count vectorizer and convert to an array
2. Use the model and generate predictions

In [37]:
test_samples

['WINNER! Congratulations on winning a free gift. Send message to 58585 to redeem your gift',
 'Hey gimme a call me on 78996797 when you are free. Need to talk something important']

In [42]:
test_samples_output = tfidf.transform(test_samples)
test_samples_output = pd.DataFrame(test_samples_output.toarray(), columns = cv.get_feature_names())
test_samples_output['count_digits'] = [fn_count_digits(msg) for msg in test_samples]

In [43]:
test_samples_output.shape

(2, 287)

In [44]:
test_samples_preds = clf.predict(test_samples_output)
test_samples_probs = clf.predict_proba(test_samples_output)

In [45]:
print(test_samples_preds)
print(test_samples_probs)

['spam' 'ham']
[[0.26 0.74]
 [0.57 0.43]]


## Word Embeddings