In [1]:
import pandas as pd
import sqlite3
import regex as re
import matplotlib.pyplot as plt


In [4]:
# Load the e-mail dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='emails.csv')

In [5]:
# Preview the first five rows to confirm the frame loaded as expected.
df.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import classification_report, accuracy_score

# Toy corpus used to illustrate how CountVectorizer encodes text.
text = [
    "the dog is white",
    "the cat is black",
    "the cat and the dog are friends",
]

# Create the vectorizer and learn the vocabulary in one step
# (fit() returns the fitted vectorizer itself).
cv = CountVectorizer().fit(text)

# Show the learned token -> column-index mapping.
print(cv.vocabulary_)

# Encode every sentence as a row of per-token counts.
vector = cv.transform(text)

# The result is sparse; densify it so the counts are readable when printed.
print(vector.toarray())

{'the': 7, 'dog': 4, 'is': 6, 'white': 8, 'cat': 3, 'black': 2, 'and': 0, 'are': 1, 'friends': 5}
[[0 0 0 0 1 0 1 1 1]
 [0 0 1 1 0 0 1 1 0]
 [1 1 0 1 1 1 0 2 0]]


In [8]:
# Everything this cell needs is imported here so it does not rely on an
# earlier cell having brought classification_report into scope.
from sklearn import ensemble
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Bag-of-words counts for every e-mail.
# NOTE(review): the vocabulary is fitted on all rows of df['text'], i.e. before
# the train/test split below, so tokens that only occur in held-out e-mails
# still get a column. Consider fitting on the training rows only.
text_vec = CountVectorizer().fit_transform(df['text'])

# Hold out 45% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text_vec,
    df['spam'],
    test_size=0.45,
    random_state=42,
    shuffle=True,
)

classifier = ensemble.GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.5,
    max_depth=6,
    # Seed the boosting trees as well; without it the fitted model (and the
    # metrics printed below) can differ between runs despite the seeded split.
    random_state=42,
)
classifier.fit(X_train, y_train)

# Evaluate on the held-out rows.
predictions = classifier.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1942
           1       0.98      0.92      0.95       636

    accuracy                           0.98      2578
   macro avg       0.98      0.96      0.97      2578
weighted avg       0.98      0.98      0.98      2578



In [10]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
# Score the model on the rows it was fitted on.
pred = classifier.predict(X_train)

# Per-class precision / recall / F1.
print(classification_report(y_true=y_train, y_pred=pred))

# Confusion matrix, followed by one blank separator line.
print('Confusion Matrix: \n', confusion_matrix(y_true=y_train, y_pred=pred), end='\n\n')

# Overall fraction of correctly classified rows.
print('Accuracy: ', accuracy_score(y_true=y_train, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2418
           1       1.00      1.00      1.00       732

    accuracy                           1.00      3150
   macro avg       1.00      1.00      1.00      3150
weighted avg       1.00      1.00      1.00      3150

Confusion Matrix: 
 [[2418    0]
 [   0  732]]

Accuracy:  1.0


In [11]:
# Score the model on the held-out test rows.
pred = classifier.predict(X_test)

# Per-class precision / recall / F1.
print(classification_report(y_true=y_test, y_pred=pred))

# Confusion matrix, followed by one blank separator line.
print('Confusion Matrix: \n', confusion_matrix(y_true=y_test, y_pred=pred), end='\n\n')

# Overall fraction of correctly classified rows.
print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1942
           1       0.98      0.92      0.95       636

    accuracy                           0.98      2578
   macro avg       0.98      0.96      0.97      2578
weighted avg       0.98      0.98      0.98      2578

Confusion Matrix: 
 [[1928   14]
 [  49  587]]

Accuracy:  0.9755624515128006


In [14]:
from textblob import TextBlob

# Wrap every e-mail body in a TextBlob so its sentiment can be queried.
email_blob = list(map(TextBlob, df['text']))

# Attach each blob's polarity and subjectivity to df as new columns
# (this adds the columns to the existing frame in place).
df['tb_Pol'] = [blob.sentiment.polarity for blob in email_blob]
df['tb_Subj'] = [blob.sentiment.subjectivity for blob in email_blob]

# Show the first three rows, including the two new columns.
df.head(n=3)

Unnamed: 0,text,spam,tb_Pol,tb_Subj
0,Subject: naturally irresistible your corporate...,1,0.296607,0.546905
1,Subject: the stock trading gunslinger fanny i...,1,0.160317,0.562698
2,Subject: unbelievable new homes made easy im ...,1,0.040229,0.480581


In [13]:
!pip install -U textblob
!python -m textblob.download_corpora


Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     -------------------------------------- 636.8/636.8 kB 9.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\kavan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
