In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
import os
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

%matplotlib inline
plt.style.use('ggplot')

The code below is taken from the following website and has been modified for jupyter notebook: 

https://developers.google.com/gmail/api/quickstart/python

Please get your own authentication json before attempting to run the function.

In [24]:
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

def get_gmail_messages():
    """Shows basic usage of the Gmail API.
    Lists the user's Gmail labels.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('gmail', 'v1', credentials=creds)

    # Call the Gmail API
    results = service.users().messages().list(userId='me', includeSpamTrash=True ,maxResults=10).execute()
    
    id_arr = results.get('messages', [])
    
    message_arr = [get_message_snippet(service, id_dict) for id_dict in id_arr]
    
    service.close()
    
    return message_arr
       
def get_message_snippet(service, id_dict):
    return service.users().messages().get(userId='me',id=id_dict['id'],format='metadata').execute()['snippet']

The below code is to generate a pandas dataframe from the message snippets obtained from gmail API

In [25]:
email_dict = {'email':get_gmail_messages()}

email_dict

{'email': ['r/wallstreetbets: HERE COMES THE MONEY reddit r/wallstreetbets · Posted by u/doutorgama 5h ago HERE COMES THE MONEY Read More 48014 Votes 998 Comments Hide r/wallstreetbets r/politics · Posted by u/',
  'Hey Ryan Low, looks like you&#39;ve been playing CodeCombat with Python since June. If you are feeling stuck on The Final Kithmaze or just want to accelerate your coding skills, we&#39;re now offering',
  'Answer: I am just going to answer this question based on the past few intervie.\u200b.\u200b.\u200b Quora Ryan Low&#39;s Digest Top Stories For You Which has a better job market, Tableau or Power BI? Melvin Muhan',
  'Pleasure starts when you enter your home To ensure delivery to your inbox, please add crm.marketing@samsung.com to your address book. If you cannot view this email properly, please click here Image',
  'Quickstart was granted access to your Google Account kyzersoz@gmail.com If you did not grant access, you should check this activity and secure your account. 

In [16]:
email_df = pd.DataFrame(email_dict)
email_df

Unnamed: 0,email
0,r/wallstreetbets: HERE COMES THE MONEY reddit ...
1,"Hey Ryan Low, looks like you&#39;ve been playi..."
2,Answer: I am just going to answer this questio...
3,Pleasure starts when you enter your home To en...
4,Quickstart was granted access to your Google A...
5,Learn the latest data-driven best practices to...
6,Don&#39;t miss this MEGA EVENT of 2021 | View ...
7,"Hi Ryan, 💌You&#39;re invited to the Video Game..."
8,"USD 250000 allocation, Dear Valued Customer, C..."
9,"To unsubscribe to this free service, click her..."


I have taken the model below from my own spam prediction model: https://www.kaggle.com/ryanlowtianmun/spam-filter-exploration

Because of this, there is minimal data exploration and the code is intended to quickly build up the model to test against the gmail snippets.

In [6]:
df = pd.read_csv('Downloads/spam.csv',encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [7]:
for index, rows in df.iterrows():
    for r in rows[2:]:
        if type(r) is str:
            df.loc[index,'v2'] = df.loc[index,'v2'] + ' ' + r
df = df.rename(columns={'v1': 'class','v2': 'text'})
df = df.replace('ham',0) 
df = df.replace('spam',1)
df['class'].astype('int')

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: class, Length: 5572, dtype: int32

In [8]:
X = df.drop('class',axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X['text'],y,test_size=0.2,random_state=42)

count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

multi_nb = MultinomialNB()
multi_nb.fit(count_train,y_train)
y_pred = multi_nb.predict(count_test)
metrics.accuracy_score(y_test,y_pred)

0.9838565022421525

In [19]:
count_email = count_vectorizer.transform(email_df.email.values)
email_pred = multi_nb.predict(count_email)

In [20]:
email_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1], dtype=int64)

In [21]:
email_df

Unnamed: 0,email
0,r/wallstreetbets: HERE COMES THE MONEY reddit ...
1,"Hey Ryan Low, looks like you&#39;ve been playi..."
2,Answer: I am just going to answer this questio...
3,Pleasure starts when you enter your home To en...
4,Quickstart was granted access to your Google A...
5,Learn the latest data-driven best practices to...
6,Don&#39;t miss this MEGA EVENT of 2021 | View ...
7,"Hi Ryan, 💌You&#39;re invited to the Video Game..."
8,"USD 250000 allocation, Dear Valued Customer, C..."
9,"To unsubscribe to this free service, click her..."


In [22]:
email_df['prediction'] = email_pred

In [23]:
email_df

Unnamed: 0,email,prediction
0,r/wallstreetbets: HERE COMES THE MONEY reddit ...,0
1,"Hey Ryan Low, looks like you&#39;ve been playi...",0
2,Answer: I am just going to answer this questio...,0
3,Pleasure starts when you enter your home To en...,0
4,Quickstart was granted access to your Google A...,0
5,Learn the latest data-driven best practices to...,0
6,Don&#39;t miss this MEGA EVENT of 2021 | View ...,0
7,"Hi Ryan, 💌You&#39;re invited to the Video Game...",1
8,"USD 250000 allocation, Dear Valued Customer, C...",1
9,"To unsubscribe to this free service, click her...",1
