In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import numpy as np
from bs4.element import Comment
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud


plt.style.use('fivethirtyeight')


In [81]:
def tag_visible(element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

class NaiveBayes():

    def __init__(self, df, stop_words):
        self.df = df
        self.train_cv = None
        self.train_count_matrix = None
        self.train_tfidf_matrix = None
        self.nb_model = None
        self.stop_words = stop_words


    def generate_train_test(self, train_size=0.75):
        self.y = self.df.pop('fraud').values
        self.X = self.df['desc_text'].values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                                                                self.X, self.y, 
                                                                stratify=self.y, 
                                                                train_size=train_size,
                                                                random_state = 123)
    def join_list_of_strings(self, lst):
        """
        Joins the list into a string
        
        Params
        lst: list of words
        """
        return [" ".join(x) for x in lst]

    def remove_X_train_stops(self):
        lower = [x.lower() for x in self.X_train]
        split_lst = [txt.split() for txt in lower]
        self.stops_removed_lst = []

        for split in split_lst:
            stops = [w for w in split if w not in self.stop_words]
            stop_num = [w for w in stops if not (w.isdigit() 
                        or w[0] == '-' and w[1:].isdigit())]
            self.stops_removed_lst.append(stop_num)
        self.X_train = self.join_list_of_strings(self.stops_removed_lst)

    def remove_X_test_stops(self):
        lower = [x.lower() for x in self.X_test]
        split_lst = [txt.split() for txt in lower]
        self.stops_removed_lst = []

        for split in split_lst:
            stops = [w for w in split if w not in self.stop_words]
            stop_num = [w for w in stops if not (w.isdigit() 
                        or w[0] == '-' and w[1:].isdigit())]
            self.stops_removed_lst.append(stop_num)
        self.X_test = self.join_list_of_strings(self.stops_removed_lst)

    def tf_idf_matrix(self):
            """
            Sets up a word count matrix, a tfidf matrix, and a CountVectorizer for
            the documents in the directory

            Params
            documents: list of strings to be vectorized

            Returns
            count_matrix: matrix with word counts
            tfidf_matrix: a tfidf matrix of the documents
            cv: CountVectorizer object for the documents
            """
            self.train_cv = CountVectorizer()
            self.train_count_matrix = self.train_cv.fit_transform(self.X_train)
            tfidf_transformer = TfidfTransformer()
            self.train_tfidf_matrix = tfidf_transformer.fit_transform(self.train_count_matrix)

    def naive_bayes_model(self):
            """
            Sets up a naive bayes model for the documents in the directory

            Params
            directory: directory for the documents
            stop_words: list of stop_words for word filtration
            technique: technique: str choose from ['porter', 'snowball','wordnet']

            Returns
            nb_model: a naive bayes model for the documents in the directory
            cv: CountVectorizer object for the documents
            """
            self.nb_model = MultinomialNB()
            self.nb_model.fit(self.train_tfidf_matrix, self.y_train)

    def return_top_n_words(self, n=7):
            """
            Prints out the top n words for each document in the categories for the 
            documents in the directory

            Params
            directory: directory for the documents
            stop_words: list of stop_words for word filtration
            documents: a list of the categories (folders) in the directory
            technique: technique: str choose from ['porter', 'snowball','wordnet']

            """
            feature_words = self.train_cv.get_feature_names()
            categories = self.nb_model.classes_
            self.top_words_dic = {}
            for cat in range(len(categories)):
                print(f"\n Target: {cat}, name: {categories[cat]}")
                log_prob = self.nb_model.feature_log_prob_[cat]
                i_topn = np.argsort(log_prob)[::-1][:n]
                features_topn = [feature_words[i] for i in i_topn]
                self.top_words_dic[categories[cat]] = features_topn
                print(f"Top {n} tokens: ", features_topn)
    
    def get_accuracy_classification_report(self):
        """
        Prints out and returns the accuracy score from the prediction vs the actuals
        for the test set

        Params
        train_docs: list of strs used to train on
        test_docs: list of strs used to test
        test_targes: list of strs for the test target values

        Returns
        Accuracy score for the model
        """
        self.nb_pipeline = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('model', MultinomialNB()),
                            ])

        self.nb_pipeline.fit(self.X_train, self.y_train)
        self.predicted = self.nb_pipeline.predict(self.X_test)
        self.accuracy = np.mean(self.predicted == self.y_test)
        print("\nThe accuracy on the test set is {0:0.3f}.".format(self.accuracy))
        self.class_report = classification_report(self.y_test, self.predicted, 
                                                  digits=3,output_dict=True)
    def predic_probablility(self, X):
        return self.nb_pipeline.predict_proba(X)

In [82]:
df = pd.read_json('data/data.json')
descriptions = df.description
df['fraud'] = np.where((df['acct_type'] == 'fraudster') | 
                    (df['acct_type'] == 'fraudster_event') |
                    (df['acct_type'] == 'fraudster_att'), 1, 0)
descriptions = df.description
df['desc_text'] = 0
for i in range(df.shape[0]):
    df.iloc[i,45] = text_from_html(descriptions[i])


In [84]:
df_fraud = df[df['fraud'] == 1]
df_nf = df[df['fraud'] == 0]
df_nf_subset = df_nf.sample(n=df_fraud.shape[0], replace=False)
df_combined = df_fraud.append(df_nf_subset, ignore_index=True)

In [85]:
print(df_fraud.shape)
print(df_nf.shape)
print(df_nf_subset.shape)
print(df_combined.shape)

(1293, 46)
(13044, 46)
(1293, 46)
(2586, 46)


In [86]:
stop_words = stopwords.words('english')
extra_stops = ['tickets', '00']
stop_words.extend(extra_stops)

nb = NaiveBayes(df_combined, stop_words)
nb.generate_train_test()
nb.remove_X_train_stops()
nb.remove_X_test_stops()

test_counts_fraud = [x for x in nb.y_test if x == 1]
test_counts_notfraud = [x for x in nb.y_test if x == 0]
train_counts_fraud = [x for x in nb.y_train if x == 1]
train_counts_notfraud = [x for x in nb.y_train if x == 0]
print(len(test_counts_fraud)/len(test_counts_notfraud))
print(len(train_counts_fraud)/len(train_counts_notfraud))

0.9969135802469136
1.001031991744066


In [91]:
nb.tf_idf_matrix()
nb.naive_bayes_model()
nb.return_top_n_words()
nb.get_accuracy_classification_report()
probs = nb.predic_probablility(df['description'])
print(type(probs))
print(probs.shape)



 Target: 0, name: 0
Top 7 tokens:  ['event', 'business', '00', 'please', 'com', 'us', 'new']

 Target: 1, name: 1
Top 7 tokens:  ['de', 'event', 'party', 'conference', 'et', 'live', 'get']

The accuracy on the test set is 0.742.
<class 'numpy.ndarray'>
(14337, 2)


In [92]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Data columns (total 46 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   acct_type           14337 non-null  object 
 1   approx_payout_date  14337 non-null  int64  
 2   body_length         14337 non-null  int64  
 3   channels            14337 non-null  int64  
 4   country             14256 non-null  object 
 5   currency            14337 non-null  object 
 6   delivery_method     14321 non-null  float64
 7   description         14337 non-null  object 
 8   email_domain        14337 non-null  object 
 9   event_created       14337 non-null  int64  
 10  event_end           14337 non-null  int64  
 11  event_published     14238 non-null  float64
 12  event_start         14337 non-null  int64  
 13  fb_published        14337 non-null  int64  
 14  gts                 14337 non-null  float64
 15  has_analytics       14337 non-null  int64  
 16  has_

In [95]:
print(probs[:10])

[[0.57838133 0.42161867]
 [0.93469005 0.06530995]
 [0.77651694 0.22348306]
 [0.78722535 0.21277465]
 [0.89012939 0.10987061]
 [0.68672512 0.31327488]
 [0.49974214 0.50025786]
 [0.82177585 0.17822415]
 [0.58239931 0.41760069]
 [0.82177585 0.17822415]]


In [38]:
nb.return_top_n_words()

NameError: name 'self' is not defined