In [None]:
#import the packages that we need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [None]:
# Load the job-postings CSV into a DataFrame and preview the first rows
df_job = pd.read_csv("fake_job_postings.csv")

df_job.head(n=5)

In [None]:
# Replace missing values (NaN) with empty strings so the text columns can be
# passed to the NLP step.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# The result is assigned back instead of using `inplace=True`, which keeps the
# data flow explicit when reading the notebook top to bottom.
df_job = df_job.replace(np.nan, '')

In [None]:
# Discard the columns that are not going to be run through the NLP step
df_job = df_job.drop(
    columns=[
        'location',
        'department',
        'salary_range',
        'telecommuting',
        'has_company_logo',
        'has_questions',
        'employment_type',
        'required_experience',
        'required_education',
        'industry',
        'function',
    ]
)

In [None]:
#checking that there are no missing values
# NOTE: NaN values were replaced with '' in an earlier cell, so info() counts
# those empty strings as non-null; blank text fields will not show up here.
df_job.info()

In [None]:
# Separate the target (`fraudulent`) from the remaining columns, then split the
# `description` text into train and test sets (33% test), stratified on the
# target and with a fixed random seed
y = df_job['fraudulent']
df_job = df_job.drop(columns='fraudulent')

X_train, X_test, y_train, y_test = train_test_split(
    df_job['description'],
    y,
    test_size=0.33,
    random_state=53,
    stratify=y,
)

In [None]:
# Build bag-of-words count vectors from the job descriptions
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# The vectorizer is fitted on the training text only; the test text is then
# encoded with that same fitted vectorizer.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(raw_documents=X_train)
count_test = count_vectorizer.transform(raw_documents=X_test)

In [None]:
# Create the Multinomial Naive Bayes classifier and fit it on the training
# count vectors (fit() returns the fitted estimator itself)
nb_classifier = MultinomialNB().fit(count_train, y_train)

# Predict a label for every test document: pred
pred = nb_classifier.predict(count_test)

# Accuracy of the predictions on the test set: score
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print(score)

# Confusion matrix with rows/columns ordered as labels [1, 0]: cm
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred, labels=[1, 0])
print(cm)

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False,title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heat map on the current figure.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels, one per row/column of `cm`, in the same order.
    normalize : bool, default False
        If True, every row is divided by its row sum before plotting, so the
        plotted values are per-row fractions. Rows that sum to zero are left
        as zeros.
    title : str, default 'Confusion matrix'
        Title of the plot.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap passed to `plt.imshow`.

    Notes
    -----
    Prints one line stating whether normalization was applied. The function
    draws with pyplot and does not call `plt.show()`; the caller decides when
    to display the figure.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so that the image actually shows the
    # normalized values when normalize=True.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 -> NaN for a row with no samples: dividing a row of
        # zeros by 1 leaves it as zeros.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each cell's value on top of the image. Integer matrices are shown
    # as whole counts, anything else with two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells above half of the maximum get white text, the rest black text,
    # so the number contrasts with the cell colour.
    thresh = cm.max() / 2.
    n_rows, n_cols = cm.shape
    for i in range(n_rows):
        for j in range(n_cols):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Class names follow the labels=[1, 0] order used when `cm` was computed:
# 1 -> "Fake", 0 -> "Real"
plot_confusion_matrix(cm=cm, classes=["Fake", "Real"])
plt.show()