# Clickbait or Not?

The main goal of this project is to use Natural Language Processing to classify article titles as clickbait or not clickbait.

### 1 Preprocessing

#### 1.1 Importing libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

%matplotlib inline

#### 1.2 Reading the datasets

In [None]:
# Load the two raw title lists, one CSV file per class.
click_bait, serious_titles = (
    pd.read_csv(csv_path)
    for csv_path in ('clickbait_titles.csv', 'non_clickbait_titles.csv')
)

#### 1.3 Adding the labels
---------------------------
#####       1 for clickbait titles
#####       0 for not-clickbait titles

In [None]:
# Tag every row with its class id: 1 = clickbait, 0 = not clickbait.
for frame, class_id in ((click_bait, 1), (serious_titles, 0)):
    frame['label'] = class_id

####  1.4 Removing useless columns

In [None]:
# Keep only the text and its class; every other column is dropped.
kept_columns = ['title', 'label']
click_bait = click_bait.loc[:, kept_columns]
serious_titles = serious_titles.loc[:, kept_columns]

In [None]:
# Show the first rows of each class, clickbait first.
for frame in (click_bait, serious_titles):
    display(frame.head())

In [None]:
# Number of titles available in each class.
n_clickbait = len(click_bait)
n_serious = len(serious_titles)
print('Items {Clickbait}: ', n_clickbait)
print('Items {Not-Clickbait}: ', n_serious)

#### 1.5 Concatenating both datasets in one

In [None]:
# Stack both classes row-wise into one frame with a fresh 0..n-1 index,
# then write the combined dataset to disk.
frames = [click_bait, serious_titles]
titles = pd.concat(frames, ignore_index=True)
titles.to_csv('clickbait_or_not.csv', encoding='utf-8')

In [None]:
# (rows, columns) of the combined dataset; echoed as the cell output.
titles.shape

#### 1.6 Defining the feature and the label variables

In [None]:
# X: raw title text (model input); y: 0/1 class label (target).
X, y = titles['title'], titles['label']

#### 1.7 Splitting the dataset into train and test

In [None]:
# Hold out a quarter of the titles for testing (scikit-learn's default split,
# spelled out here); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Report the shape of each partition ("tamanho do" is Portuguese for "size of").
train_report = ('tamanho do X_train: ', X_train.shape, '\t', 'tamanho do y_train: ', y_train.shape)
test_report = ('tamanho do X_test: ', X_test.shape, '\t', 'tamanho do y_test: ', y_test.shape)

print(*train_report)
print('---------------------------------------------------------------------------')
print(*test_report)

### 2 Creating the models

#### 2.1 Assigning the models

In [None]:
def _bag_of_words_pipeline(estimator):
    """Chain a fresh CountVectorizer (step 'cv') with the given classifier (step 'clf')."""
    return Pipeline(steps=[('cv', CountVectorizer()), ('clf', estimator)])


# Three candidate classifiers, each fed by its own CountVectorizer.
multinomial_clf = _bag_of_words_pipeline(MultinomialNB())
complement_clf = _bag_of_words_pipeline(ComplementNB())
svm_clf = _bag_of_words_pipeline(svm.SVC(kernel='linear'))

#### 2.2 Training the models

In [None]:
# Fit every pipeline (vectorizer + classifier) on the training split only.
multinomial_clf.fit(X_train, y_train)
complement_clf.fit(X_train, y_train)
# Last expression of the cell, so the fitted SVM pipeline is echoed as output.
svm_clf.fit(X_train, y_train)

#### 2.3 Predicting the test points

In [None]:
# Predicted 0/1 labels for the held-out titles, one array per model.
pred_mult, pred_complement, pred_svm = (
    model.predict(X_test)
    for model in (multinomial_clf, complement_clf, svm_clf)
)

### 3 Evaluating the performance of each model

#### 3.1  F1 Score

In [None]:
# F1 score of each model on the held-out test set.
# f1_score expects (y_true, y_pred), so the true labels go first. Formatting
# with '{:.2%}' avoids the float noise that round(score, 4) * 100 can print
# (e.g. 97.28999999999999%).
for model_name, predictions in (('MultinomialNB', pred_mult),
                                ('ComplementNB', pred_complement),
                                ('SVM', pred_svm)):
    print(model_name + ' Score:', '{:.2%}'.format(f1_score(y_test, predictions)))

#### 3.2  Confusion Matrix

In [None]:
# Confusion matrix of the MultinomialNB pipeline on the test set.
# confusion_matrix puts true labels on the rows and predictions on the columns.
# The cells are integer counts, so annotate them with 'd' instead of '.2f'
# (which rendered e.g. 1234 as "1234.00").
sns.heatmap(confusion_matrix(y_test, pred_mult), annot=True, fmt='d')
plt.title('MultinomialNB')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

#### 3.3 Precision/Recall/F1 Score

In [None]:
# Per-class precision, recall and F1 for the MultinomialNB pipeline, to 3 decimals.
print(classification_report(y_test, pred_mult, digits = 3))

#### 3.4 Accuracy Score

In [None]:
# Accuracy of the MultinomialNB predictions on the test set.
mult_accuracy = accuracy_score(y_test, pred_mult)
print('Accuracy Score:', mult_accuracy)

#### 3.5 ROC/AUC

In [None]:
# ROC curve and AUC for the MultinomialNB pipeline.
# roc_curve sweeps a decision threshold over continuous scores. Given the hard
# 0/1 predictions it can only produce a single operating point, which makes the
# curve and its AUC misleading. Use the predicted probability of the positive
# class instead (column 1 corresponds to label 1, clickbait).
clickbait_scores = multinomial_clf.predict_proba(X_test)[:, 1]

# NOTE: the misspelt name `treshold` is kept so any other cell that reads it still works.
fpr, tpr, treshold = roc_curve(y_test, clickbait_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal = performance of a random classifier
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

### 4 Validating with recent titles

In [None]:
# Hand-labelled Portuguese headlines used as a small sanity check.
# Each entry is (title, label) with 1 = clickbait and 0 = not clickbait, so a
# title and its label cannot drift out of alignment.
labelled_titles = [
    ('Escolha um ídolo do k-pop e nós indicaremos seu produto de beleza ideal', 1),
    ('Senador gay coloca Augusto Aras contra a parede: "Tenho subfamília? Sou doente?"', 0),
    ('Suas opiniões sobre estas tendências atuais da moda vão nos dizer se você faz parte da geração millennial ou da geração Z', 1),
    ('“Outras Ágathas virão”, diz oposição sobre pacote de Moro', 0),
    ('Planeje sua invasão à Área 51 e descubra qual E.T. você vai encontrar por lá', 1),
    ('10 livros com cartas de amor de gente meio passional', 1),
    ('“Recomendo que procure ajuda psiquiátrica”, diz Gilmar Mendes sobre Janot', 0),
    ('O pior é ter que concordar com o Gilmar, dizem aliados que romperam com Janot', 0),
    ('Cheesecake nunca é demais!', 1),
    ('O Facebook confirmou que políticos podem publicar o que quiserem, seja falso ou não', 0),
]

# Split the pairs back into the two parallel lists used by the following cells.
val = [title for title, _ in labelled_titles]
val_y = [label for _, label in labelled_titles]

In [None]:
# Classify the hand-labelled headlines with the MultinomialNB pipeline.
val_pred = multinomial_clf.predict(val)
# Last expression of the cell: the predicted labels are echoed as output.
val_pred

In [None]:
# F1 score on the hand-labelled headlines.
# f1_score expects (y_true, y_pred), so the reference labels go first. The
# value is printed as a percentage with two decimals rather than the raw
# product with 100, which can show long float tails such as 90.9090909090909%.
print('Validation Score: ', '{:.2%}'.format(f1_score(val_y, val_pred)))

### 5 Model persistence

#### 5.1 Saving the model

In [None]:
# Serialize the fitted MultinomialNB pipeline (vectorizer + classifier) to disk.
# `save` holds whatever joblib.dump returns; it is not used again in the visible cells.
save = joblib.dump(multinomial_clf, 'Naive Bayes Buzzfeed Classifier.pkl')

#### 5.2 Loading the model

In [None]:
# Restore the pipeline from the file written in 5.1.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load('Naive Bayes Buzzfeed Classifier.pkl')

#### 5.3 Predicting with the loaded model

In [None]:
# Round-trip check: run the reloaded pipeline on the same validation headlines
# (presumably to compare against val_pred from section 4).
loaded_model.predict(val)