## Trying out [Fake.br](https://github.com/roneysco/Fake.br-Corpus) dataset



In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [25]:
# Path to the pre-processed Fake.br corpus, relative to the notebook's working directory.
DATA_PATH = '../Fake.br-Corpus/preprocessed/pre-processed.csv'

# Load the whole CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

Unnamed: 0,index,label,preprocessed_news
0,0,fake,katia abreu diz vai colocar expulsao moldura n...
1,1,fake,ray peita bolsonaro conservador fake entrevist...
2,2,fake,reinaldo azevedo desmascarado policia federal ...
3,3,fake,relatorio assustador bndes mostra dinheiro pub...
4,4,fake,radialista americano fala sobre pt vendem ilus...


In [26]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7200 entries, 0 to 7199
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   index              7200 non-null   int64 
 1   label              7200 non-null   object
 2   preprocessed_news  7200 non-null   object
dtypes: int64(1), object(2)
memory usage: 168.9+ KB


In [27]:
# Count how many articles carry each label to check the class balance.
df['label'].value_counts()

fake    3600
true    3600
Name: label, dtype: int64

### Transform label to an integer target (fake = 0, true = 1)

In [29]:
# Encode the textual label as an integer column: 'fake' -> 0, 'true' -> 1.
df['target'] = (
    df['label']
    .map({'fake': 0, 'true': 1})
    .astype(int)
)

In [30]:
# Preview the first rows again to confirm the new `target` column was added.
df.head(5)

Unnamed: 0,index,label,preprocessed_news,target
0,0,fake,katia abreu diz vai colocar expulsao moldura n...,0
1,1,fake,ray peita bolsonaro conservador fake entrevist...,0
2,2,fake,reinaldo azevedo desmascarado policia federal ...,0
3,3,fake,relatorio assustador bndes mostra dinheiro pub...,0
4,4,fake,radialista americano fala sobre pt vendem ilus...,0


## Tokenize

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

In [32]:
# Vocabulary size limit handed to the Keras Tokenizer via `num_words`.
MAX_FEATURES = 10000

# Build the word index from every article in the corpus.
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_FEATURES, split=' ')
tokenizer.fit_on_texts(df['preprocessed_news'].values)

# Turn each article into a list of word indices, then pad all of them to a
# common length so they form one 2-D integer array.
sequences = tokenizer.texts_to_sequences(df['preprocessed_news'].values)
X = pad_sequences(sequences)

In [33]:
# Dimensions of the padded matrix: (number of articles, padded sequence length).
X.shape

(7200, 3472)

In [34]:
# Display the padded index matrix (NumPy abbreviates large arrays when printing).
X

array([[   0,    0,    0, ...,  168, 1561,   39],
       [   0,    0,    0, ..., 6016,   59,    1],
       [   0,    0,    0, ...,  629, 1057,  786],
       ...,
       [   0,    0,    0, ..., 1863, 1001,  172],
       [   0,    0,    0, ...,  351,  134, 7224],
       [   0,    0,    0, ..., 5721,  561, 9358]], dtype=int32)

In [40]:
# Train/test split: hold out 20% of the samples; the fixed seed keeps the
# partition reproducible between runs.
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

# Class balance of the training portion: fake (0) first, then true (1).
print((y_train == 0).sum())
print((y_train == 1).sum())

2894
2866


### Let's try out some models!

#### Support Vector Machine

Baseado no notebook *ML_SUP_Categorical_Classification* e nas informações disponíveis na seção [Motivação para SVM](https://en.wikipedia.org/wiki/Support-vector_machine#Motivation) da Wikipédia.

**TODO:** tentar usar apenas o kernel linear.

In [44]:
from sklearn import model_selection
from sklearn import svm

In [45]:
# Randomized hyper-parameter search for a support-vector classifier,
# scored with 3-fold stratified cross-validation on the training split.
# NOTE(review): the saved run of this cell ended in a KeyboardInterrupt, so the
# search is presumably very slow on the full training matrix - consider a
# smaller search space (e.g. only the linear kernel) or a subsample.
estimator = svm.SVC()
cv = model_selection.StratifiedKFold(n_splits=3)

# Candidate values the search samples from.
kernels = ['linear', 'poly']
Cs = np.linspace(0.1, 2, 8)        # regularisation parameter C
degrees = [2, 3, 4]                # polynomial degree (relevant to the 'poly' kernel)
gammas = np.logspace(-5, 0, 8)     # kernel coefficient gamma

param_grid = dict(kernel=kernels, C=Cs, gamma=gammas, degree=degrees)

# Fix: the class is `RandomizedSearchCV`; the previous misspelling raised
# AttributeError before any fitting could start.
# `random_state` makes the sampled parameter combinations reproducible.
clf_svc = model_selection.RandomizedSearchCV(estimator=estimator,
                                             cv=cv,
                                             param_distributions=param_grid,
                                             random_state=101,
                                             n_jobs=-1)
clf_svc.fit(X_train, y_train)

# Best mean cross-validated score, followed by the winning hyper-parameters.
print(clf_svc.best_score_)
print(clf_svc.best_estimator_.kernel)
print(clf_svc.best_estimator_.C)
print(clf_svc.best_estimator_.degree)
print(clf_svc.best_estimator_.gamma)


KeyboardInterrupt: 