<a href="https://colab.research.google.com/github/joedysonbezerra/classificadores-de-fake-news/blob/main/tcc_dataset_english_cloud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Bibliotecas necessárias**

In [5]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sns

# Criando a base de dados com notícias fake



In [None]:
# Build the fake-news table (label 0) from three CSV sources. Each source is
# reduced to the columns that are not explicitly dropped below.

# Source 1: discard the 'date' and 'subject' columns.
news_fake_1 = pd.read_csv("/content/drive/MyDrive/tcc-dataset/fake.csv")
news_fake_1 = news_fake_1.drop(columns=['date', 'subject'])

# Source 2: keep only the rows whose 'language' is 'english' (rows with a
# missing language are discarded too), then drop the metadata columns.
news_fake_2 = pd.read_csv("/content/drive/MyDrive/tcc-dataset/fake2.csv")
news_fake_2 = news_fake_2.loc[news_fake_2['language'] == 'english']
news_fake_2 = news_fake_2.drop(columns=['uuid' , 'ord_in_thread','author','published','language','crawled','site_url','country','domain_rank','thread_title','spam_score','main_img_url','replies_count','participants_count','likes','comments','shares','type'])

# Source 3: drop the metadata columns and rename 'content' to 'text' so the
# column name matches the 'text' column used by the rest of the notebook.
news_fake_3 = pd.read_csv("/content/drive/MyDrive/tcc-dataset/fake3.csv")
news_fake_3 = news_fake_3.drop(columns=['Unnamed: 0','id' , 'domain','type','url','scraped_at','inserted_at','updated_at','authors','keywords','meta_keywords','meta_description','tags','summary','source'])
news_fake_3 = news_fake_3.rename(columns={'content': 'text'})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (row-wise, original index labels kept).
news_fake = pd.concat([news_fake_1, news_fake_2, news_fake_3])
news_fake['label'] = 0




  interactivity=interactivity, compiler=compiler, result=result)


# Criando a base de dados com notícias reais


In [None]:
# Build the real-news table (label 1) from four CSV sources.

# Source 1: discard the 'date' and 'subject' columns.
news_true_1 = pd.read_csv("/content/drive/MyDrive/tcc-dataset/true.csv")
news_true_1 = news_true_1.drop(columns=['date','subject'])

# Sources 2-4 are cleaned identically: drop the metadata columns and rename
# 'content' to 'text'. One generator replaces three copy-pasted stanzas.
news_true_2, news_true_3, news_true_4 = (
    pd.read_csv(f"/content/drive/MyDrive/tcc-dataset/true{source_number}.csv")
    .drop(columns=['Unnamed: 0','id','publication','author','date','year','month','url'])
    .rename(columns={'content': 'text'})
    for source_number in (2, 3, 4)
)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (row-wise, original index labels kept).
news_true = pd.concat([news_true_1, news_true_2, news_true_3, news_true_4])
news_true['label'] = 1




# Criando uma Base de Dados

In [None]:
# Stack the real and fake articles into a single table.
# (DataFrame.append was removed in pandas 2.0, hence pd.concat.)
news = pd.concat([news_true, news_fake])

# Discard every row that has a missing value in any column.
news = news.dropna()

# Work on a 10% random subsample. The seed is fixed so that re-running the
# notebook selects the same rows and therefore reproduces the same results.
news = news.sample(frac=0.1, random_state=42)

# Number of articles per class (0 = fake, 1 = real).
news.groupby('label').label.count()

label
0    45937
1    16268
Name: label, dtype: int64

# **Processando**


## **TF-IDF - Com Tratamento**


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings, gathered in one place so they are easy to tune.
tfidf_options = dict(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  # tokens are runs of two or more word characters
    ngram_range=(1, 2),       # unigrams and bigrams
    lowercase=True,
    max_features=10000,
)
config_tfidf = TfidfVectorizer(**tfidf_options)

# NOTE(review): this fits the vectorizer on every document in `news`, i.e.
# before any train/test separation has happened.
tfidf = config_tfidf.fit_transform(news['text'])



## **Separação da base de teste e treino**


In [None]:
from sklearn.model_selection import train_test_split

# Split the raw documents first, so the held-out 20% never influences the
# TF-IDF vocabulary or IDF weights. Fitting the vectorizer on all documents
# and splitting afterwards leaks test-set statistics into the features.
text_treino, text_teste, y_treino, y_teste = train_test_split(
    news.text, news.label, test_size=0.2, random_state=42)

# Refit the vectorizer configured in the TF-IDF cell on the training documents
# only, then project the test documents onto that same vocabulary.
x_treino = config_tfidf.fit_transform(text_treino)
x_teste = config_tfidf.transform(text_teste)

## **Matriz de confusão - Plot**

In [16]:
def confusion_matrix_plot(cf_matrix):
  """Draw an annotated heatmap of a binary (2x2) confusion matrix.

  Each cell shows its role and its count. Roles are assigned in row-major
  order: true negative, false positive, false negative, true positive. This
  matches the layout of the matrices passed by this notebook, which come
  from ``sklearn.metrics.confusion_matrix(y_true, y_pred)`` (rows are the
  actual class, columns the predicted class).

  Args:
    cf_matrix: 2x2 array-like of counts (NumPy array or nested lists).

  Raises:
    ValueError: if ``cf_matrix`` is not 2x2. The four cell names only make
      sense for a binary problem.
  """
  # Accept nested lists as well as arrays, and fail early with a clear message
  # instead of a shape-mismatch error from inside seaborn.
  cf_matrix = np.asarray(cf_matrix)
  if cf_matrix.shape != (2, 2):
    raise ValueError(
        f"confusion_matrix_plot expects a 2x2 matrix, got shape {cf_matrix.shape}")

  group_names = ['Verdadeiro Negativo','Falso Positivo','Falso Negativo','Verdadeiro Positivo']

  # Counts rendered without decimals, in the same row-major order as the names.
  group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]

  labels_confusion_matrix = np.asarray(
      [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]).reshape(2, 2)

  # fmt='' because the annotations are already formatted strings.
  ax = sns.heatmap(cf_matrix, annot=labels_confusion_matrix, fmt='', cmap='Blues')

  # Label the axes so the figure can be read on its own.
  ax.set_xlabel('Classe prevista')
  ax.set_ylabel('Classe real')

## **LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

c_values = [ 1.0, 100, 300]

# The 'newton-cg' and 'lbfgs' solvers only support the l2 penalty; asking them
# for l1 makes every such fit fail (scored as NaN) and wastes 6 of the 18
# candidates in each of the 10 folds. Listing the valid combinations explicitly
# searches exactly the same working configurations as before.
param_grid = [
    dict(solver=['newton-cg', 'lbfgs', 'liblinear'], penalty=['l2'], C=c_values),
    dict(solver=['liblinear'], penalty=['l1'], C=c_values),
]

# 10-fold stratified cross-validation, selecting on accuracy and refitting the
# best configuration on the whole training set.
cv = StratifiedKFold(n_splits=10)
grid = GridSearchCV(estimator = model,
                    param_grid = param_grid,
                    cv = cv,
                    scoring = 'accuracy',
                    refit = 'accuracy',
                    n_jobs=-1,
                    verbose=10)
grid.fit(x_treino,y_treino)

# Previously `best_params_` was evaluated mid-cell and silently discarded.
print(grid.best_params_)

# Mean and spread of the cross-validated accuracy for every candidate.
pd.DataFrame(grid.cv_results_)[['params',
'mean_test_score',
'std_test_score',]]

In [None]:
# Predict the held-out test split with the current `grid` search object and
# report the resulting accuracy. `predict` is reused by the plotting cell below.
predict = grid.predict(x_teste)
accuracy_score(y_teste, predict)

In [None]:
# Heatmap of the test-set confusion matrix (actual labels vs. `predict`).
confusion_matrix_plot(confusion_matrix(y_teste, predict))

## **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the forests (bootstrap samples and feature subsets), and
# therefore the reported scores, reproducible across runs.
model = RandomForestClassifier(random_state=42)
n_estimators = [100, 1000]
max_features = ['sqrt', 'log2']

param_grid = dict(n_estimators=n_estimators,max_features=max_features)

# 10-fold stratified cross-validation, selecting on accuracy and refitting the
# best configuration on the whole training set.
cv = StratifiedKFold(n_splits=10)
grid = GridSearchCV(estimator = model,
                    param_grid = param_grid,
                    cv = cv,
                    scoring = 'accuracy',
                    refit = 'accuracy',
                    n_jobs=-1,
                    verbose=10)
grid.fit(x_treino,y_treino)

# Previously `best_params_` was evaluated mid-cell and silently discarded.
print(grid.best_params_)

# Mean and spread of the cross-validated accuracy for every candidate.
pd.DataFrame(grid.cv_results_)[['params',
'mean_test_score',
'std_test_score',]]

In [None]:
# Predict the held-out test split with the current `grid` search object and
# report the resulting accuracy. `predict` is reused by the plotting cell below.
predict = grid.predict(x_teste)
accuracy_score(y_teste, predict)

In [None]:
# Heatmap of the test-set confusion matrix (actual labels vs. `predict`).
confusion_matrix_plot(confusion_matrix(y_teste, predict))

## **SVM**

In [None]:
from sklearn.svm import SVC

model = SVC()
kernel = ['linear','rbf',]
C = [100, 300]

param_grid = dict(kernel=kernel,C=C)

# 10-fold stratified cross-validation, selecting on accuracy and refitting the
# best configuration on the whole training set.
cv = StratifiedKFold(n_splits=10)
grid = GridSearchCV(estimator = model,
                    param_grid = param_grid,
                    cv = cv,
                    scoring = 'accuracy',
                    refit = 'accuracy',
                    n_jobs=-1,
                    verbose=100)

# The original fitted on `x_treino_2` / `y_treino_2`, names that no cell in
# this notebook defines, so the cell raised NameError on a fresh kernel. Fit
# on the same training split as every other model.
grid.fit(x_treino,y_treino)

# Previously `best_params_` was evaluated mid-cell and silently discarded.
print(grid.best_params_)

# Mean and spread of the cross-validated accuracy for every candidate.
pd.DataFrame(grid.cv_results_)[['params',
'mean_test_score',
'std_test_score',]]

In [None]:
# Predict the held-out test split with the current `grid` search object and
# report the resulting accuracy. `predict` is reused by the plotting cell below.
predict = grid.predict(x_teste)
accuracy_score(y_teste, predict)

In [None]:
# Heatmap of the test-set confusion matrix (actual labels vs. `predict`).
confusion_matrix_plot(confusion_matrix(y_teste, predict))

## **MLP**

In [None]:
from sklearn.neural_network import MLPClassifier

# A fixed seed makes weight initialisation and batch shuffling, and therefore
# the reported scores, reproducible across runs.
model = MLPClassifier(random_state=42)
param_grid = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.05],
}

# 10-fold stratified cross-validation, selecting on accuracy and refitting the
# best configuration on the whole training set.
cv = StratifiedKFold(n_splits=10)
grid = GridSearchCV(estimator = model,
                    param_grid = param_grid,
                    cv = cv,
                    scoring = 'accuracy',
                    refit = 'accuracy',
                    n_jobs=-1,
                    verbose=10)
grid.fit(x_treino,y_treino)

# Previously `best_params_` was evaluated mid-cell and silently discarded.
print(grid.best_params_)

# Mean and spread of the cross-validated accuracy for every candidate.
pd.DataFrame(grid.cv_results_)[['params',
'mean_test_score',
'std_test_score',]]


In [None]:
# Predict the held-out test split with the current `grid` search object and
# report the resulting accuracy. `predict` is reused by the plotting cell below.
predict = grid.predict(x_teste)
accuracy_score(y_teste, predict)

In [None]:
# Heatmap of the test-set confusion matrix (actual labels vs. `predict`).
confusion_matrix_plot(confusion_matrix(y_teste, predict))

## **Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

# Ten smoothing values, log-spaced from 1 down to 1e-9.
param_grid =  {'var_smoothing': np.logspace(0,-9, num=10)}

# 10-fold stratified cross-validation, selecting on accuracy and refitting the
# best configuration on the whole training set.
cv = StratifiedKFold(n_splits=10)
grid = GridSearchCV(estimator = model,
                    param_grid = param_grid,
                    cv = cv,
                    scoring = 'accuracy',
                    refit = 'accuracy',
                    n_jobs=1,
                    verbose=10)

# The features are converted to a dense array before fitting GaussianNB.
# `.todense()` returns an np.matrix, which current scikit-learn rejects;
# `.toarray()` yields a plain ndarray with the same values.
grid.fit(x_treino.toarray(),y_treino)

# Previously `best_params_` was evaluated mid-cell and silently discarded.
print(grid.best_params_)

# Mean and spread of the cross-validated accuracy for every candidate.
pd.DataFrame(grid.cv_results_)[['params',
'mean_test_score',
'std_test_score',]]


In [None]:
# Predict the held-out test split and report the accuracy. The test features
# are densified with `.toarray()` (a plain ndarray) rather than `.todense()`,
# whose np.matrix result is rejected by current scikit-learn.
predict = grid.predict(x_teste.toarray())
accuracy_score(y_teste, predict)

In [None]:
# Heatmap of the test-set confusion matrix (actual labels vs. `predict`).
confusion_matrix_plot(confusion_matrix(y_teste, predict))