In [31]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
from transformers import BertTokenizer, BertModel
import tqdm


In [32]:
# Read the FakeNewsNet CSV into `df` and preview its first rows.
news_csv = 'FakeNewsNet.csv'
df = pd.read_csv(news_csv)
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [33]:
# Baseline: TF-IDF on the `title` column + logistic regression, scored on a
# 20% hold-out taken from the same dataset.
X, y = df['title'], df['real']

# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipe1 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression()),
])
pipe1.fit(X_train, y_train)

y_pred = pipe1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8297413793103449


In [34]:
# Show the label order learned by the classifier step of pipe1.
clf = pipe1['clf']
clf.classes_

array([0, 1])

In [35]:
# Pull the fitted steps out of pipe1.
vectorizer, clf = (pipe1.named_steps[step] for step in ('tfidf', 'clf'))

features = vectorizer.get_feature_names_out()
coefs = clf.coef_[0]  # first (here: only) row of the coefficient matrix

# Positions of the 10 largest coefficients, biggest first.
indices = np.argsort(coefs)[:-11:-1]

print("Top 10 features:")
for term, weight in zip(features[indices], coefs[indices]):
    print(f"{term}: {weight}")

Top 10 features:
season: 3.145466244369363
awards: 2.2256154380683912
star: 2.123261090317753
shares: 2.0552687875917997
reveals: 1.9582392397187565
bachelor: 1.9366692010540751
2018: 1.9115586032952376
birthday: 1.8831328757163561
celebrates: 1.8311584686455427
best: 1.7901102372419138


Carregando dois outros datasets de fake news a partir de arquivos CSV locais (`fake_or_real_news.csv` e `fake_and_real_news.csv`)

In [36]:
# Load the two extra corpora and encode their string labels as integers:
# 1 for the "real" label, 0 for every other value.
df_ = pd.read_csv('fake_or_real_news.csv').drop(columns=['Unnamed: 0'])
df_2 = pd.read_csv('fake_and_real_news.csv')

# A vectorised comparison replaces the per-row lambda. The two files spell the
# positive label differently ('REAL' vs 'Real'), so each gets its own literal.
df_['label'] = df_['label'].eq('REAL').astype('int64')
df_2['label'] = df_2['label'].eq('Real').astype('int64')

df_2

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0
1,U.S. conservative leader optimistic of common ...,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",1
3,Court Forces Ohio To Allow Millions Of Illega...,0
4,Democrats say Trump agrees to work on immigrat...,1
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,0
9896,Trump consults Republican senators on Fed chie...,1
9897,Trump lawyers say judge lacks jurisdiction for...,1
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,0


In [37]:
# Cross-dataset check: fit on the `text` column of `df_`, then score on the
# `title` column of `df`.
X_train = df_['text']
y_train = df_['label']
X_test = df['title']
y_test = df['real']

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
pipe2 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression()),
]).fit(X_train, y_train)

y_pred = pipe2.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Test Accuracy: 0.2526297637523711


In [38]:
# Pull the fitted steps out of pipe2.
vectorizer, clf = (pipe2.named_steps[step] for step in ('tfidf', 'clf'))

features = vectorizer.get_feature_names_out()
coefs = clf.coef_[0]  # first row of the coefficient matrix

# Positions of the 10 largest coefficients, biggest first.
indices = np.argsort(coefs)[:-11:-1]

print("Top 10 features:")
for term, weight in zip(features[indices], coefs[indices]):
    print(f"{term}: {weight}")

Top 10 features:
said: 7.943755141771254
cruz: 2.93321132026225
says: 2.907093336839406
state: 2.873533366064514
sanders: 2.6483833987626593
debate: 2.5384301414786523
candidates: 2.5225852259546833
president: 2.459059383470335
gop: 2.4301141638754307
conservative: 2.4265650705562147


* coeficiente > 0 → a palavra empurra a predição para a classe `clf.classes_[1]`
* coeficiente < 0 → a palavra empurra a predição para a classe `clf.classes_[0]`

Para problemas multiclasse (no modo “one‐vs‐rest” do LogisticRegression), `clf.coef_` vem como uma matriz de formato (n_classes, n_features), em que cada linha é o vetor de pesos de uma classe contra todas as outras. Basta percorrer cada linha para extrair as palavras que “puxam” para cada classe:

```python
# Extra cell: top features for each class of a multiclass model

vectorizer = pipe_multiclass.named_steps['tfidf']
clf        = pipe_multiclass.named_steps['clf']
features   = vectorizer.get_feature_names_out()
classes    = clf.classes_         # class labels
coefs      = clf.coef_            # shape = (n_classes, n_features)

for idx, label in enumerate(classes):
    class_weights = coefs[idx]    # weight row for this class
    # Positions of the 10 largest weights, biggest first.
    top_idx = np.argsort(class_weights)[:-11:-1]
    print(f"Top 10 palavras para classe {label}:")
    for i in top_idx:
        print(f"  {features[i]}: {class_weights[i]:.4f}")
    print()
```

In [39]:
# Cross-dataset check: fit on the `Text` column of `df_2`, then score on the
# `title` column of `df`. This vectorizer uses its defaults (no stop-word list).
X_train = df_2['Text']
y_train = df_2['label']
X_test = df['title']
y_test = df['real']

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
pipe3 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
]).fit(X_train, y_train)

y_pred = pipe3.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.6448956716675289


In [40]:
# Pull the fitted steps out of pipe3.
vectorizer, clf = (pipe3.named_steps[step] for step in ('tfidf', 'clf'))

features = vectorizer.get_feature_names_out()
coefs = clf.coef_[0]  # first row of the coefficient matrix

# Positions of the 10 largest coefficients, biggest first.
indices = np.argsort(coefs)[:-11:-1]

print("Top 10 features:")
for term, weight in zip(features[indices], coefs[indices]):
    print(f"{term}: {weight}")

Top 10 features:
said: 11.145925903318746
reuters: 7.924675900865941
on: 5.206562436742969
washington: 4.9900478411850475
president: 3.4506006799932547
in: 2.9151255145374857
senate: 2.8632878637116597
republican: 2.854094363516202
house: 2.8482007669064378
tax: 2.803505412363487
