In [28]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline, Trainer, TrainingArguments

In [29]:
torch.cuda.is_available()

False

In [30]:
df = pd.read_csv("/Users/konraddrees/Documents/GitHub/Fake-News-Detector/detector-backend/rawdata/WELFake_Dataset.csv")

In [31]:
df = df.sample(frac=0.01)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 721 entries, 61370 to 50083
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  721 non-null    int64 
 1   title       714 non-null    object
 2   text        721 non-null    object
 3   label       721 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 28.2+ KB


In [33]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,label
61370,61370,ARNOLD SCHWARZENEGGER Sends A Message To Liber...,,1
2189,2189,WOW! “We Mexicans Need To Kill Donald Trump Be...,And now a message of peace and unity from one ...,1
60609,60609,Jimmy Carter recovers from dehydration scare i...,"WINNIPEG, Manitoba (Reuters) - Former U.S. Pre...",0
51565,51565,2 Friars’ Mission: Reviving a Brooklyn Church ...,"The two Franciscan friars, complete with rob...",0
39431,39431,Boy With Autism Makes His First Friend Ever An...,Approximately 1 in 68 children has an autism s...,1
47839,47839,"Something Truly Extraordinary, And Refreshing...",The CNN debate between Democratic candidates H...,1
42729,42729,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,1
37882,37882,Vice President-elect Pence says 'new hope dawn...,WASHINGTON (Reuters) - U.S. Vice President-ele...,0
10893,10893,Trump to Republican senators: Don't leave town...,WASHINGTON (Reuters) - U.S. President Donald T...,0
46977,46977,White House: Obama May Leave the Country if Tr...,0 comments \nThe White house is refusing to de...,1


In [34]:
df["text"] = df["text"].str.replace(r"^\s*[A-Z\s,.]+\s*\((Reuters|REUTERS)\)\s*-\s*", "", case=False, regex=True)
df["text"] = df["text"].str.replace(r"^\s*\((Reuters|REUTERS)\)\s*-\s*", "", case=False, regex=True)
#df[df["text"].str.contains("reuters", regex=True, case=False)]

In [35]:
df = df.dropna().drop_duplicates(subset=["text"])
df

Unnamed: 0.1,Unnamed: 0,title,text,label
61370,61370,ARNOLD SCHWARZENEGGER Sends A Message To Liber...,,1
2189,2189,WOW! “We Mexicans Need To Kill Donald Trump Be...,And now a message of peace and unity from one ...,1
60609,60609,Jimmy Carter recovers from dehydration scare i...,"Former U.S. President Jimmy Carter, appearing ...",0
51565,51565,2 Friars’ Mission: Reviving a Brooklyn Church ...,"The two Franciscan friars, complete with rob...",0
39431,39431,Boy With Autism Makes His First Friend Ever An...,Approximately 1 in 68 children has an autism s...,1
...,...,...,...,...
29464,29464,‘Today is one of the heaviest days of my life’,"I’ Two of Mohammed' \nGenerally, I use Mohamme...",1
7162,7162,COMMUNIST Students THREATEN “Students For Trum...,The breaking point is fast approaching in a di...,1
36932,36932,Trump to nominate Richard Grenell to be ambass...,President Donald Trump intends to nominate for...,0
52898,52898,WATCH: Florida Gov. Rick Scott Disrespects Pr...,Rather than call the sitting president who cou...,1


In [36]:
df.groupby("label").size()

label
0    343
1    362
dtype: int64

In [37]:
X_train, X_val, y_train, y_val = train_test_split(df["text"].tolist(), df["label"].tolist(), test_size=0.25, random_state=141, stratify=df["label"])

In [38]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [39]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=64)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=64)

In [41]:
class FakeNewsDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item["labels"] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

In [42]:
train_dataset = FakeNewsDataset(train_encodings, y_train)
val_dataset = FakeNewsDataset(val_encodings, y_val)

In [43]:
training_args = TrainingArguments(
    output_dir="./checkpoints",
    num_train_epochs=3,
    learning_rate=5e-2,
    eval_strategy="epoch"
)

In [44]:
X_train

['A Russian bank has reported to U.S. authorities that mysterious communications resumed recently between one of its computers and an email server tied to President Trump s business empire, and it has developed evidence the new activity may be the work of a hacker trying to create a political hoax, Circa has learned.Alfa Bank is asking the U.S. Justice Department for help solving the mystery and pledged its full cooperation.Alfa wants U.S. authorities to help unmask a computer inside the United States that it believes has been used to launch cyberattacks spoofing the appearance of a backdoor communication channel between Moscow and America s 45th president, according to a source directly familiar with the bank s request.The bank believes  these malicious attacks are designed to create the false impression that Alfa Bank has a secretive relationship with the Trump Organization,  the source said, speaking on condition of anonymity.Alfa Bank has insisted since media stories began appearin

In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [46]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.696545
2,No log,0.721819
3,No log,0.694584




TrainOutput(global_step=198, training_loss=7.0477005159012, metrics={'train_runtime': 36.6903, 'train_samples_per_second': 43.172, 'train_steps_per_second': 5.397, 'total_flos': 26228544933888.0, 'train_loss': 7.0477005159012, 'epoch': 3.0})

In [47]:
model.save_pretrained("./models/fake_news_model")
tokenizer.save_pretrained("./models/fake_news_tokenizer")

('./models/fake_news_tokenizer/tokenizer_config.json',
 './models/fake_news_tokenizer/special_tokens_map.json',
 './models/fake_news_tokenizer/vocab.txt',
 './models/fake_news_tokenizer/added_tokens.json')

In [48]:
pipe = pipeline(
    "text-classification",
    model="./models/fake_news_model",
    tokenizer="./models/fake_news_tokenizer",
    device=0 if torch.cuda.is_available() else -1
)

Device set to use cpu


In [64]:
pipe("OBAMA")

[{'label': 'LABEL_1', 'score': 0.5443457961082458}]