In [1]:
import pandas as pd

# Read the CSV files into DataFrames
fake_news = pd.read_csv(
    'https://fake-news-ml-project.s3.eu-north-1.amazonaws.com/Fake.csv')
true_news = pd.read_csv(
    'https://fake-news-ml-project.s3.eu-north-1.amazonaws.com/True.csv')

# Set the values of the target column
fake_news['class'] = 0
true_news['class'] = 1

# Merge two DataFrames
news_merged = pd.concat([fake_news, true_news], axis=0).reset_index(drop=True)

# Display the first 5 rows of news_merged
news_merged.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [2]:
# Remove unnecessary columns
news = news_merged.drop(['title', 'subject', 'date'], axis=1)

# Remove duplicates
news = news.drop_duplicates()

# Shuffle the DataFrame and reset the index
news = news.sample(frac=1).reset_index(drop=True)

# Check for missing values (NaN values)
news.isna().sum()

text     0
class    0
dtype: int64

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove punctuation with regex
news['text'] = news['text'].str.replace(r'[^\w\s]', '', regex=True)

vectorization = TfidfVectorizer()

# Vectorize the texts of the articles
texts_vectorized = vectorization.fit_transform(news['text'])

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(texts_vectorized, news['class'], test_size=0.25)

lr = LogisticRegression()

# Train the Logistic Regression model
lr.fit(X_train, y_train)

# Make predictions on the test set
pred_lr = lr.predict(X_test)

lr.score(X_test, y_test)

0.9865452287311116

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
dt = DecisionTreeClassifier()

# Train the model
dt.fit(X_train, y_train)

# Make predictions
pred_dt = dt.predict(X_test)

# Evaluate the model (accuracy score)
dt.score(X_test, y_test) # This model performs better

0.9955495756572138

In [7]:
news = """Scientists in Germany have discovered a method to instantly cure cancer using a simple herbal tea blend. This 'miracle tea' has been proven effective in all forms of cancer, and the discovery was reportedly suppressed by pharmaceutical companies.
"""

# Creating a DataFrame with the text
df = pd.DataFrame([news], columns=['text'])

# Removing punctuation
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)

#Vectorizing the text
text = vectorization.transform(df['text'])

# Making predictions
print('Logistic Regression prediction: ' + str(lr.predict(text)))
print('Decision tree prediction: ' + str(dt.predict(text)))

Logistic Regression prediction: [0]
Decision tree prediction: [0]
