In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'fake-news-detection-datasets:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2712039%2F4679796%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240821%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240821T115455Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D55271f02062055594cd12c1a7a7b7c81e8edea11b1313e295b40e7af109a20f164ae8e27f7b9cae68c56c075c76e22cb4051a1654bad972928fa99ed252fddb99457a219b1d667d62443fa43efa5a9452eefbaf862a91bf746b7679b48a2fa7adc732b7956786055865fd554725eb0ad224ace7f1ab1432d2743838afca7f949f84638da9185beef377cfda4ed8afe54f3355c5f23180ab50506a57b8c59cdf24868c62716cf76fe26ed5d55959937ffd6b3f5b16b8600b27fa4cebd114dd4a26e51ea7aec403916b0f669fdedcaf7b9caa8b596817dd86fd65e934f385997d95e9223e3a6746ebc6f163c6313e6af85804a89fd0597c8f8d39fa6673bc0b946'

# Directory paths that mirror Kaggle's layout for datasets and outputs.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: try to unmount it (errors are discarded),
# remove whatever is left, then (re)create the input and working directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working relative to the current
# directory; a link that already exists is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory>
# and unpack it there. Entries in DATA_SOURCE_MAPPING are comma-separated
# "<directory>:<percent-encoded URL>" pairs.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the bar
            # stays empty and only the running byte count is informative.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for later iterations or cells.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is an OSError subclass, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import regex as re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC


import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, name) for name in filenames]
    if file_paths:
        print('\n'.join(file_paths))


In [None]:
# Load the two source files (note the space inside the "News _dataset" path segment).
real, fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/fake-news-detection-datasets/News _dataset/True.csv",
        "/kaggle/input/fake-news-detection-datasets/News _dataset/Fake.csv",
    )
)

In [None]:
# Label the two sources: 1 for rows from True.csv, 0 for rows from Fake.csv.
real['target'] = 1
fake['target'] = 0

# Stack both frames, then shuffle the rows with a fixed seed.
data = (
    pd.concat([real, fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Quick overview: first rows, shape, dtypes and missing values per column.
divider = '-' * 40

display(data.head())
print(divider)

print('shape', data.shape)
print(divider)

display(data.dtypes)
print(divider)

display(data.isna().sum())


In [None]:
# Relative frequency of each class label.
data['target'].value_counts(normalize=True)


In [None]:
import seaborn as sns  # Seaborn library, used for data visualisation

def create_distribution(dataFile):
    """Draw a count plot of the ``target`` column of ``dataFile``.

    A new 5x5 inch figure is opened first; the value returned by
    ``sns.countplot`` is passed back to the caller.
    """
    plt.figure(figsize=(5, 5))
    target_plot = sns.countplot(data=dataFile, x='target', palette='Blues_d')
    return target_plot
create_distribution(data)



In [None]:

# Seaborn look used for the charts that follow.
sns.set_palette("crest")
sns.set_style("whitegrid")

# Share of articles per subject, as a pie chart.
subject_counts = data['subject'].value_counts()
plt.figure(figsize=(6, 6))
pie_ax = subject_counts.plot.pie(autopct='%1.1f%%')
pie_ax.set_title('Percentage of Our Subjects')
pie_ax.set_ylabel(None)
plt.show()


In [None]:
# def encode_subject(label):
#     if label  in ["politicsNews",'politics' ,'Government News','left-news']:
#         return "politics"
#     elif label  in ['worldnews' ,'News']:
#         return "world news"
#     else:
#         return "US_News"

# data["subject"]=data["subject"].apply(encode_subject)
# Fold the two politics spellings into a single 'PoliticsNews' category.
subject_aliases = {'politics': 'PoliticsNews', 'politicsNews': 'PoliticsNews'}
data['subject'] = data['subject'].replace(subject_aliases)


In [None]:
# Same pie chart as before, redrawn after the subject labels were merged.
sns.set_palette("crest")
sns.set_style("whitegrid")

merged_subject_counts = data['subject'].value_counts()
plt.figure(figsize=(6, 6))
merged_pie_ax = merged_subject_counts.plot.pie(autopct='%1.1f%%')
merged_pie_ax.set_title('Percentage of Our Subjects')
merged_pie_ax.set_ylabel(None)
plt.show()


***

#### Which subjects contain the most fake news (**rs1**), and which contain the most real news (**rs2**)?


In [None]:
# Number of fake articles (target == 0) per subject, largest first.
rs1 = (
    data.loc[data['target'] == 0]
    .groupby(['subject'], as_index=False)
    .size()
    .rename(columns={'size': 'count'})
    .sort_values(by='count', ascending=False)
)
print(rs1)

In [None]:
# Number of real articles (target == 1) per subject, largest first.
rs2 = (
    data.loc[data['target'] == 1]
    .groupby(['subject'], as_index=False)
    .size()
    .rename(columns={'size': 'count'})
    .sort_values(by='count', ascending=False)
)
print(rs2)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, axs = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: fake news per subject; right panel: real news per subject.
panels = [
    (rs1, 'Distribution of Fake News'),
    (rs2, 'Distribution of Real News'),
]
for panel_ax, (subject_counts, panel_title) in zip(axs, panels):
    sns.barplot(ax=panel_ax, x='count', y='subject', data=subject_counts, palette='viridis')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Count')
    panel_ax.set_ylabel('Subject')

# Adjust layout
plt.tight_layout()
plt.show()


In [None]:
# Parse the date strings; values that cannot be parsed become NaT (errors='coerce').
parsed_dates = pd.to_datetime(data['date'], format='mixed', dayfirst=True, errors='coerce')
data['date'] = parsed_dates

# Calendar features derived from the parsed dates.
data['Year'] = parsed_dates.dt.year
data['Month'] = parsed_dates.dt.month_name()

In [None]:
# Articles per year; bars follow the order returned by value_counts().
year_order = data['Year'].value_counts().index
plt.figure(figsize=(12, 6))
year_ax = sns.countplot(data=data, x='Year', order=year_order, palette='crest')
year_ax.set_title('Distribution of Years')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Articles per month name; bars follow the order returned by value_counts().
month_order = data['Month'].value_counts().index
plt.figure(figsize=(12, 6))
month_ax = sns.countplot(data=data, x='Month', order=month_order, palette='crest')
month_ax.set_title('Distribution of Months')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Articles per subject; bars follow the order returned by value_counts().
subject_order = data['subject'].value_counts().index
plt.figure(figsize=(12, 6))
subject_ax = sns.countplot(data=data, x='subject', order=subject_order, palette='crest')
subject_ax.set_title('Distribution of Subject')
subject_ax.set_xlabel('Subject')
subject_ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
import nltk
def count_words(column):
    """Return how many runs of word characters the string ``column`` contains."""
    word_tokens = nltk.RegexpTokenizer(r"\w+").tokenize(column)
    return len(word_tokens)

# Word counts for title and body, kept next to the class label for plotting.
df = pd.DataFrame({
    "n_words_in_title": data["title"].apply(count_words),
    "n_words_in_text": data["text"].apply(count_words),
    "target": data["target"],
})

In [None]:
# Title length per class; outlier points are hidden (showfliers=False).
fig = plt.figure(figsize=(6, 5))
# fig.patch.set_alpha(0.7)

plt.title("Number of words in the title.", size=18)
title_box_style = dict(showfliers=False, width=0.4, color="#0047AB")
sns.boxplot(data=df, x="target", y="n_words_in_title", **title_box_style)

In [None]:
# Body-text length per class; outlier points are hidden (showfliers=False).
fig = plt.figure(figsize=(6, 5))
# fig.patch.set_alpha(0.7)

plt.title("Number of words in the text.", size=18)
text_box_style = dict(showfliers=False, width=0.4, color="#0047AB")
sns.boxplot(data=df, x="target", y="n_words_in_text", **text_box_style)

In [None]:
# Model input, first variant: title followed by subject, separated by a space.
# NOTE(review): 'subject' becomes part of the features here. If the subject
# values differ between real and fake rows (compare rs1 and rs2), this may
# leak the label into the model input — confirm before trusting the accuracy.
data['final'] = data['title'].str.cat(data['subject'], sep=" ")


In [None]:
import re  # Import the regular expressions module
import string  # Import the string module containing punctuation

def wordopt(text):
    """Normalise a raw news string into lowercase, mostly alphabetic text.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop HTML
    tags; turn every remaining non-word character into a space; strip any
    leftover punctuation (in practice only underscores); drop tokens that
    contain a digit.

    URL and tag removal must run before the non-word substitution: their
    patterns depend on characters such as ``:``, ``/``, ``.``, ``<`` and
    ``>``, which that substitution would otherwise turn into spaces first,
    leaving fragments like "https" or "www" behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans
        can leave consecutive spaces.
    """
    # Convert the entire text to lowercase
    text = text.lower()

    # Remove text within square brackets, including the brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Remove URLs while their punctuation is still intact
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags while the angle brackets are still intact
    text = re.sub(r'<.*?>', '', text)

    # Replace every non-word character (not a letter, digit or underscore) with a space
    text = re.sub(r'\W', ' ', text)

    # Remove remaining punctuation (only underscores survive the step above)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing digits
    text = re.sub(r'\w*\d\w*', '', text)

    return text  # Return the preprocessed text


In [None]:
# Clean the title+subject strings in place, then show the resulting column.
data['final'] = data['final'].map(wordopt)
data['final']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Hold out 30% of the rows for testing. The split is seeded (same seed as the
# shuffle above) so that Restart & Run All reproduces the same accuracy.
X_train,X_test,y_train,y_test = train_test_split(data['final'],data['target'],test_size=0.3,random_state=42)
# cv = CountVectorizer(min_df=0,max_df=1,ngram_range=(1,2))

# cv_train = cv.fit_transform(X_train)
# cv_test = cv.transform(X_test)

# Fit the TF-IDF vocabulary on the training split only, then reuse it for the test split.
vectorization = TfidfVectorizer()
cv_train = vectorization.fit_transform(X_train)
cv_test = vectorization.transform(X_test)

print('Train shape: ',cv_train.shape)
print('Test shape: ',cv_test.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Fit multinomial Naive Bayes on the TF-IDF features and score it on the held-out split.
nb = MultinomialNB().fit(cv_train, y_train)
pred_nb = nb.predict(cv_test)
score = accuracy_score(y_true=y_test, y_pred=pred_nb)
print("Accuracy Score: ",score)


***

In [None]:
# Model input, second variant: body text, title and subject joined by spaces,
# then cleaned with wordopt.
raw_final2 = data['text'] + " " + data['title'] + " " + data['subject']
data['final2'] = raw_final2.apply(wordopt)
data['final2'].head(3)


In [None]:
# Hold out 20% of the rows for testing. The split is seeded (same seed as the
# shuffle above) so that the reported accuracy is reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(data['final2'],data['target'],test_size=0.2,random_state=42)
# cv = CountVectorizer(min_df=0,max_df=1,ngram_range=(1,2))

# Fit the TF-IDF vocabulary on the training split only, then reuse it for the test split.
vectorization = TfidfVectorizer()

cv_train = vectorization.fit_transform(X_train)
cv_test = vectorization.transform(X_test)

print('Train shape: ',cv_train.shape)
print('Test shape: ',cv_test.shape)

# Multinomial Naive Bayes on the text+title+subject features.
nb = MultinomialNB()
nb.fit(cv_train, y_train)

pred_nb = nb.predict(cv_test)
score = accuracy_score(y_test, pred_nb)
print("Accuracy Score: ",score)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
# Accuracy of each model, in the order the models are evaluated below.
listt=[]

# Splitting the data. The split is seeded (same seed as the shuffle above) so
# that every run compares the models on the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(data['final2'], data['target'], test_size=0.2, random_state=42)

# Vectorize the data: vocabulary is learned on the training split only.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Creating a combined pipeline with model training
pipeline_combined = Pipeline([
    ('model', LogisticRegression())  # Default model, will be replaced later
])

# Candidate models. The tree-based estimators are randomised, so they are
# seeded as well; otherwise their accuracy changes from run to run.
candidate_models = [
    ('logistic', LogisticRegression()),
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('random_forest', RandomForestClassifier(random_state=42)),
]

# Fit and evaluate each model in the combined pipeline
for model_name, model in candidate_models:
    pipeline_combined.set_params(model=model)  # Set the current model in the pipeline
    pipeline_combined.fit(X_train_vec, y_train)  # Fit the pipeline with vectorized data
    y_pred = pipeline_combined.predict(X_test_vec)  # Predict with the current model using vectorized test data
    accuracy = accuracy_score(y_test, y_pred)
    listt.append(accuracy)
    print(f"{model_name.capitalize().replace('_', ' ')} Accuracy:", accuracy)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Labels must stay in the same order in which the accuracies were appended to listt.
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']
accuracies = listt

# Create a bar chart
plt.figure(figsize=(8, 5))
accuracy_ax = sns.barplot(x=model_names, y=accuracies)
accuracy_ax.set_xlabel('Model')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Accuracy Comparison of Different Models')
accuracy_ax.set_ylim(0, 1)
plt.show()
