<a href="https://colab.research.google.com/github/issaWarasna/Challenges-/blob/main/Assigment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import standard Python and NLP libraries

import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download required NLTK data for stopwords and tokenization

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Load the fake news dataset

fake_dataset = pd.read_csv('Fake.csv')
fake_dataset.head() # to show the few rows of dataset


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [stemmer.stem(word) for word in words]  # Apply stemming
    return ' '.join(words)


In [5]:
fake_dataset['clean_text'] = fake_dataset['text'].apply(clean_text)
fake_dataset[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,Donald Trump just couldn t wish all Americans ...,donald trump wish american happi new year leav...
1,House Intelligence Committee Chairman Devin Nu...,hous intellig committe chairman devin nune go ...
2,"On Friday, it was revealed that former Milwauk...",friday reveal former milwauke sheriff david cl...
3,"On Christmas day, Donald Trump announced that ...",christma day donald trump announc would back w...
4,Pope Francis used his annual Christmas Day mes...,pope franci use annual christma day messag reb...


In [7]:
nltk.download('punkt_tab')
fake_dataset['tokens'] = fake_dataset['clean_text'].apply(word_tokenize)  # Tokenize the cleaned text

fake_dataset['word_count'] = fake_dataset['tokens'].apply(len)  # Count the number of words in each document

avg_word_count = fake_dataset['word_count'].mean()   # Calculate average number of words per document


print(f"Average number of words per document: {avg_word_count:.2f}")
fake_dataset[['clean_text', 'tokens', 'word_count']].head()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Average number of words per document: 235.41


Unnamed: 0,clean_text,tokens,word_count
0,donald trump wish american happi new year leav...,"[donald, trump, wish, american, happi, new, ye...",287
1,hous intellig committe chairman devin nune go ...,"[hous, intellig, committe, chairman, devin, nu...",179
2,friday reveal former milwauke sheriff david cl...,"[friday, reveal, former, milwauke, sheriff, da...",339
3,christma day donald trump announc would back w...,"[christma, day, donald, trump, announc, would,...",265
4,pope franci use annual christma day messag reb...,"[pope, franci, use, annual, christma, day, mes...",211


In [8]:
# Initialize and apply CountVectorizer

count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(fake_dataset['clean_text'])

# Initialize and apply TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(fake_dataset['clean_text'])

# Show the shape (documents, features) of both matrices

print("CountVectorizer matrix shape:", count_matrix.shape)
print("TfidfVectorizer matrix shape:", tfidf_matrix.shape)

print("\nFirst 10 features from CountVectorizer:")
print(count_vectorizer.get_feature_names_out()[:10])

print("\nFirst 10 features from TfidfVectorizer:")
print(tfidf_vectorizer.get_feature_names_out()[:10])


CountVectorizer matrix shape: (23481, 163522)
TfidfVectorizer matrix shape: (23481, 163522)

First 10 features from CountVectorizer:
['00' '000' '0000' '000000017' '000048' '00007' '0006' '00075' '00076'
 '0009']

First 10 features from TfidfVectorizer:
['00' '000' '0000' '000000017' '000048' '00007' '0006' '00075' '00076'
 '0009']
