<a href="https://colab.research.google.com/github/jayavarthana/Cracking-the-Market-Code-with-AI-Driven-Stock-Price-Prediction-Using-Time-Series-Analysis-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
import re
import string

nltk.download('stopwords')
from nltk.corpus import stopwords

# 1. Load the dataset
df = pd.read_csv('fake_or_real_news.csv')  # Make sure to use your dataset path
print("Dataset loaded. Shape:", df.shape)
print(df.head())

# 2. Preprocess the text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)  # Remove links
    text = re.sub(r'@\w+', '', text)          # Remove mentions
    text = re.sub(r'#\w+', '', text)          # Remove hashtags
    text = re.sub(r'\d+', '', text)           # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['text'] = df['text'].astype(str).apply(clean_text)

# 3. Split the dataset
X = df['text']
y = df['label']  # Ensure 'label' column contains 'FAKE' and 'REAL'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# 4. TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)

# 5. Model training
model = PassiveAggressiveClassifier(max_iter=50)
model.fit(tfidf_train, y_train)

# 6. Predictions and evaluation
y_pred = model.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy: {score*100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)

# 7. Test on custom news
def predict_news(news_text):
    cleaned = clean_text(news_text)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)
    return prediction[0]

# Example
sample_news = "The government just announced a new policy to support farmers."
print("\nPrediction for custom news:", predict_news(sample_news))


Dataset loaded. Shape: (6, 3)
                                               title  \
0  Donald Trump Sends Out Embarrassing New Year’s...   
1  Watch The Exact Moment Paul Ryan Commits Caree...   
2               NASA Finds Evidence of Water on Mars   
3  Apple unveils new iPhone 15 with cutting-edge ...   
4  BREAKING: Hillary Clinton Caught Using Illegal...   

                                                text label  
0  President Donald Trump sent out a Happy New Ye...  FAKE  
1  Speaker Paul Ryan just committed career suicid...  FAKE  
2  NASA has found the strongest evidence yet that...  REAL  
3  Apple announced its newest iPhone with upgrade...  REAL  
4  Hillary Clinton is under investigation again a...  FAKE  

Accuracy: 0.00%
Confusion Matrix:
[[0 0]
 [2 0]]

Prediction for custom news: FAKE


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
