# Project | Natural Language Processing
### News Headline: Real or Fake News

### Prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
# pip install pandas scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os
from sklearn.metrics import f1_score, confusion_matrix, classification_report

- Read Data
- Reduce the training set to speed up development.

In [None]:
# Destination CSV for the validation set once its labels have been predicted.
OUTPUT_PATH = "dataset/predicted_validation_data_from_rev2.csv"

In [None]:
## Read training data
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the notebook runs on other machines.
data = pd.read_csv(r"C:\Users\happy\Documents\ironhack\Week9\Group\project-nlp-challenge\dataset\data.csv")
print(data.shape)

# Reduce the training set to speed up development.
# Modify for final system
# fillna() is used WITHOUT inplace=True so it returns a new frame: filling the
# result of head() in place triggers pandas' SettingWithCopyWarning and,
# depending on the pandas version, may also write the "" values through to
# `data` itself.
data_1000 = data.head(1000).fillna("")
data_1000

In [None]:
## Read validation data
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the notebook runs on other machines.
val_data = pd.read_csv(
    filepath_or_buffer=r"C:\Users\happy\Documents\ironhack\Week9\Group\project-nlp-challenge\dataset\validation_data.csv"
)
# Preview the first ten rows.
val_data.head(n=10)

## Data Preprocessing

Use only title and label

In [None]:
# Keep only the label and title columns, and discard every row in which
# either of the two is missing.
df = data.loc[:, ['label', 'title']].dropna(axis=0, how='any')
df

Split train/test

In [None]:
# Titles are the features, labels the target. 20% of the rows are held out
# for testing; the fixed random_state keeps the split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['title'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# TF-IDF features for the titles: English stop words are removed and terms
# that occur in more than 70% of the training titles are ignored (max_df=0.7).
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# The vocabulary and IDF weights are learned from the training titles only;
# the test titles are merely transformed with the already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Report the shape (documents x vocabulary terms) of the vectorized data.
print(f"Train TF-IDF shape: {X_train_tfidf.shape}")
print(f"Test TF-IDF shape: {X_test_tfidf.shape}")

Model training (Logistic Regression)

In [None]:
# Logistic-regression classifier with scikit-learn's default settings,
# fitted on the TF-IDF matrix of the training titles.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

Evaluation

In [None]:
# Predict a label for every held-out test title, then report the fraction
# of those predictions that match the true labels.
y_pred = model.predict(X=X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# F1 Score

In [None]:
# F1 score of the test-set predictions, using f1_score's default settings.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted test labels, followed by the
# per-class text report produced by classification_report.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Preprocess validation data

In [None]:
# Titles of the validation set; a missing title becomes an empty string so
# that no NaN value is passed on to the vectorizer.
val_titles = val_data['title'].fillna(value="")
val_titles

Transform titles using the same vectorizer

In [None]:
# Reuse the vectorizer fitted on the training titles (transform only, no
# re-fitting) so the validation features share the training vocabulary.
val_tfidf = vectorizer.transform(raw_documents=val_titles)

Predict labels

In [None]:
# One predicted label per validation title, from the trained classifier.
predicted_labels = model.predict(X=val_tfidf)

Replace label column

In [None]:
# Store the model's predictions in the 'label' column of the validation
# frame (replacing that column if it already exists), then display the frame.
val_data['label'] = predicted_labels
val_data

# Save the modified validation file

In [None]:
# Create the destination folder first if it is missing: to_csv() does not
# create directories and would otherwise fail only after all the work above
# has been done. The guard skips this when OUTPUT_PATH has no directory
# part, because os.makedirs("") raises.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the written file.
val_data.to_csv(OUTPUT_PATH, index=False)

print(f"Predicted labels saved to {OUTPUT_PATH}")