In [3]:
import re
import spacy
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the spaCy model only ONCE at import time (not inside the function!).
# Downstream code reads only token lemmas and stop-word flags, so the
# dependency parser and named-entity recognizer are switched off: they are
# the components whose annotations are never used, and skipping them makes
# every document cheaper to process.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Preprocessing function (for FunctionTransformer)
# Compiled once at import time: matches every character that is not a
# lowercase ASCII letter or whitespace.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")


def preprocessing(texts):
    """Clean and lemmatize a batch of raw text documents.

    Each document is lowercased, stripped of every character that is not a
    lowercase ASCII letter or whitespace, run through the module-level spaCy
    pipeline ``nlp``, and rebuilt from the lemmas of its non-stop-word
    tokens joined by single spaces.

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series).

    Returns:
        A list of cleaned strings, one per input document, in input order.
        An empty input yields an empty list.

    Raises:
        AttributeError: If an element is not a string (it has no ``lower``).
    """
    # Generator: documents are normalized lazily as nlp.pipe consumes them.
    normalized = (_NON_LETTER_RE.sub("", text.lower()) for text in texts)

    # nlp.pipe processes the stream in batches, which is much cheaper than
    # calling nlp(text) once per document; results come back in input order.
    return [
        " ".join(token.lemma_ for token in doc if not token.is_stop)
        for doc in nlp.pipe(normalized)
    ]

# Expose the cleaning function as a scikit-learn transformer so it can be
# used as a pipeline step; input validation is turned off.
transform = FunctionTransformer(func=preprocessing, validate=False)

# Load the dataset. read_csv already returns a DataFrame, so it is not
# wrapped in pd.DataFrame again.
data = pd.read_csv("news.csv")

# "Unnamed: 0" is the name pandas assigns to a header-less first column
# (typically a saved index). Drop it when present, but do not fail on a
# CSV that was written without it.
df = data.drop(columns=["Unnamed: 0"], errors="ignore")

# A missing title or body loads as NaN, and NaN + str stays NaN, which would
# later crash on text.lower() inside preprocessing. Blank out missing values
# first so those rows still contribute whatever text they do have.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

# Features are the combined title + body text; targets are the labels.
X = df["content"]
y = df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assemble the model: text cleaning -> TF-IDF features -> logistic regression
model = Pipeline(
    steps=[
        ("preprocessing", transform),
        ("vectorizer", TfidfVectorizer(stop_words="english")),
        ("clf", LogisticRegression(max_iter=200)),
    ]
)

# Fit the whole pipeline on the training portion
model.fit(x_train, y_train)

# Score both splits (mean accuracy), then report them
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Train Accuracy: 0.9536306235201263
Test Accuracy: 0.9116022099447514
