<a href="https://colab.research.google.com/github/jatinmeenaa/clickbait_detector/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install core libraries
!pip install pandas numpy scikit-learn nltk joblib matplotlib seaborn



In [5]:
import pandas, numpy, sklearn, nltk, joblib, matplotlib, seaborn
print("✅ All libraries imported successfully!")


✅ All libraries imported successfully!


In [7]:
import pandas as pd
df = pd.read_csv('/content/clickbait_data.csv')
print(df.shape)
print(df.columns)
df.head(3)


(32000, 2)
Index(['headline', 'clickbait'], dtype='object')


Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1


In [8]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+', '', text)              # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)             # remove special chars, numbers
    tokens = [word for word in text.split() if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# apply preprocessing
df['clean_text'] = df['headline'].apply(preprocess)
df[['headline', 'clean_text']].head(10)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,headline,clean_text
0,Should I Get Bings,get bings
1,Which TV Female Friend Group Do You Belong In,tv female friend group belong
2,"The New ""Star Wars: The Force Awakens"" Trailer...",new star war force awakens trailer give chill
3,"This Vine Of New York On ""Celebrity Big Brothe...",vine new york celebrity big brother fucking pe...
4,A Couple Did A Stunning Photo Shoot With Their...,couple stunning photo shoot baby learning inop...
5,How To Flirt With Queer Girls Without Making A...,flirt queer girl without making total fool
6,32 Cute Things To Distract From Your Awkward T...,cute thing distract awkward thanksgiving
7,If Disney Princesses Were From Florida,disney princess florida
8,What's A Quote Or Lyric That Best Describes Yo...,whats quote lyric best describes depression
9,Natalie Dormer And Sam Claflin Play A Game To ...,natalie dormer sam claflin play game see theyd...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = tfidf.fit_transform(df['clean_text'])
y = df['clickbait']

print("Feature matrix shape:", X.shape)


Feature matrix shape: (32000, 5000)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=300)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9490625
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      3127
           1       0.96      0.94      0.95      3273

    accuracy                           0.95      6400
   macro avg       0.95      0.95      0.95      6400
weighted avg       0.95      0.95      0.95      6400



In [11]:
import joblib

joblib.dump(model, 'clickbait_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!


In [12]:
def predict_headline(headline):
    text = preprocess(headline)
    vec = tfidf.transform([text])
    pred = model.predict(vec)[0]
    return "Clickbait" if pred == 1 else "Not Clickbait"

samples = [
    "You won't believe what happened next!",
    "Government announces new education policy for rural areas",
    "10 tricks to lose weight while you sleep",
]

for s in samples:
    print(s, "→", predict_headline(s))


You won't believe what happened next! → Clickbait
Government announces new education policy for rural areas → Not Clickbait
10 tricks to lose weight while you sleep → Clickbait
