In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tiktok/tiktok_dataset.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

df = pd.read_csv('/kaggle/input/tiktok/tiktok_dataset.csv')
df = df.dropna(subset=[
    'video_view_count',
    'video_transcription_text',
    'video_duration_sec',
    'verified_status',
    'author_ban_status'
])

df['viral'] = (df['video_view_count'] >= 100000).astype(int)
df['video_transcription_text'] = df['video_transcription_text'].str.slice(0, 500)

df = df.sample(n=1000, random_state=42)

X = df[['video_transcription_text', 'video_duration_sec', 'verified_status', 'author_ban_status']]
y = df['viral']

preprocessor = ColumnTransformer([
    ('caption', TfidfVectorizer(max_features=300, stop_words='english'), 'video_transcription_text'),
    ('categoricals', OneHotEncoder(handle_unknown='ignore'), ['verified_status', 'author_ban_status']),
    ('numerics', 'passthrough', ['video_duration_sec'])
])

model = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
#print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96       112
           1       0.91      1.00      0.95        88

    accuracy                           0.95       200
   macro avg       0.95      0.96      0.95       200
weighted avg       0.96      0.95      0.96       200



In [3]:
text_input = input("Enter the video transcription: ")
duration_input = float(input("Enter the video duration in seconds: "))
verified_status = input("Is the author verified? (yes/no): ")
author_ban_status = input("Is the author banned? (yes/no): ")

sample_df = pd.DataFrame({
    'video_transcription_text': [text_input],
    'video_duration_sec': [duration_input],
    'verified_status': [verified_status],
    'author_ban_status': [author_ban_status]
})

pred_prob = model.predict_proba(sample_df)[0][1]
print(f"\nEstimated virality score: {pred_prob * 100:.2f}%")
if pred_prob >= 0.5:
    print("This video is likely to go viral!")
else:
    print("This video might not go viral.")

Enter the video transcription:  Here’s a 30-second productivity trick that actually works. Set a timer and give yourself just half a minute to start the task you've been putting off. It’s not about finishing — it’s about starting. That tiny momentum will often carry you forward way longer than you expect.
Enter the video duration in seconds:  28
Is the author verified? (yes/no):  yes
Is the author banned? (yes/no):  no



Estimated virality score: 20.00%
This video might not go viral.
