In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
# Read data
file = "youtube_shorts_tiktok_trends_2025.csv"
df = pd.read_csv(file)
print(f"shape: {df.shape}")
print(f"Column name: {df.columns.tolist()}")

shape: (48079, 58)
Column name: ['platform', 'country', 'region', 'language', 'category', 'hashtag', 'title_keywords', 'author_handle', 'sound_type', 'music_track', 'week_of_year', 'duration_sec', 'views', 'likes', 'comments', 'shares', 'saves', 'engagement_rate', 'trend_label', 'source_hint', 'notes', 'device_type', 'upload_hour', 'genre', 'trend_duration_days', 'trend_type', 'engagement_velocity', 'dislikes', 'comment_ratio', 'share_rate', 'save_rate', 'like_dislike_ratio', 'publish_dayofweek', 'publish_period', 'event_season', 'tags', 'sample_comments', 'creator_avg_views', 'creator_tier', 'season', 'publish_date_approx', 'year_month', 'title', 'title_length', 'has_emoji', 'avg_watch_time_sec', 'completion_rate', 'device_brand', 'traffic_source', 'is_weekend', 'row_id', 'engagement_total', 'like_rate', 'dislike_rate', 'engagement_per_1k', 'engagement_like_rate', 'engagement_comment_rate', 'engagement_share_rate']


In [17]:
# Check for Missing Values
print("Missing Values:")
print(df.isnull().sum().sort_values(ascending=False))
# Fill in the missing values in the numerical column
numerical_cols = ['views', 'likes', 'comments', 'shares', 'saves', 'engagement_rate', 
                'duration_sec', 'creator_avg_views', 'avg_watch_time_sec', 'completion_rate']
for col in numerical_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())
# Fill in the missing values in the category column
categorical_cols = ['category', 'genre', 'country', 'trend_label', 'device_type']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).fillna('Unknown')
# Fill in the missing values in the text column
text_cols = ['hashtag', 'title_keywords', 'tags']
for col in text_cols:
    if col in df.columns:
        df[col] = df[col].fillna('')

Missing Values:
platform                 0
country                  0
region                   0
language                 0
category                 0
                        ..
engagement_share_rate    0
hashtag_clean            0
title_keywords_clean     0
tags_clean               0
num_hashtags             0
Length: 62, dtype: int64


In [8]:
# Check for duplicate rows
print(f"Number of repeated rows: {df.duplicated().sum()}")
df = df.drop_duplicates()
# Ensure that row_id is unique
print(f"The total number of row_ids: {df['row_id'].nunique()}")

Number of repeated rows: 0
The total number of row_ids: 48079


In [12]:
# Date Conversion
df['publish_date_approx'] = pd.to_datetime(df['publish_date_approx'], errors='coerce')
# Bool Tpye Conversion
df['has_emoji'] = df['has_emoji'].astype(bool)
df['is_weekend'] = df['is_weekend'].astype(bool)
# Category Type Conversion
categorical_columns = ['platform', 'country', 'region', 'language', 'category', 'trend_label', 'trend_type', 'device_type', 'genre', 'creator_tier', 'season', 'traffic_source']
for col in categorical_columns:
    if col in df.columns:
        df[col] = df[col].astype('category')

In [13]:
# Define numerical column
numerical_cols_for_outlier = ['views', 'likes', 'comments', 'shares', 'saves', 'engagement_rate', 'duration_sec', 'creator_avg_views']
# Handling Outliers Using the IQR Method
for col in numerical_cols_for_outlier:
    if col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Tail Shortening Procedure
        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

In [18]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # Remove special characters and retain letters, numbers, and spaces.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove unnecessary spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Clean the text field
df['hashtag_clean'] = df['hashtag'].apply(clean_text)
df['title_keywords_clean'] = df['title_keywords'].apply(clean_text)
df['tags_clean'] = df['tags'].apply(clean_text)
# Number of extracted tags
df['num_hashtags'] = df['hashtag_clean'].apply(lambda x: len(x.split()) if x else 0)
print(f"shape: {df.shape}")

shape: (48079, 62)


In [19]:
# Extracting Features from Dates
df['publish_dayofweek_num'] = df['publish_date_approx'].dt.dayofweek
df['publish_month'] = df['publish_date_approx'].dt.month
df['publish_day'] = df['publish_date_approx'].dt.day
# Create time period features based on 'upload_hour'
def get_time_period(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

df['time_period'] = df['upload_hour'].apply(get_time_period)

In [20]:
# Basic Interaction Characteristics
df['total_engagement'] = df['likes'] + df['comments'] + df['shares'] + df['saves']
df['engagement_per_view'] = df['total_engagement'] / (df['views'] + 1)
df['like_comment_ratio'] = df['likes'] / (df['comments'] + 1)
df['virality_score'] = (df['shares'] + df['saves']) / (df['views'] + 1)
df['completion_ratio'] = df['avg_watch_time_sec'] / (df['duration_sec'] + 1)
# Interaction Quality Indicators
df['engagement_quality'] = (df['like_rate'] + df['comment_ratio'] + df['share_rate']) / 3

In [21]:
# Title Characteristics
df['title_length'] = df['title_keywords_clean'].str.len()
df['title_word_count'] = df['title_keywords_clean'].str.split().str.len()
# Key Feature Words
def extract_keyword_clusters(title):
    title_lower = title.lower()
    clusters = []
    
    if any(word in title_lower for word in ['routine', 'daily', 'morning', 'night']):
        clusters.append('routine')
    if any(word in title_lower for word in ['tips', 'tutorial', 'how to', 'guide']):
        clusters.append('tutorial')
    if any(word in title_lower for word in ['challenge', 'trend', 'dance']):
        clusters.append('challenge')
    if any(word in title_lower for word in ['comedy', 'funny', 'pov', 'relatable']):
        clusters.append('comedy')
    if any(word in title_lower for word in ['makeup', 'grwm', 'skincare', 'beauty']):
        clusters.append('beauty')
    if any(word in title_lower for word in ['gaming', 'fortnite', 'valorant', 'minecraft']):
        clusters.append('gaming')
    if any(word in title_lower for word in ['food', 'recipe', 'cooking', 'street food']):
        clusters.append('food')
    
    return ','.join(clusters) if clusters else 'other'

df['content_cluster'] = df['title_keywords_clean'].apply(extract_keyword_clusters)

In [22]:
# Creator Level Coding
creator_tier_mapping = {'Low': 1, 'Mid': 2, 'High': 3}
df['creator_tier_encoded'] = df['creator_tier'].map(creator_tier_mapping).fillna(1)
# Creator Performance Indicators
df['creator_performance_ratio'] = df['views'] / (df['creator_avg_views'] + 1)
df['creator_consistency'] = df['creator_avg_views'] / (df['views'] + 1)

In [23]:
# Platform Features
df['is_tiktok'] = (df['platform'] == 'TikTok').astype(int)
df['is_youtube'] = (df['platform'] == 'YouTube').astype(int)
# Equipment Type Code
device_type_mapping = {
    'Android': 'mobile', 'iOS': 'mobile', 'iPhone': 'mobile',
    'Samsung': 'mobile', 'Huawei': 'mobile', 'Xiaomi': 'mobile',
    'Pixel': 'mobile', 'Oppo': 'mobile', 'Vivo': 'mobile',
    'Desktop': 'desktop', 'Web': 'desktop', 'Other': 'other'
}
df['device_category'] = df['device_type'].map(device_type_mapping).fillna('other')
# Traffic Source Characteristics
df['is_algorithmic_feed'] = (df['traffic_source'].isin(['ForYou', 'Following'])).astype(int)
df['is_search'] = (df['traffic_source'] == 'Search').astype(int)

In [24]:
# Trend Indicators
df['trend_momentum'] = df['engagement_velocity'] / (df['views'] + 1)
# Released on that day
df['content_freshness'] = (df['trend_duration_days'] == 1).astype(int) 
# Trend Type Coding
trend_type_mapping = {'Short': 1, 'Medium': 2, 'Evergreen': 3}
df['trend_type_encoded'] = df['trend_type'].map(trend_type_mapping).fillna(1)

In [25]:
# Audio Type Coding
sound_type_mapping = {'original': 1, 'trending': 2, 'licensed': 3}
df['sound_type_encoded'] = df['sound_type'].map(sound_type_mapping).fillna(1)
# Extract features from music_track
def extract_music_genre(track):
    track_lower = str(track).lower()
    if 'loop' in track_lower or 'beat' in track_lower:
        return 'electronic'
    elif 'piano' in track_lower or 'calm' in track_lower:
        return 'calm'
    elif 'trap' in track_lower or 'hype' in track_lower:
        return 'energetic'
    elif 'vibe' in track_lower or 'chill' in track_lower:
        return 'chill'
    else:
        return 'other'

df['music_genre'] = df['music_track'].apply(extract_music_genre)


In [26]:
# Definition of Best-Sellers Based on engagement_rate
engagement_threshold = df['engagement_rate'].quantile(0.75)
view_threshold = df['views'].quantile(0.75)
df['is_viral'] = ((df['engagement_rate'] > engagement_threshold) & 
                 (df['views'] > view_threshold)).astype(int)
# Trend Definition Based on trend_label
df['is_trending'] = (df['trend_label'] == 'rising').astype(int)
# Comprehensive Trend Scoring
df['trend_score'] = (
    0.4 * (df['engagement_rate'] / df['engagement_rate'].max()) +
    0.3 * (df['share_rate'] / (df['share_rate'].max() + 1e-6)) +
    0.3 * (df['completion_rate'] / df['completion_rate'].max())
)

print("Distribution of the target variable:")
print(f"The number of popular videos: {df['is_viral'].sum()}")
print(f"The number of trending videos: {df['is_trending'].sum()}")

Distribution of the target variable:
The number of popular videos: 2999
The number of trending videos: 12102


In [27]:
# Low cardinality features are encoded using One-Hot method
low_cardinality_cols = ['platform', 'device_category', 'time_period', 'music_genre']
df_encoded = pd.get_dummies(df, columns=low_cardinality_cols, prefix=low_cardinality_cols)
# Use target encoding or frequency encoding for high-frequency features
high_cardinality_cols = ['country', 'category', 'genre', 'author_handle']

for col in high_cardinality_cols:
    if col in df.columns:
        # Frequency Encoding
        freq_encoding = df[col].value_counts(normalize=True)
        df[f'{col}_freq_encoded'] = df[col].map(freq_encoding)
        # Target encoding (using the mean of the target variable)
        if 'is_viral' in df.columns:
            target_encoding = df.groupby(col)['is_viral'].mean()
            df[f'{col}_target_encoded'] = df[col].map(target_encoding)

In [28]:
# Select the numerical features that need to be standardized
numerical_features_to_scale = [
    'views', 'likes', 'comments', 'shares', 'saves', 
    'duration_sec', 'creator_avg_views', 'title_length',
    'num_hashtags', 'engagement_velocity', 'trend_duration_days'
]
# Filter out the columns that exist in the dataset
existing_numerical_features = [col for col in numerical_features_to_scale if col in df.columns]
# Standardization
scaler = StandardScaler()
df_scaled = df_encoded.copy()
df_scaled[existing_numerical_features] = scaler.fit_transform(df_encoded[existing_numerical_features])

print(f"The shape of the processed dataset: {df_scaled.shape}")

The shape of the processed dataset: (48079, 99)


In [29]:
# Define the final feature set
final_features = [
    # Basic characteristics
    'duration_sec', 'title_length', 'num_hashtags', 'has_emoji',
    
    # Interaction characteristics
    'engagement_rate', 'engagement_per_view', 'virality_score', 
    'completion_ratio', 'engagement_quality',
    
    # Creator characteristics
    'creator_tier_encoded', 'creator_performance_ratio',
    
    # Time characteristics
    'publish_dayofweek_num', 'upload_hour', 'is_weekend',
    
    # Platform characteristics
    'is_tiktok', 'is_youtube', 'is_algorithmic_feed', 'is_search',
    
    # Trend characteristics
    'trend_type_encoded', 'trend_momentum', 'content_freshness',
    
    # Coding characteristics
    'country_freq_encoded', 'category_freq_encoded'
]

# Add the features encoded with One-Hot method
one_hot_features = [col for col in df_scaled.columns if any(prefix in col for prefix in low_cardinality_cols)]
final_features.extend(one_hot_features)

# Select the features that exist in the dataset
available_features = [col for col in final_features if col in df_scaled.columns]

print(f"The number of available features: {len(available_features)}")
print("The first 10 characteristics:", available_features[:10])

The number of available features: 34
The first 10 characteristics: ['duration_sec', 'title_length', 'num_hashtags', 'has_emoji', 'engagement_rate', 'engagement_per_view', 'virality_score', 'completion_ratio', 'engagement_quality', 'creator_tier_encoded']


In [30]:
output_file = 'processed_short_video_trends_2025.csv'
df_scaled[available_features + ['is_viral', 'is_trending', 'trend_score']].to_csv(output_file, index=False)

print(f"The dataset has been saved as: {output_file}")
print(f"Final feature quantity: {len(available_features)}")
print(f"Target variable: is_viral, is_trending, trend_score")

The dataset has been saved as: processed_short_video_trends_2025.csv
Final feature quantity: 34
Target variable: is_viral, is_trending, trend_score


In [34]:
from sklearn.ensemble import RandomForestClassifier
# Analyzing Feature Importance Using Random Forest
if 'is_viral' in df_scaled.columns:
    X = df_scaled[available_features].astype(float).fillna(0)
    y = df_scaled['is_viral']
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 10 important features:")
    print(feature_importance.head(10))

Top 10 important features:
                      feature  importance
10  creator_performance_ratio    0.368054
4             engagement_rate    0.188918
5         engagement_per_view    0.131362
8          engagement_quality    0.113077
6              virality_score    0.067167
19             trend_momentum    0.040703
7            completion_ratio    0.013577
0                duration_sec    0.010492
1                title_length    0.008885
12                upload_hour    0.007561


In [35]:
import pandas as pd
df.to_csv('C:/Users/hp/Desktop/.Data Prepocessing.csv', index=False) 