## 1. Imports
All necessary libraries are imported here.

In [35]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
import warnings

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 50

## 2. Data Loading
Load the dataset from the SQLite database.

In [36]:
with sqlite3.connect('tiktok.db') as conn:
    df = pd.read_sql_query("SELECT * FROM tiktok_data", conn)

print("Data loaded successfully. First 5 rows:")
df.head()

Data loaded successfully. First 5 rows:


Unnamed: 0,Rank,Username,Country,Followers,Views,Likes,Engagement,Brand Account,Gender,Age,Ethnicity,Famous,Genre,LGBTQ
0,1,@charlidamelio,U.S.A,78.9m,38.3m,6.1m,16.60%,0,Female,16.0,White,0.0,"Dancing, Lipsyncing, Lifestyle",0.0
1,2,@addisonre,U.S.A,53.7m,13.8m,2.8m,20.80%,0,Female,19.0,White,0.0,"Dancing, Lipsyncing",0.0
2,3,@zachking,U.S.A,47.3m,17.6m,2.8m,16%,0,Male,30.0,East Asian,1.0,"Comedy, Illusion",0.0
3,4,@lorengray,U.S.A,46.3m,3.2m,622.6k,19.70%,0,Female,18.0,White,1.0,"Dancing, Lipsyncing",0.0
4,5,@tiktok,U.S.A,45.3m,9.4m,503.0k,5.60%,1,,,,1.0,,


## 3. Data Cleaning
This cell handles all data cleaning tasks:
1. Defines a helper function to convert string numbers (e.g., '1.2m') to numeric.
2. Standardizes column names.
3. Converts numeric-like columns from text to actual numbers. This is done safely to prevent errors on re-runs.


In [None]:
def convert_to_number(x):
    if isinstance(x, str):
        x = x.lower().replace(',', '')
        if 'k' in x:
            return float(x.replace('k', '')) * 1_000
        elif 'm' in x:
            return float(x.replace('m', '')) * 1_000_000
    return float(x)

df.columns = df.columns.str.lower().str.strip()


for col in ['likes', 'followers', 'views']:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(convert_to_number)

if df['engagement'].dtype == 'object':
    df['engagement'] = df['engagement'].str.replace('%', '').astype(float) / 100.0

print("Initial data cleaning and type conversion complete.")
df.info()

Initial data cleaning and type conversion complete.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rank           256 non-null    int64  
 1   username       256 non-null    object 
 2   country        256 non-null    object 
 3   followers      256 non-null    float64
 4   views          256 non-null    float64
 5   likes          256 non-null    float64
 6   engagement     256 non-null    float64
 7   brand account  256 non-null    int64  
 8   gender         232 non-null    object 
 9   age            225 non-null    float64
 10  ethnicity      242 non-null    object 
 11  famous         255 non-null    float64
 12  genre          254 non-null    object 
 13  lgbtq          239 non-null    float64
dtypes: float64(7), int64(2), object(5)
memory usage: 28.1+ KB


## 4. Feature Engineering & Data Splitting & Imputation
This cell prepares the data for modeling:
1. Uses `KNNImputer` to intelligently fill missing numeric values (like `age`).
2. Uses `SimpleImputer` to fill missing categorical values with the most frequent one.
3. Creates the `age_group` feature.
4. Defines the feature matrix `X` and target vector `y`.
5. Performs advanced one-hot encoding for the multi-label `genre` column.
6. Performs standard one-hot encoding for all other categorical columns.
7. Splits the final dataset into training and testing sets.

In [39]:
x = df.drop('likes', axis=1)
y = df['likes']

num_cols = x.select_dtypes(include=np.number).columns
cat_cols = x.select_dtypes(include=['object']).drop('username', axis=1).columns

knn_imputer = KNNImputer(n_neighbors=5)
df[num_cols] = pd.DataFrame(knn_imputer.fit_transform(df[num_cols]), columns=num_cols)

cat_imputer = SimpleImputer(strategy='most_frequent')
x[cat_cols] = cat_imputer.fit_transform(x[cat_cols])

print("Imputation complete on the feature set (X).")
# 1. Feature Engineering
bins = [0, 18, 40, 60, 100]
labels = ['young', 'adult', 'middle-aged', 'senior']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# 2. Define Features (X) and Target (y)
X = df.drop(['likes', 'username'], axis=1)
y = df['likes']

# 3. Advanced Categorical Encoding
if 'genre' in X.columns:
    genres = X['genre'].str.get_dummies(sep=',')
    X = X.drop('genre', axis=1)
    X = pd.concat([X, genres], axis=1)

X_encoded = pd.get_dummies(X, drop_first=True)

# 4. Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print("Feature engineering and splitting complete.")
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Imputation complete on the feature set (X).
Feature engineering and splitting complete.
Training data shape: (204, 93)
Test data shape: (52, 93)


## 5. Model Training and Evaluation
We train our best-performing model (default XGBoost) and evaluate its performance.

In [None]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_xgb = xgb_model.predict(X_test)

print("--- XGBoost Model Performance ---")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred_xgb):,.2f}")
print(f"R2 score: {r2_score(y_test, y_pred_xgb) * 100:.2f}%")

--- XGBoost Model Performance ---
Mean Absolute Error: 77,474.48
R2 score: 94.50%


In [40]:
from sklearn.model_selection import cross_val_score
import numpy as np

xgb_model_for_cv = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

cv_scores = cross_val_score(xgb_model_for_cv, X_encoded, y, cv=10, scoring='r2')
cv_scores_percent = cv_scores * 100

print(f"All 10 cross-vlidation R2 scores (%):\\n {np.round(cv_scores_percent, 2)}")
print("---")
print(f"Average R2 score: {cv_scores_percent.mean():.2f}%")
print(f"Standard deviation : {cv_scores_percent.std():.2f}%")

All 10 cross-vlidation R2 scores (%):\n [79.6  96.04 78.93 99.46 96.07 98.71 98.21 90.98 75.1  99.51]
---
Average R2 score: 91.26%
Standard deviation : 9.13%
