In [1]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:


with sqlite3.connect('tiktok.db') as conn:
    df = pd.read_sql_query("SELECT * FROM tiktok_data", conn)
    
print(df.head())

   Rank        Username Country Followers  Views   Likes Engagement  \
0     1  @charlidamelio   U.S.A     78.9m  38.3m    6.1m     16.60%   
1     2      @addisonre   U.S.A     53.7m  13.8m    2.8m     20.80%   
2     3       @zachking   U.S.A     47.3m  17.6m    2.8m        16%   
3     4      @lorengray   U.S.A     46.3m   3.2m  622.6k     19.70%   
4     5         @tiktok   U.S.A     45.3m   9.4m  503.0k      5.60%   

   Brand Account  Gender   Age   Ethnicity  Famous  \
0              0  Female  16.0       White     0.0   
1              0  Female  19.0       White     0.0   
2              0    Male  30.0  East Asian     1.0   
3              0  Female  18.0       White     1.0   
4              1    None   NaN        None     1.0   

                            Genre  LGBTQ  
0  Dancing, Lipsyncing, Lifestyle    0.0  
1             Dancing, Lipsyncing    0.0  
2                Comedy, Illusion    0.0  
3             Dancing, Lipsyncing    0.0  
4                            None

In [3]:
print(df.columns)

Index(['Rank', 'Username', 'Country', 'Followers', 'Views', 'Likes',
       'Engagement', 'Brand Account', 'Gender', 'Age', 'Ethnicity', 'Famous',
       'Genre', 'LGBTQ'],
      dtype='object')


In [4]:
df.columns = df.columns.str.lower().str.strip()
print(df.columns)

Index(['rank', 'username', 'country', 'followers', 'views', 'likes',
       'engagement', 'brand account', 'gender', 'age', 'ethnicity', 'famous',
       'genre', 'lgbtq'],
      dtype='object')


In [49]:
num_cols = df.select_dtypes(include=['int64', 'float']).columns
print(df[num_cols].skew())

rank             0.000000
followers        3.172300
likes            3.940735
brand account    3.781035
age              2.431823
famous           0.047165
lgbtq            4.786310
dtype: float64


In [9]:
from sklearn.impute import SimpleImputer

In [10]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [18]:
df['ethnicity'] = df['ethnicity'].fillna('multiple')
df['gender'] = df['gender'].fillna('not_specific')

print(df['ethnicity'].value_counts())
print(df['gender'].value_counts())

ethnicity
White              105
South Asian         62
Latinx              41
Black               15
multiple            14
East Asian           8
Southeast Asian      7
Middle Eastern       4
Name: count, dtype: int64
gender
Male            133
Female           99
not_specific     24
Name: count, dtype: int64


In [20]:
df['genre'] = df['genre'].fillna('Unknown')

In [21]:
print(df.isna().sum())

rank             0
username         0
country          0
followers        0
views            0
likes            0
engagement       0
brand account    0
gender           0
age              0
ethnicity        0
famous           0
genre            0
lgbtq            0
dtype: int64


In [None]:
df['gender'].value_counts(dropna=False)

gender
Male            133
Female           99
not_specific     24
Name: count, dtype: int64

In [23]:
df[df['gender'].isna()]

Unnamed: 0,rank,username,country,followers,views,likes,engagement,brand account,gender,age,ethnicity,famous,genre,lgbtq


In [57]:
bins = [0, 18, 40, 60, 100]
labels = ['young', 'adult', 'middle-aged', 'senior']

df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

print(df)

      rank           username country   followers   views      likes  \
0      1.0     @charlidamelio   U.S.A  78900000.0   38.3m  6100000.0   
1      2.0         @addisonre   U.S.A  53700000.0   13.8m  2800000.0   
2      3.0          @zachking   U.S.A  47300000.0   17.6m  2800000.0   
3      4.0         @lorengray   U.S.A  46300000.0    3.2m   622600.0   
4      5.0            @tiktok   U.S.A  45300000.0    9.4m   503000.0   
..     ...                ...     ...         ...     ...        ...   
251  252.0  @ramneeksingh1313   India   8600000.0  479.1k    31600.0   
252  253.0       @amandacerny   U.S.A   8600000.0  654.7k    95200.0   
253  254.0        @rahimabram  Russia   8600000.0    1.6m   271200.0   
254  255.0     @johnnyorlando   U.S.A   8600000.0    1.1m   245400.0   
255  256.0            @sarati   U.S.A   8600000.0  459.5k    90400.0   

    engagement  brand account        gender   age    ethnicity  famous  \
0       16.60%            0.0        Female  16.0        Whit

In [None]:
def convert_to_number(x):
    if isinstance(x, str):
        x = x.lower().replace(',','')
        if 'k' in x:
            return float(x.replace('k','')) * 1_000
        elif 'm' in x:
            return float(x.replace('m','')) * 1_000_000
        else:
            return float(x)
    return x

df['likes'] = df['likes'].apply(convert_to_number)
df['followers'] = df['followers'].apply(convert_to_number)

In [29]:
print(df['likes'].head())
print(df['likes'].dtype)

0    6100000.0
1    2800000.0
2    2800000.0
3     622600.0
4     503000.0
Name: likes, dtype: float64
float64


In [68]:
df['engagement'] = df['engagement'].str.replace('%', '').astype(float)/100.0
df['views'] = df['views'].apply(convert_to_number)

In [70]:
x = df[['rank','country','engagement','followers','ethnicity','gender','famous','genre', 'views', 'age_group','lgbtq']]
y = df['likes']

genres = x['genre'].str.get_dummies(sep=',')

x = x.drop('genre', axis=1)
x = pd.concat([x,genres], axis=1)

x_encoded = pd.get_dummies(x, drop_first=True)

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size=0.2, random_state=42)

linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
from sklearn.ensemble import RandomForestRegressor

rfmodel = RandomForestRegressor(n_estimators=100, random_state=42)
rfmodel.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [67]:
from sklearn.model_selection import RandomizedSearchCV

param_dist= {
    'n_estimators': [100,200,300,400,500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}

rfmodel = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rfmodel,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1
)

rf_random.fit(x_train, y_train)
print("Best parameters found:", rf_random.best_params_)

best_rf = rf_random.best_estimator_

y_pred = best_rf.predict(x_test)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R2 score", r2_score(y_test, y_pred)*100)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=None, max_features=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=20, max_features=log2, min_samples_split=10, n_estimators=200; total time=   0.1s
[CV] END max_depth=20, max_features=log2, min_samples_split=10, n_estimators=200; total time=   0.2s
[CV] END max_depth=20, max_features=log2, min_samples_split=10, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, max_features=None, min_samples_split=2, n_estimators=500; total time=   0.6s
[CV] END max_depth=10, max_features=None, min_samples_split=2, n_estimators=500; total time=   0.6s
[CV] END max_depth=10, max_features=None, min_samples_split=2, n_estimators=500; total time=   0.6s
[CV] END max_depth=20, max_f

In [48]:
y_pred = linear_model.predict(x_test)

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R2 score", r2_score(y_test, y_pred)*100)

Mean Absolute Error: 561187.9290665983
R2 score 8.686814369599382


In [66]:
y_pred = rfmodel.predict(x_test)

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R2 score", r2_score(y_test, y_pred)*100)

Mean Absolute Error: 400484.76923076925
R2 score 16.403382542344826


In [None]:
new_data = pd.DataFrame({
    'rank': [25],
    'country': ['Kenya'],
    'engagement': [0.085],
    'followers': [45000],
    'ethnicity': ['African'],
    'gender': ['Female'],
    'famous': ['Yes'],       
    'genre': ['Dance'],
    'lgbtq': ['No']           
})

new_data_encoded = pd.get_dummies(new_data, drop_first=True)
new_data_encoded = new_data_encoded.reindex(columns=x_encoded.columns, fill_value=0)

predicted_likes = int(linear_model.predict(new_data_encoded)[0])
print ("predicted Likes:", predicted_likes)

predicted Likes: -850919


In [46]:
new_data = pd.DataFrame({
    'rank': [25],
    'country': ['Kenya'],
    'engagement': [0.085],
    'followers': [45000],
    'ethnicity': ['African'],
    'gender': ['Female'],
    'famous': ['Yes'],       
    'genre': ['Dance'],
    'lgbtq': ['No']           
})

new_data_encoded = pd.get_dummies(new_data, drop_first=True)
new_data_encoded = new_data_encoded.reindex(columns=x_encoded.columns, fill_value=0)

predicted_likes = int(rfmodel.predict(new_data_encoded)[0])
print ("predicted Likes:", predicted_likes)

predicted Likes: 245670


In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

scaler =StandardScaler()
x_scaled = scaler.fit_transform(x_encoded)

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred)*100)

MAE: 420032.6923076923
R2: 11.56020343838845


In [51]:
new_data = pd.DataFrame({
    'rank': [25],
    'country': ['Kenya'],
    'engagement': [0.085],
    'followers': [45000],
    'ethnicity': ['African'],
    'gender': ['Female'],
    'famous': ['Yes'],       
    'genre': ['Dance'],
    'lgbtq': ['No']           
})

new_data_encoded = pd.get_dummies(new_data, drop_first=True)
new_data_encoded = new_data_encoded.reindex(columns=x_encoded.columns, fill_value=0)

predicted_likes = int(knn.predict(new_data_encoded)[0])
print ("predicted Likes:", predicted_likes)

predicted Likes: 2565120




In [75]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

xgb_model.fit(x_train, y_train)

y_pred_xgb = xgb_model.predict(x_test)

print("XGBoost Model Performance:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_xgb))
print("R2 score:", r2_score(y_test, y_pred_xgb)*100)


XGBoost Model Performance:
Mean Absolute Error: 406443.7102050781
R2 score: 38.49594918396372


In [72]:
import xgboost as xgb

param_dist_xgb = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3, 4]
}

xgb_model_tuned = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_random = RandomizedSearchCV(
    estimator=xgb_model_tuned,
    param_distributions= param_dist_xgb,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1
)


xgb_random.fit(x_train, y_train)
print("Best parametres found:", xgb_random.best_params_)

best_xgb = xgb_random.best_estimator_

y_pred_best_xgb = best_xgb.predict(x_test)

print("\nTuned XGBoost Model Performance:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_best_xgb))
print("R2 score:", r2_score(y_test, y_pred_best_xgb)*100)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=500, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=8, min_child_weight=4, n_estimators=500, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=8, min_child_weight=4, n_estimators=500, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.2, max_depth=8, min_child_weight=4, n_estimators=500, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_dep

In [78]:
from sklearn.ensemble import StackingRegressor

estimators = [
    ('rf', rf_random.best_estimator_),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
]

stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
    cv=5
)
print('Training the stacking model...')
stacking_model.fit(x_train, y_train)

print("\nStacking Model Performance:")
y_pred_stacking= stacking_model.predict(x_test)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_stacking))
print("R2 score:", r2_score(y_test, y_pred_stacking) * 100)


Training the stacking model...

Stacking Model Performance:
Mean Absolute Error: 449457.6588483879
R2 score: 29.449800516048363
