In [1]:
# Mount Google Drive into the Colab runtime; later cells read from /content/drive/MyDrive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score

Step 0: load our data

In [3]:
# Read the fully concatenated metadata CSV from the mounted Drive.
# Rows are split on '\n' only, as requested through `lineterminator`.
metadata_csv_path = "/content/drive/MyDrive/Colab Notebooks/MSc thesis/processed_data/metadata/fully_concatenated_data.csv"
data = pd.read_csv(
    metadata_csv_path,
    lineterminator='\n',
)

In [4]:
# Preview the columns at positions 10 through 34 (the slice end is exclusive).
column_window = slice(10, 35)
data.iloc[:, column_window]

Unnamed: 0,caption,is_ad,Post ID,Sponsorship label,JSON file,Image files,Name,Followers,Followees,Posts,...,Year,Month,Day,Hour,Day of Week (Number),Day of Week (String),Is_English,Detected_Language,Image_Count,image_0
0,Preseason game 2 outfit.Vintage hat (which I t...,,184752,0,1354915397698488656.json,1354915397698488656.jpg,Rocketgirl,6453,710,1059,...,2016,10,6,7,3,Thursday,True,en,1,0.483804
1,Took Heidi to meet Santa today at @petsmart. S...,,184764,0,1666974101448132749.json,1666974101448132749.jpg,Rocketgirl,6453,710,1059,...,2017,12,10,20,6,Sunday,True,en,1,1.257535
2,4/18/18- Game 2 Rockets vs Wolves. Light up ha...,,184776,0,1768273356099895999.json,1768273356099895999.jpg,Rocketgirl,6453,710,1059,...,2018,4,29,15,6,Sunday,True,en,1,1.015909
3,4/18/18- Game 2 vs T-Wolves pics. Section 107 ...,False,184777,0,1770557472379860655.json,1770557376598782147.jpg,Rocketgirl,6453,710,1059,...,2018,5,2,18,2,Wednesday,True,en,7,0.612580
4,Season opener outfit. The left is kind of what...,,184780,0,1899586748141399903.json,1899586748141399903.jpg,Rocketgirl,6453,710,1059,...,2018,10,27,19,5,Saturday,True,en,1,0.961807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196174,Making 2018 my epic skin year. I’ve locked dow...,False,1519643,0,1689162745453535364.json,1689162599693269884.jpg,Z o e,4487,858,713,...,2018,1,10,11,2,Wednesday,True,en,2,0.160993
196175,The perfect storm ☁️ wearing @boohoo rib knit ...,,1519644,0,1695128267039160652.json,1695128267039160652.jpg,Z o e,4487,858,713,...,2018,1,18,17,3,Thursday,True,en,1,0.337574
196176,"Of all the Christmas movies, I relate to the G...",,1519650,0,1926097425697935397.json,1926097425697935397.jpg,Z o e,4487,858,713,...,2018,12,3,9,0,Monday,True,en,1,0.860733
196177,Werbung\nGuten Morgen ihr Lieben. Ich gehe mal...,,1403159,0,1717335338709878476.json,1717335338709878476.jpg,Zeynep,19215,401,149,...,2018,2,18,8,6,Sunday,False,de,1,0.039804


In [None]:
# Build `data_2`: the subset of metadata columns used for the descriptive
# analysis, restricted to posts with at most MAX_IMAGE_COUNT images.

# Names of the image / text feature columns. They are NOT selected below
# (see `columns_to_keep`); the lists are only defined here.
# NOTE(review): range(2049) yields image_0 .. image_2048, i.e. 2049 names —
# confirm this matches the actual number of image feature columns.
image_columns = [f'image_{i}' for i in range(2049)]
text_columns = [f'text_{i}' for i in range(768)]

# Metadata columns to keep. Names that are absent from `data` are silently
# ignored, because selection is done by dropping everything NOT listed here.
# NOTE(review): 'Language' does not show up in the resulting frame, while the
# raw data displays a 'Detected_Language' column — confirm which was intended.
additional_columns_to_keep = [
    'like_count', 'comment_count', 'post_id', 'Followers', 'Username',
    'Followees', 'Posts', 'Year', 'Month', 'Day of Week (String)', 'Hour',
    'Language', 'Is_English', 'Image_Count', 'Sponsorship label', 'is_video',
]

# Feature columns are deliberately left out for now; use
# `additional_columns_to_keep + text_columns + image_columns` to include them.
columns_to_keep = additional_columns_to_keep

# Everything that is not explicitly kept gets dropped. A set makes the
# per-column membership test constant-time on this very wide frame.
_keep_lookup = set(columns_to_keep)
columns_to_drop = [col for col in data.columns if col not in _keep_lookup]

# Maximum number of images a post may have to stay in the sample.
# NOTE(review): an earlier comment on this filter mentioned "10 or less", but
# the threshold in effect has always been 1 — confirm the intended value.
MAX_IMAGE_COUNT = 1

# Drop the unwanted columns, then keep only posts within the image limit.
# `.copy()` makes `data_2` an independent frame so later column assignments
# do not trigger SettingWithCopyWarning.
data_2 = data.drop(columns=columns_to_drop)
data_2 = data_2[data_2['Image_Count'] <= MAX_IMAGE_COUNT].copy()

# TODO: candidate columns that may be worth keeping as well:
#   - category and detected language
#   - 'comments_disabled'

In [None]:
# Show the first rows of the filtered frame.
preview_rows = 10
data_2.head(preview_rows)

Unnamed: 0,Username,post_id,like_count,comment_count,is_video,Sponsorship label,Followers,Followees,Posts,Year,Month,Hour,Day of Week (String),Is_English,Image_Count
0,00_rocketgirl,1354915397698488656,293,6,False,0,6453,710,1059,2016,10,7,Thursday,True,1
1,00_rocketgirl,1666974101448132749,73,2,False,0,6453,710,1059,2017,12,20,Sunday,True,1
2,00_rocketgirl,1768273356099895999,164,4,False,0,6453,710,1059,2018,4,15,Sunday,True,1
4,00_rocketgirl,1899586748141399903,116,2,False,0,6453,710,1059,2018,10,19,Saturday,True,1
9,0hkris,1514786978688118458,269,0,False,0,12630,7069,261,2017,5,21,Sunday,True,1
10,0hkris,1837331879491417999,1065,19,False,0,12630,7069,261,2018,8,21,Thursday,True,1
11,0hkris,1840089921320547693,1242,7,False,0,12630,7069,261,2018,8,17,Monday,True,1
12,0hkris,1847365723795798774,1346,4,False,0,12630,7069,261,2018,8,18,Thursday,True,1
13,0ldisme,1653111532051112326,1139,28,False,0,18583,523,487,2017,11,17,Tuesday,True,1
14,0ldisme,1708204647056992205,1352,39,False,0,18583,523,487,2018,2,18,Monday,True,1


Numerical columns

In [None]:
# Numeric columns summarised in the descriptive-statistics table.
columns_of_interest = [
    'like_count',
    'comment_count',
    'Followers',
    'Followees',
    'Posts',
]

# describe() reports count, mean, std, min, quartiles and max per column.
numeric_subset = data_2.loc[:, columns_of_interest]
descriptive_stats = numeric_subset.describe()

In [None]:
# Render the summary statistics as a LaTeX tabular, printing floats without decimals.
whole_number_format = "%.0f"
latex_table = descriptive_stats.to_latex(float_format=whole_number_format)

In [None]:
# print() writes the LaTeX source as plain text; a bare `latex_table`
# expression would display the escaped string repr instead.
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
 & like_count & comment_count & Followers & Followees & Posts \\
\midrule
count & 133642 & 133642 & 133642 & 133642 & 133642 \\
mean & 3926 & 75 & 138656 & 1582 & 1269 \\
std & 34946 & 917 & 1340448 & 3462 & 1687 \\
min & 0 & 0 & 198 & 0 & 91 \\
25% & 206 & 7 & 7811 & 593 & 469 \\
50% & 612 & 24 & 20973 & 995 & 846 \\
75% & 1720 & 60 & 62617 & 1881 & 1545 \\
max & 3850463 & 169030 & 119050781 & 304758 & 127520 \\
\bottomrule
\end{tabular}



Time columns