In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
data = pd.read_csv("skylab_instagram_datathon_dataset.csv", sep=";")
data.drop(columns=['period', 'calculation_type'], inplace=True)
data = data.sort_values(by="period_end_date")
data.drop(columns=['compset'], inplace=True)
data.drop_duplicates(inplace=True)

# Extra Statistics
We look into the possibility to combine couple of metrics and make a more general metric. Some ideas like "likes per picture" was added to see whether they can help to interpret the data.

In [3]:
# Static statistics - only taking into account the time right now
data['likes_per_picture'] = data['likes'] / data['pictures']
data['likes_per_follower'] = data['likes'] / data['followers']
data['likes_per_post'] = data['likes'] / (data['videos'] + data['pictures'])
data['comments_per_post'] = data['comments'] / (data['videos'] + data['pictures'])

In [5]:
statistics = ["followers", "pictures", "videos", "comments", "likes", "likes_per_picture", 
              "likes_per_follower", "likes_per_post", "comments_per_post"]

# Sector Trends

In [6]:
data['compset_group'].unique()

array(['Apparel Retail', 'Luxury & Premium & Mainstream', 'Food Products',
       'Petcare', 'Sportswear & Athleisure', 'Beverages', 'Restaurants',
       'Food Retail', 'Beauty & Boutique', 'Sporting Goods', 'Mattress',
       'Home Appliances', 'Building Products', 'Fitness & Exercise',
       'Dermatology and Orthodontics', 'Discount Retailers ',
       'Outdoor Gear', 'Toys & Collectibles', 'Study (All Brands)',
       'Entertainment'], dtype=object)

## All brands sector is just all the brands added together

In [7]:
data[lambda x: x['compset_group'] == "Study (All Brands)"].head()

Unnamed: 0,period_end_date,compset_group,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,followers,pictures,videos,comments,likes,likes_per_picture,likes_per_follower,likes_per_post,comments_per_post
386433,2015-01-03,Study (All Brands),All Brands,,,,,171794702.0,29517.0,1278.0,1890131.0,140808602.0,4770.423891,0.819633,4572.450138,61.377854
375635,2015-01-10,Study (All Brands),All Brands,,,,,175965438.0,29177.0,1133.0,2000976.0,147184135.0,5044.525997,0.836438,4855.959584,66.017024
378927,2015-01-17,Study (All Brands),All Brands,,,,,183957122.0,29597.0,1073.0,2125215.0,154957321.0,5235.575261,0.842356,5052.406945,69.292957
368625,2015-01-24,Study (All Brands),All Brands,,,,,187581498.0,31722.0,1080.0,2353339.0,165751346.0,5225.122817,0.883623,5053.08658,71.743766
374750,2015-01-31,Study (All Brands),All Brands,,,,,191216433.0,34045.0,1200.0,2514746.0,174657597.0,5130.198179,0.913403,4955.528359,71.350433


## Sector statistics
We looked into sectors to see if we have big difference in likes, comments, ... in different sectors. We found out that some sectors are doing really well in social media and the conclusion is that if we want to leverage Instagram data for investments, we need to look in these top sectors. For other sectors, the dataset might will be so useful.

In [63]:
gp = data.groupby(by=["compset_group","period_end_date"]).sum()
gp


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,Unnamed: 1_level_0,followers,pictures,videos,comments,likes,likes_per_picture,likes_per_follower,likes_per_post,comments_per_post
compset_group,period_end_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Apparel Retail,2015-01-03,360626147.0,45174.0,1468.0,3787532.0,319272826.0,630835.894769,42.005795,258692.951978,3741.170510
Apparel Retail,2015-01-10,368888197.0,44542.0,1296.0,3936798.0,338391270.0,616783.115554,43.857387,358722.932082,4255.463460
Apparel Retail,2015-01-17,376460955.0,44331.0,1208.0,4162301.0,354260892.0,,45.276428,383892.546862,4596.845317
Apparel Retail,2015-01-24,382612963.0,46816.0,1149.0,4671392.0,378254952.0,,47.038984,394694.021229,4883.177382
Apparel Retail,2015-01-31,389399029.0,49483.0,1209.0,4988392.0,395452206.0,,48.923475,410337.190463,5284.837869
...,...,...,...,...,...,...,...,...,...,...
Toys & Collectibles,2023-08-19,38430922.0,3284.0,1268.0,118544.0,7743198.0,64219.088644,3.527807,53228.470725,322.916041
Toys & Collectibles,2023-08-26,38593496.0,3334.0,1286.0,106474.0,8273532.0,67579.363914,3.711328,56819.447203,494.520341
Toys & Collectibles,2023-09-02,38738530.0,3370.0,1318.0,109042.0,8523852.0,67739.011592,3.318133,57706.504195,531.467591
Toys & Collectibles,2023-09-09,38845196.0,3290.0,1286.0,112038.0,9618242.0,76926.574752,3.655689,65377.450832,570.884575


In [64]:
pd.DataFrame(gp).loc['Apparel Retail']

Unnamed: 0_level_0,followers,pictures,videos,comments,likes,likes_per_picture,likes_per_follower,likes_per_post,comments_per_post
period_end_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-03,3.606261e+08,45174.0,1468.0,3787532.0,319272826.0,630835.894769,42.005795,258692.951978,3741.170510
2015-01-10,3.688882e+08,44542.0,1296.0,3936798.0,338391270.0,616783.115554,43.857387,358722.932082,4255.463460
2015-01-17,3.764610e+08,44331.0,1208.0,4162301.0,354260892.0,,45.276428,383892.546862,4596.845317
2015-01-24,3.826130e+08,46816.0,1149.0,4671392.0,378254952.0,,47.038984,394694.021229,4883.177382
2015-01-31,3.893990e+08,49483.0,1209.0,4988392.0,395452206.0,,48.923475,410337.190463,5284.837869
...,...,...,...,...,...,...,...,...,...
2023-08-19,5.230879e+09,67612.0,32946.0,5970863.0,248958954.0,511769.367039,6.020558,226287.651739,6816.118748
2023-08-26,5.233102e+09,67525.0,33319.0,6258199.0,251538399.0,482969.594718,6.138442,225139.772758,6699.666250
2023-09-02,5.238389e+09,66890.0,34440.0,6886325.0,263198851.0,505411.366905,6.618715,232306.955012,7367.843668
2023-09-09,5.243752e+09,67026.0,35104.0,7107169.0,289904995.0,534707.998568,6.721135,240563.521465,7591.810117


In [68]:
# in each sector at each time stamp, sum all the statistics

col = 'likes'
traces = []

for secname in data['compset_group'].unique():
    print(secname)
    sector = pd.DataFrame(gp).loc[secname]
    sector.reset_index(drop=True)

    trace = go.Scatter(
        x=sector.index,
        y=sector[col],
        mode='lines+markers',
        name=secname  # Use column name as trace name
    )
    traces.append(trace)

    # Create layout
    layout = go.Layout(
        title='Sector comparison',
        xaxis=dict(title='time'),
        yaxis=dict(title=col)
    )

# Create figure
fig = go.Figure(data=traces, layout=layout)

# Show plot
fig.show()


Apparel Retail
Luxury & Premium & Mainstream
Food Products
Petcare
Sportswear & Athleisure
Beverages
Restaurants
Food Retail
Beauty & Boutique
Sporting Goods
Mattress
Home Appliances
Building Products
Fitness & Exercise
Dermatology and Orthodontics
Discount Retailers 
Outdoor Gear
Toys & Collectibles
Study (All Brands)
Entertainment


In [66]:
# TODO average likes, comments, ... and rank the sectors

In [70]:
# dig deeper into the sector and rank the companies
col = 'likes'

secname = 'Apparel Retail'
secdf = data[data['compset_group'] == secname]

highfeat = data.groupby(by='business_entity_doing_business_as_name').mean()
highfeat.reset_index(inplace=True)
highfeat = highfeat.sort_values(by=col)
highfeat = highfeat[highfeat[col] > 1e6].copy()
corp_names = [x for x in highfeat['business_entity_doing_business_as_name'].unique() if x!='All Brands']
# corp_names = ['SHEIN']
display(highfeat)

traces = []
print(len(corp_names))
for ent in corp_names:
    print(ent)
    corp = secdf[secdf['business_entity_doing_business_as_name'] == ent].copy()
    corp = corp.sort_values(by='period_end_date')
    corp.reset_index(drop=True, inplace=True)

    trace = go.Scatter(
        x=corp['period_end_date'],
        y=corp[col],
        mode='lines+markers',
        name=ent  # Use column name as trace name
    )
    traces.append(trace)

    # Create layout
    layout = go.Layout(
        title='Companies in a sector',
        xaxis=dict(title='time'),
        yaxis=dict(title=col)
    )


# Create figure
fig = go.Figure(data=traces, layout=layout)

# Show plot
fig.show()



The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,business_entity_doing_business_as_name,followers,pictures,videos,comments,likes,likes_per_picture,likes_per_follower,likes_per_post,comments_per_post
418,Nasty Gal,3.531968e+06,57.373626,5.569231,81807.127473,1.071172e+06,17232.968797,0.362374,16024.233244,1113.360326
186,Dior Beauty,6.338262e+06,42.225888,27.966921,9675.522843,1.077916e+06,28278.071665,0.258292,15729.652796,138.508964
135,Champs Sports,1.889724e+06,207.764835,28.705495,9621.039560,1.078860e+06,5684.184839,0.749315,5030.090518,50.043091
48,Balenciaga,8.077196e+06,36.281095,1.555831,8907.828784,1.089741e+06,,0.129910,48538.412235,411.238023
331,L'Oreal Paris,6.324251e+06,108.262693,20.929204,8127.856512,1.093864e+06,9325.487405,0.503146,7586.695979,68.792989
...,...,...,...,...,...,...,...,...,...,...
328,Kylie Cosmetics,1.775205e+07,80.158809,12.811414,109437.287841,1.338273e+07,150317.355639,1.004685,130079.299406,1019.934341
672,Victoria's Secret,6.869237e+07,547.903297,106.019824,71452.402198,1.565459e+07,46682.396968,0.241381,40092.937902,174.587017
228,Fashion Nova,2.834515e+07,1001.736264,67.157428,162617.195604,1.950585e+07,19062.468733,0.362636,18451.515800,158.249997
19,All Brands,2.079877e+08,5786.791429,1004.965938,504938.533418,3.657754e+07,,1.118460,4466.017749,60.183450


84
Nasty Gal
Dior Beauty
Champs Sports
Balenciaga
L'Oreal Paris
Bulgari
Billabong
Bershka
Rolex
Hublot
Dr. Martens
e.l.f.
Christian Louboutin
Under Armour
Alo Yoga
Balmain
REVOLVE
Monster Energy
Savage X Fenty
Armani
Burberry
Anthropologie
Tommy Hilfiger
McDonald's
Fendi
Tiffany & Co.
Missguided
Pandora
Topshop
Alexander McQueen
Supreme
Michael Kors
Bath & Body Works
Ralph Lauren
Valentino
Daniel Wellington
Aeropostale
Hollister
Prada
Boohoo
Zara
Gymshark
Urban Decay Cosmetics
Asos
JD Sports
Funko
Maybelline
Puma
Tarte Cosmetics
Chanel
Off-White
Too Faced
Calvin Klein
Dolce & Gabbana
Versace
Pretty Little Thing
Primark
Benefit
Morphe
Starwars
NYX
ColourPop
Starbucks
Vans
H&M
Louis Vuitton
Sephora
MAC Cosmetics
Forever 21
Foot Locker
Dior
Gucci
Fenty Beauty
SHEIN
Red Bull
Urban Outfitters
Anastasia Beverly Hills
Nike
Adidas
Disney
Kylie Cosmetics
Victoria's Secret
Fashion Nova
MARVEL
