# Imports and utils

In [None]:
import os
import pandas as pd
import plotly.express as px
import yaml

In [None]:
# Location of the YAML configuration, relative to the current working directory.
config_path = 'config.yaml'

# safe_load builds only plain YAML types (mappings, lists, scalars), which is
# all a configuration file needs. The encoding is given explicitly so reading
# does not depend on the platform's default locale.
with open(config_path, encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# Path of the influencers data file, and the collection of category names
# used further down to filter and group the data.
data_path = cfg['path_influencers']
categories = cfg['image_categories']

# Read data

In [None]:
# Load the tab-separated influencers file. The first line is used as the
# header and the line directly below it (file line index 1) is skipped.
df = pd.read_csv(
    data_path,
    sep='\t',
    header=0,
    skiprows=[1],
)

# Basic info

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Print a summary of the table: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the table's columns.
df.describe()

In [None]:
# Build one histogram per count column; the three figures keep their own names.
fig1, fig2, fig3 = (
    px.histogram(df, x=column, title=title)
    for column, title in (
        ('#Posts', 'Rozkład #Posts'),
        ('#Followers', 'Rozkład #Followers'),
        ('#Followees', 'Rozkład #Followees'),
    )
)

# Display the charts
for figure in (fig1, fig2, fig3):
    figure.show()

In [None]:
# Pairwise correlation matrix; numeric_only=True leaves non-numeric columns out.
df.corr(numeric_only=True)

In [None]:
# Number of rows per category as a two-column table ('Category', 'count').
counts = (
    df['Category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='count')
)

# Bar chart of the category frequencies in Plotly
fig = px.bar(
    counts,
    x='Category',
    y='count',
    title="Ilość wystąpień kategorii",
)
fig.show()

In [None]:
# Rows whose 'Category' value is not among the configured categories.
# A manual check showed that these are real profiles with an incorrect name
# (presumably the category name; confirm against the data).
df[~df['Category'].isin(categories)]


In [None]:
# Keep only the rows whose 'Category' is one of the configured categories.
df = df.loc[df['Category'].isin(categories)]

In [None]:
# Top 5 accounts with the highest '#Posts' values
top_posts = df.nlargest(n=5, columns='#Posts')
print("Top 5 kont z największą ilością #Posts:", top_posts, sep="\n")



In [None]:
# Top 5 accounts with the highest '#Followers' values
top_followers = df.nlargest(n=5, columns='#Followers')
print("\nTop 5 kont z największą ilością #Followers:", top_followers, sep="\n")

In [None]:
# Top 5 accounts with the highest '#Followees' values
top_followees = df.nlargest(n=5, columns='#Followees')
print("\nTop 5 kont z największą ilością #Followees:", top_followees, sep="\n")

In [None]:
# Box plot of '#Posts' per category. The title typo "libczy" is corrected to
# "liczby", matching the wording of the '#Followees' chart title.
fig = px.box(
    df,
    x='Category',
    y='#Posts',
    title='Rozkład liczby postów u influencerów przypisanych do kategorii',
)
fig.show()

In [None]:
# Box plot of '#Followers' per category. The title typo "liczy" is corrected to
# "liczby", matching the wording of the '#Followees' chart title.
fig = px.box(
    df,
    x='Category',
    y='#Followers',
    title='Rozkład liczby Followers u influencerów przypisanych do kategorii',
)
fig.show()

In [None]:
# Box plot of '#Followees' per category.
fig = px.box(
    data_frame=df,
    x='Category',
    y='#Followees',
    title='Rozkład liczby Followees u influencerów przypisanych do kategorii',
)
fig.show()

In [None]:
# Columns whose pairwise correlations are computed within each category.
metric_columns = ['#Posts', '#Followers', '#Followees']


def _sorted_by_correlation(values_by_category):
    """Return the mapping as a dict ordered from highest to lowest correlation.

    NaN correlations (e.g. for a category with too few rows) are placed last.
    NaN does not compare consistently with other numbers, so sorting on it
    directly would make the resulting order unpredictable.
    """
    return dict(sorted(
        values_by_category.items(),
        key=lambda item: float('-inf') if pd.isna(item[1]) else item[1],
        reverse=True,
    ))


correlations_posts_followers = {}
correlations_posts_followees = {}
correlations_followers_followees = {}

# One correlation matrix per configured category, restricted to that
# category's rows; the three pairwise coefficients are stored by category.
for category in categories:
    subset = df[df['Category'] == category]
    corr_matrix = subset[metric_columns].corr()
    correlations_posts_followers[category] = corr_matrix.loc['#Posts', '#Followers']
    correlations_posts_followees[category] = corr_matrix.loc['#Posts', '#Followees']
    correlations_followers_followees[category] = corr_matrix.loc['#Followers', '#Followees']

# Reorder each mapping so the strongest positive correlation comes first.
correlations_posts_followers = _sorted_by_correlation(correlations_posts_followers)
correlations_posts_followees = _sorted_by_correlation(correlations_posts_followees)
correlations_followers_followees = _sorted_by_correlation(correlations_followers_followees)

In [None]:
# Report the '#Posts' vs '#Followers' correlation of every category,
# in the order the dictionary was sorted.
print("Correlation between #Posts and #Followers")
for category in correlations_posts_followers:
    correlation = correlations_posts_followers[category]
    print(f"Category {category}: {correlation:.2f}")

In [None]:
# Report the '#Posts' vs '#Followees' correlation of every category,
# in the order the dictionary was sorted.
print("Correlation between #Posts and #Followees")
for category in correlations_posts_followees:
    correlation = correlations_posts_followees[category]
    print(f"Category {category}: {correlation:.2f}")

In [None]:
# Report the '#Followers' vs '#Followees' correlation of every category,
# in the order the dictionary was sorted.
print("Correlation between #Followers and #Followees")
for category in correlations_followers_followees:
    correlation = correlations_followers_followees[category]
    print(f"Category {category}: {correlation:.2f}")