# Imports and utils

In [None]:
import os
import pandas as pd
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
import json
from collections import defaultdict
from tqdm import tqdm
import yaml

In [None]:
# The configuration file is looked up relative to the current working directory.
config_path = 'config.yaml'

with open(config_path, encoding='utf-8') as f:
    # safe_load only constructs plain YAML types (dict, list, str, numbers),
    # which is all this notebook needs: three path strings. yaml.load with
    # FullLoader can instantiate additional Python objects from tagged input.
    cfg = yaml.safe_load(f)

# Locations of the three data sources used throughout the notebook.
data_path_post = cfg['path_metadata']
data_path_map = cfg['path_mapping']
data_path_influ = cfg['path_influencers']

In [None]:
def simple_check_column(df, column):
    """Print the distinct values and the missing-value percentage of a column.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to summarise.

    Returns:
        None. The summary is written to standard output.
    """
    series = df[column]
    distinct_values = series.unique()
    print(f'Uniques: {distinct_values}')
    # isna().mean() is the fraction of missing entries; scale it to percent.
    missing_percent = series.isna().mean() * 100
    print(f"Nan: {missing_percent} %")
    

def edge_check_column(df, column):
    """Print diagnostics for an edge-style column and return it expanded.

    Every value of ``df[column]`` is expanded with ``pd.Series`` so that the
    keys of the stored mappings become columns. The function prints the head
    of the raw column, its missing-value percentage, the head of the expanded
    frame, and whether every non-missing ``edges`` entry is empty. If the
    expanded frame has a ``count`` column and at least one row with ``edges``,
    a horizontal box plot of ``count`` is shown.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to inspect.

    Returns:
        The expanded DataFrame, indexed like ``df``.

    Raises:
        KeyError: If the expanded frame has no ``edges`` column.
    """
    print(f"Head of column '{column}'\n {df[column].head()}")
    print(f"Nan: {df[column].isna().mean() * 100} %")
    # NOTE: the row-wise pd.Series expansion dominates the runtime of this
    # function; it is kept because it tolerates missing values in the column.
    edge_df = pd.DataFrame(df[column].apply(pd.Series))
    print(f"Head of df based on column '{column}'\n {edge_df.head()}")
    significant_data = edge_df[~edge_df['edges'].isna()]
    # Inspect the length of each individual `edges` value. Applying len() to
    # the DataFrame itself would measure column lengths instead.
    every_edge_empty = significant_data['edges'].apply(lambda x: len(x) == 0).all()
    print(f"Is every edge empty? {every_edge_empty}")
    # Check the columns directly so an empty frame does not raise IndexError.
    if not significant_data.empty and 'count' in significant_data.columns:
        fig = px.box(significant_data, x='count', orientation='h', title='Poziomy Boxplot')
        fig.show()
    return edge_df

def string_to_dict(input_string):
    """Parse a JSON string; return an empty dict for any non-string input.

    Args:
        input_string: JSON text, or any other value (e.g. ``None`` or NaN).

    Returns:
        The decoded JSON value when ``input_string`` is a string (including
        ``str`` subclasses), otherwise ``{}``.

    Raises:
        json.JSONDecodeError: If a string is passed that is not valid JSON.
    """
    # isinstance also accepts str subclasses, which `type(x) is str` rejects.
    return json.loads(input_string) if isinstance(input_string, str) else {}


# Read data

In [None]:
# Accumulates the parsed content of each .info file (one dict per file).
data_list = []

# Walk the metadata directory; entries that are not .info files are skipped.
for filename in tqdm(os.listdir(data_path_post), desc="Wczytywanie plików .info"):
    if not filename.endswith('.info'):
        continue

    file_path = os.path.join(data_path_post, filename)

    # Each .info file is a JSON document.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    data_list.append(json_data)

# Build the DataFrame from the list of dicts.
posts_df = pd.DataFrame(data_list)

# Drop the intermediate list (open question from the author: does this help in notebooks?).
del data_list


In [None]:
mapping_df = pd.read_csv(data_path_map, sep='\t')

In [None]:
# Tab-separated file: line 0 is used as the header and skiprows=[1] drops the
# line that immediately follows it.
influ_df = pd.read_csv(data_path_influ, delimiter='\t', header=0, skiprows=[1])

# Basic info

In [None]:
posts_df.columns

In [None]:
posts_df.head()

In [None]:
posts_df.info(verbose=False)

# Specific column analysis

## gating_info

In [None]:
simple_check_column(posts_df, 'gating_info')

## viewer_can_reshare

In [None]:
simple_check_column(posts_df, 'viewer_can_reshare')

## display_resources

In [None]:
# Percentage of posts whose display_resources list is empty. mean() of the
# boolean mask gives the share; the previous sum() produced a raw count.
f"Nan: {posts_df['display_resources'].apply(lambda x: len(x) == 0).mean() * 100} %"

In [None]:
posts_df['display_resources'].apply(lambda x: len(x) == 0).any()

In [None]:
posts_df['display_resources']

In [None]:
posts_df['display_resources'][1]

In [None]:
id_few_photos_post = '1765627446136050586'

In [None]:
# Look up the post through the id variable defined in the previous cell
# instead of repeating the literal.
posts_df[posts_df['id'] == id_few_photos_post]['display_resources'].iloc[0]

Ręczne wyszukanie id w JSON-Image_files_mapping.txt sugeruje wiele źródeł pobrania dla jednego posta; liczba zdjęć nie ma wpływu.

## viewer_in_photo_of_you 

In [None]:
simple_check_column(posts_df, 'viewer_in_photo_of_you')

## viewer_has_saved_to_collection

In [None]:
simple_check_column(posts_df, 'viewer_has_saved_to_collection')

## viewer_has_saved

In [None]:
simple_check_column(posts_df, 'viewer_has_saved')

## owner

### Basic

In [None]:
posts_df['owner'].head()

In [None]:
posts_df['owner'].iloc[0]

In [None]:
f"Nan: {posts_df['owner'].isna().mean() * 100} %"

In [None]:
# Expand every post's `owner` value into its own columns (one row per post,
# same index as posts_df).
owner_df = pd.DataFrame(posts_df['owner'].apply(pd.Series))
# Disabled: would keep a single row per owner id.
#owner_df = owner_df.drop_duplicates(subset=['id'])

### Depth

#### username

In [None]:
# Do usernames have an impact? Sentiment for username?

In [None]:
# Count the .info files whose embedded owner username differs from the part of
# the file name that precedes the first '-'.
tmp = 0
a = []  # initialised here but not used in this cell
for filename in tqdm(os.listdir(data_path_post), desc="Wczytywanie plików .info"):
    if not filename.endswith('.info'):
        continue

    file_path = os.path.join(data_path_post, filename)

    # Load the JSON document and compare its owner with the file-name prefix.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
        filename_prefix = filename.split('-')[0]
        if json_data['owner']['username'] != filename_prefix:
            tmp += 1
tmp
# Author's note: the mismatches look suspicious.


#### is_unpublished

In [None]:
simple_check_column(owner_df, 'is_unpublished')

#### requested_by_viewer

In [None]:
simple_check_column(owner_df, 'requested_by_viewer')

#### followed_by_viewer

In [None]:
simple_check_column(owner_df, 'followed_by_viewer')

#### has_blocked_viewer

In [None]:
simple_check_column(owner_df, 'has_blocked_viewer')

#### profile_pic_url

In [None]:
print(f"Nan: {owner_df['profile_pic_url'].isna().mean() * 100} %")

In [None]:
owner_df['profile_pic_url']

#### full_name

In [None]:
print(f"Nan: {owner_df['full_name'].isna().mean() * 100} %")

#### blocked_by_viewer

In [None]:
simple_check_column(owner_df, 'blocked_by_viewer')

#### is_verified

In [None]:
simple_check_column(owner_df, 'is_verified')

In [None]:
percent_verified = owner_df['is_verified'].mean() * 100

print(f'Procent kont zweryfikowanych: {percent_verified:.2f}%')

In [None]:
fig = px.histogram(owner_df, x='is_verified', title='Liczba Wystąpień is_verified')

fig.show()

In [None]:
print(f"Has username always same status of is_verified: {owner_df.groupby('username')['is_verified'].nunique().eq(1).all()}")

In [None]:
# One row per username; max() of the per-post flags is True when any post of
# that user is marked verified (assuming boolean values — TODO confirm).
usernames_df = owner_df.groupby('username')['is_verified'].max().reset_index()
# Left join: usernames absent from influ_df are kept with NaN in its columns.
usernames_df = usernames_df.merge(influ_df, left_on='username', right_on='Username', how='left')

In [None]:
# posts_df usernames dont match influ_df
usernames_df[usernames_df['Category'].isna()]

In [None]:
usernames_df = usernames_df[~usernames_df['Category'].isna()]
usernames_df.corr(numeric_only=True)

In [None]:
# NOTE(review): `a` is a list after the username-check cell and only becomes a
# DataFrame in the following cell, so this cell fails when the notebook is run
# top to bottom. After the following cell runs, 'Followers_quantile' already
# holds string labels, which have no `.left`/`.right` — confirm whether this
# leftover cell can be removed.
a['Followers_quantile'].apply(lambda x: str(int(x.left)) +', ' + str(int(x.right)))

In [None]:

# Scratch exploration: share of verified accounts per follower decile.
a = usernames_df.sort_values(by='#Followers', ascending=True)
# Split #Followers into ten quantile bins.
b = pd.qcut(a['#Followers'], q=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
# Replace each interval by a "left, right" label with integer-truncated bounds.
a['Followers_quantile'] = b.apply(lambda x: str(int(x.left)) +', ' + str(int(x.right)))
# NOTE(review): this result is neither assigned nor the cell's last expression,
# so it is computed and discarded.
a.groupby(a['Followers_quantile'])['is_verified'].apply(lambda x: x.astype(float).mean()*100).reset_index()
a['Followers_quantile']

In [None]:
# Quantile edges for the ten equal-frequency bins used by every breakdown below.
decile_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]


def _decile_labels(values):
    """Bin a numeric Series into deciles and return "left, right" labels per row."""
    bins = pd.qcut(values, q=decile_edges)
    return bins.apply(lambda x: f'{int(x.left)}, {int(x.right)}')


def _verified_share(df, group_column):
    """Return the fraction of verified accounts for each value of ``group_column``."""
    return (
        df.groupby(df[group_column])['is_verified']
        .apply(lambda x: x.astype(float).mean())
        .reset_index()
    )


# 1. Share of verified accounts per category.
category_verification = usernames_df.groupby('Category')['is_verified'].mean().reset_index()

usernames_df = usernames_df.sort_values(by='#Followers', ascending=True)

# 2. Share of verified accounts per #Followers decile.
usernames_df['Followers_quantile'] = _decile_labels(usernames_df['#Followers'])
followers_verification = _verified_share(usernames_df, 'Followers_quantile')

# 3. Share of verified accounts per #Followees decile. The labels are built
#    from the #Followees bins; previously the #Followers bins were reused here.
usernames_df['Followees_quantile'] = _decile_labels(usernames_df['#Followees'])
followees_verification = _verified_share(usernames_df, 'Followees_quantile')

# 4. Share of verified accounts per #Posts decile.
usernames_df['Posts_quantile'] = _decile_labels(usernames_df['#Posts'])
posts_verification = _verified_share(usernames_df, 'Posts_quantile')

# Create the 2x2 grid of subplots.
fig = sp.make_subplots(rows=2, cols=2, subplot_titles=['Procent kont zweryfikowanych dla każdej kategorii', 
                                                    'Procent kont zweryfikowanych dla kolejnych 10% najwyższych #Followers',
                                                    'Procent kont zweryfikowanych dla kolejnych 10% najwyższych #Followees',
                                                    'Procent kont zweryfikowanych dla kolejnych 10% najwyższych #Posts'])

# (data, x column, row, col, x-axis title) for each panel. The category panel
# gets its own axis title because its x values are categories, not intervals.
quantile_axis_title = 'Przedziały wartośći (kolejne kwantyle)'
panels = [
    (category_verification, 'Category', 1, 1, 'Kategoria'),
    (followers_verification, 'Followers_quantile', 1, 2, quantile_axis_title),
    (followees_verification, 'Followees_quantile', 2, 1, quantile_axis_title),
    (posts_verification, 'Posts_quantile', 2, 2, quantile_axis_title),
]

# Add one bar trace per panel; fractions are scaled to percent for display.
for panel_df, x_column, row, col, x_title in panels:
    fig.add_trace(go.Bar(x=panel_df[x_column], y=panel_df['is_verified'] * 100, name='Procent zweryfikowanych'),
                  row=row, col=col)
    fig.update_yaxes(title_text='Procenty', row=row, col=col)
    fig.update_xaxes(title_text=x_title, row=row, col=col)

# Update the overall layout.
fig.update_layout(showlegend=False, title_text="Analiza Zweryfikowanych Kont w Social Media", height=700)

# Show the figure.
fig.show()

In [None]:
del usernames_df

#### id

In [None]:
print(f"Nan: {owner_df['id'].isna().mean() * 100} %")

In [None]:
percent_duplicates = owner_df['id'].duplicated().mean() * 100

print(f'Procent duplikatów w kolumnie \'id\': {percent_duplicates:.2f}%')

In [None]:
print(f"Avg post per user: {len(owner_df)/len(owner_df.drop_duplicates(subset=['id']))}")

#### is_private

In [None]:
simple_check_column(owner_df, 'is_private')

## viewer_has_liked

In [None]:
simple_check_column(posts_df, 'viewer_has_liked')

## id

In [None]:
posts_df['id'].duplicated().any()

In [None]:
f" Nan: {posts_df['id'].isna().mean() * 100} %"

## should_log_client_event

In [None]:
simple_check_column(posts_df, 'should_log_client_event')

## edge_media_preview_like

### Basic

In [None]:
media_preview_df = edge_check_column(posts_df, 'edge_media_preview_like')

### Depth

### Delete

In [None]:
del media_preview_df

## edge_media_to_tagged_user

### Basic

In [None]:
media_tagged_df = edge_check_column(posts_df, 'edge_media_to_tagged_user')

### Depth

### Delete

In [None]:
del media_tagged_df

## dimensions

In [None]:
posts_df['dimensions']

In [None]:
f"Nan: {posts_df['dimensions'].isna().mean() * 100} %"

In [None]:
posts_df['dimensions'].apply(lambda x: len(x) == 2).all()

In [None]:

# Flatten the `dimensions` records into one column per key.
normalized_df = pd.json_normalize(posts_df['dimensions'])

# Minimum and maximum of every resulting column (expected to be 'width' and
# 'height' — TODO confirm against the data).
normalized_df.agg(['min', 'max'])

In [None]:
del normalized_df

## __typename

In [None]:
simple_check_column(posts_df, '__typename')

In [None]:
posts_df['__typename']

In [None]:
# Build "<id>.info" keys so each post can be joined to the mapping table.
check_typename_df = posts_df[['id', '__typename']].copy()
check_typename_df['id'] = check_typename_df['id'].apply(lambda post_id: post_id + '.info')
check_typename_df = check_typename_df.merge(
    mapping_df, left_on='id', right_on='JSON_PostMetadata_file_name'
)

# Number of comma-separated entries in Image_file_name for every joined post.
image_counts = check_typename_df['Image_file_name'].apply(lambda names: len(names.split(',')))

is_graph_image = check_typename_df['__typename'] == 'GraphImage'
is_graph_sidecar = check_typename_df['__typename'] == 'GraphSidecar'

# GraphImage posts that do not list exactly one image file.
condition1 = is_graph_image & (image_counts != 1)

# GraphSidecar posts that list fewer than two image files.
condition2 = is_graph_sidecar & (image_counts < 2)

In [None]:
check_typename_df[condition1].head()

In [None]:
check_typename_df[condition2].head()

In [None]:
del check_typename_df 

## location

#### Basic

In [None]:
posts_df['location']

In [None]:
print(f"Nan: {posts_df['location'].isna().mean() * 100} %")

In [None]:
# Flatten the `location` records into columns.
location_df = pd.json_normalize(posts_df['location'])
# address_json holds JSON text; string_to_dict parses strings and maps every
# non-string value (e.g. missing entries) to an empty dict.
location_df['address_json'] = location_df['address_json'].apply(string_to_dict)

In [None]:
location_df.head()

In [None]:
location_df = pd.json_normalize(location_df['address_json'])


In [None]:
location_df.head()

#### Depth

In [None]:
location_df.info()

In [None]:
# Name the index and the counts explicitly: the column names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', '<column>')
# and pandas 2.x ('<column>', 'count'), so relying on them breaks on upgrade.
country_counts = (
    location_df['country_code']
    .value_counts()
    .nlargest(50)
    .rename_axis('Country Code')
    .reset_index(name='Count')
)
fig = px.bar(country_counts, x='Country Code', y='Count',
             title='Top 50 Wystąpień Wartości w Kolumnie country_code')

# Show the figure.
fig.show()

#### Delete

In [None]:
del location_df

## shortcode

In [None]:
posts_df['shortcode']

## is_ad

In [None]:
simple_check_column(posts_df, 'is_ad')

## caption_is_edited

In [None]:
simple_check_column(posts_df, 'caption_is_edited')

In [None]:
# Name the index and the counts explicitly so the frame has the same columns
# on pandas 1.x and 2.x (value_counts().reset_index() naming changed in 2.0).
# The names match the axis labels the chart showed before.
caption_edit_counts = (
    posts_df['caption_is_edited']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='caption_is_edited')
)
fig = px.bar(caption_edit_counts, x='index', y='caption_is_edited',
             title='Is caption edited?')

# Show the figure.
fig.show()

## edge_media_to_parent_comment

### Basic

In [None]:
media_parent_df = edge_check_column(posts_df, 'edge_media_to_parent_comment')

### Depth

### Delete

In [None]:
del media_parent_df

## media_preview

In [None]:
simple_check_column(posts_df, 'media_preview')

## taken_at_timestamp

In [None]:
simple_check_column(posts_df, 'taken_at_timestamp')

In [None]:
# Interpret taken_at_timestamp as seconds since the epoch (unit='s').
posts_df['timestamp'] = pd.to_datetime(posts_df['taken_at_timestamp'], unit='s')

# Derive calendar parts (year, month, day of month, hour) as separate columns.
posts_df['year'] = posts_df['timestamp'].dt.year
posts_df['month'] = posts_df['timestamp'].dt.month
posts_df['day'] = posts_df['timestamp'].dt.day
posts_df['hour'] = posts_df['timestamp'].dt.hour

In [None]:
# Post counts per year, ordered by year.
histogram_data_year = posts_df['year'].value_counts().sort_index()
years = histogram_data_year.index
counts_year = histogram_data_year.values

# Post counts per month, ordered by month number.
histogram_data_month = posts_df['month'].value_counts().sort_index()
months = histogram_data_month.index
counts_month = histogram_data_month.values

# Post counts per day of the month.
histogram_data_day = posts_df['day'].value_counts().sort_index()
days = histogram_data_day.index
counts_day = histogram_data_day.values

# Post counts per hour. These get their own variable so the per-day counts
# above are not overwritten.
histogram_data_hour = posts_df['hour'].value_counts().sort_index()
hours = histogram_data_hour.index
counts_hour = histogram_data_hour.values

In [None]:
fig = sp.make_subplots(rows=2, cols=2, subplot_titles=('Częstość Wystąpień w Poszczególnych Latach', 'Częstość Wystąpień w Poszczególnych Miesiącach',
                                                      'Częstość Wystąpień w Poszczególnych Dniach', 'Częstość Wystąpień w Poszczególnych Godzinach'))

# One entry per panel: (x values, bar heights, x-axis title, row, col),
# in the order years, months, days, hours.
time_panels = [
    (years, counts_year, 'Rok', 1, 1),
    (months, counts_month, 'Miesiąc', 1, 2),
    (days, counts_day, 'Dzień', 2, 1),
    (hours, counts_hour, 'Godzina', 2, 2),
]

# Add the bar trace and axis titles for each panel.
for x_values, bar_heights, x_title, row, col in time_panels:
    fig.add_trace(go.Bar(x=x_values, y=bar_heights), row=row, col=col)
    fig.update_xaxes(title_text=x_title, row=row, col=col)
    fig.update_yaxes(title_text='Częstość', row=row, col=col)

# Set the overall figure title.
fig.update_layout(title_text='Analiza Częstości Wystąpień w Poszczególnych Kategoriach')

# Show the figure.
fig.show()

In [None]:
columns_to_remove = ['timestamp', 'year', 'month', 'day', 'hour']
posts_df.drop(columns=columns_to_remove, inplace=True)

## edge_media_to_caption

### Basic

In [None]:
media_caption_df = edge_check_column(posts_df, 'edge_media_to_caption')

### Depth

### Delete

In [None]:
del media_caption_df

## tracking_token

In [None]:
simple_check_column(posts_df, 'tracking_token')

## has_ranked_comments

In [None]:
simple_check_column(posts_df, 'has_ranked_comments')

## display_url

In [None]:
simple_check_column(posts_df, 'display_url')

## edge_web_media_to_related_media

### Basic

In [None]:
web_media_df = edge_check_column(posts_df, 'edge_web_media_to_related_media')

### Depth

### Delete

In [None]:
del web_media_df

## edge_media_preview_comment

### Basic

In [None]:
media_preview_df = edge_check_column(posts_df, 'edge_media_preview_comment')

### Depth

### Delete

In [None]:
del media_preview_df

## comments_disabled

In [None]:
simple_check_column(posts_df, 'comments_disabled')

In [None]:
percentage_true = (posts_df['comments_disabled'] == True).mean() * 100
print(f'Comments disbled: {percentage_true:.2f}%')

## edge_media_to_sponsor_user

### Basic

In [None]:
media_sponsor_df = edge_check_column(posts_df, 'edge_media_to_sponsor_user')

### Depth

### Delete

In [None]:
del media_sponsor_df

## accessibility_caption

In [None]:
simple_check_column(posts_df, 'accessibility_caption')

In [None]:
# Tally how many posts carry each distinct accessibility caption.
counts_dict = defaultdict(int)
for access_caption in posts_df['accessibility_caption']:
    counts_dict[access_caption] = counts_dict[access_caption] + 1

# Order captions from most to least frequent; ties keep first-seen order.
sorted_counts = sorted(counts_dict.items(), key=lambda pair: -pair[1])

# The 30 most frequent and the 20 least frequent captions.
top_captions = dict(sorted_counts[:30])
rare_captions = dict(sorted_counts[-20:])

In [None]:

fig = px.bar(x=top_captions.keys(), y=top_captions.values(),
             labels={'x': 'Accessibilty caption', 'y': 'Count'},
             title='Top Accessibilty caption')

# Wyświetl wykres
fig.show()

In [None]:

fig = px.bar(x=rare_captions.keys(), y=rare_captions.values(),
             labels={'x': 'Accessibilty caption', 'y': 'Count'},
             title='Top rarest Accessibilty caption')

# Wyświetl wykres
fig.show()

In [None]:
# Distinct captions whose text form does not begin with 'Image'.
[
    caption
    for caption in posts_df['accessibility_caption'].unique()
    if not str(caption).startswith('Image')
]

## is_video

In [None]:
simple_check_column(posts_df, 'is_video')

## edge_media_to_comment

### Basic

In [None]:
media_comment_df = edge_check_column(posts_df, 'edge_media_to_comment')

### Depth

### Delete

In [None]:
del media_comment_df

## edge_sidecar_to_children

### Basic

In [None]:
sidecar_children_df = edge_check_column(posts_df, 'edge_sidecar_to_children')

### Depth

### Delete

In [None]:
del sidecar_children_df

## fact_check_information

In [None]:
simple_check_column(posts_df, 'fact_check_information')