In [None]:
import json
import pandas as pd
import os

## Data Loading

In [10]:
# Directory holding the JSON export files to analyse.
folder_path = '/home/ivan/code/kostovI/facebook_analyser/raw_data'

# os.listdir returns entries in arbitrary order; sort them so the load order --
# and therefore `message_dict`, which still holds the last file's contents after
# the loop and is read by a later cell -- is the same on every run.
files = sorted(
    entry for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)

# Flat list of the raw message records collected from every export file.
messages_raw = []

for file in files:
    with open(os.path.join(folder_path, file), encoding='utf-8') as export_file:
        message_dict = json.load(export_file)
    messages_raw.extend(message_dict['messages'])

In [11]:
def cyrillic_decoder(bad_decoding):
    """Repair text whose UTF-8 bytes were mis-read as Latin-1 characters.

    The string is re-encoded as Latin-1 to recover the original bytes, which
    are then decoded as UTF-8 (this is what turns the mojibake back into
    Cyrillic script).

    Parameters
    ----------
    bad_decoding : str
        Possibly mis-decoded text.

    Returns
    -------
    str
        The repaired text. If the round-trip is not possible -- the string
        contains characters outside Latin-1 (e.g. it is already correct
        Cyrillic) or its bytes are not valid UTF-8 -- the input is returned
        unchanged instead of raising.
    """
    try:
        return bad_decoding.encode('latin1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError: the text was
        # not mojibake of this kind, so leave it as it is.
        return bad_decoding

In [12]:
# Chat-level metadata, read from `message_dict` (whatever export file the
# loading loop parsed last).
name_chat = cyrillic_decoder(message_dict['title'])
members = [
    cyrillic_decoder(participant['name'])
    for participant in message_dict['participants']
]
# Undecoded messages of that same file only.
messages_bad = message_dict['messages']

## Preprocessing (decode mis-encoded characters back to Cyrillic)

In [13]:
# Keep only the messages that carry a 'content' field and repair the
# mis-encoded sender name and text of each one.
#
# Built as a comprehension so that no module-level name is rebound: the
# `message_dict` loaded from the export file stays intact, and the cell that
# reads its 'title' / 'participants' can be re-run after this one.
# Testing key membership (rather than comparing against a sentinel string)
# also keeps a message whose text happens to be exactly 'no_content'.
messages = [
    {
        'sender_name': cyrillic_decoder(message['sender_name']),
        'timestamp_ms': message['timestamp_ms'],
        'content': cyrillic_decoder(message['content']),
    }
    for message in messages_raw
    if 'content' in message
]


In [22]:
# Build the frame in one chain:
#   * `timestamp_ms` is converted from epoch milliseconds to datetimes,
#   * `date` is the calendar date derived from the converted timestamps,
#   * rows are ordered newest-first (the original row index is kept).
messages_df = (
    pd.DataFrame.from_dict(messages)
    .assign(
        timestamp_ms=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'),
        date=lambda frame: frame['timestamp_ms'].dt.date,
    )
    .sort_values(by='timestamp_ms', ascending=False)
)

In [23]:
# Preview the first 20 rows; the frame was sorted by timestamp in descending
# order, so these are the most recent messages.
messages_df.head(20)

Unnamed: 0,sender_name,timestamp_ms,content,date
32486,–ò–≤–∞–Ω –ù–µ—Å—Ç–æ—Ä–æ–≤,2025-05-24 10:34:41.256,–ß–µ—Å—Ç–∏—Ç –ø—Ä–∞–∑–Ω–∏–∫!,2025-05-24
32487,Dobri Madzharov,2025-05-24 10:28:10.920,ü§çüíö‚ù§Ô∏è,2025-05-24
32488,Dobri Madzharov,2025-05-24 10:27:48.040,4estit den na Bulgarskata kultura i pismenost,2025-05-24
32489,–ò–≤–∞–Ω –ù–µ—Å—Ç–æ—Ä–æ–≤,2025-05-11 23:08:38.677,–¢–∞–∑–∏ –≥–æ–¥–∏–Ω–∞ –Ω–∏–≤–æ—Ç–æ –±–µ—à–µ –¥–æ—Å—Ç–∞ –¥–æ–±—Ä–µ. –°–∏–ª–Ω–æ –≤–ø–µ...,2025-05-11
32490,Dobri Madzharov,2025-05-11 21:05:06.682,–°–≤–∏–Ω—Å–∫–æ,2025-05-11
32491,–ò–≤–∞–Ω –ù–µ—Å—Ç–æ—Ä–æ–≤,2025-05-11 20:45:09.722,–•–æ—Ä–∞ —Ç–∞–∑–∏ –≥–æ–¥–∏–Ω–∞ –Ω–∞ –ù–î–ö –õ—É–Ω–∞—Ä –µ –±—Ä—É—Ç–∞–ª–Ω–æ,2025-05-11
32492,Kaloyan Simov,2025-05-11 15:13:09.121,"–ê—Ö–∞, –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ –¥–∞–ª–∏ –¥—Ä—É–≥–∏—Ç–µ —Å–∞ –¥–æ–±—Ä–∏ –∏–ª–∏ —Å–∞ —Å—É...",2025-05-11
32493,Ivailo Yankulov,2025-05-11 14:33:29.644,Aa ne ot nqkude q e vzela te qvno taka gi prav...,2025-05-11
32494,Kaloyan Simov,2025-05-11 10:59:11.478,–•–∞—Ö–∞—Ö–∞—Ö–∞. –ö–æ–π —è –µ –Ω–∞–ø—Ä–∞–≤–∏–ª? –ù—è–∫–æ–π –æ—Ç –≤–∞—à–∏—Ç–µ/—Å–µ...,2025-05-11
32495,Deliya Stoilova,2025-05-10 19:26:46.005,–ú–µ–≥–∞ —Å–ª–∞–¥–∫–∞!,2025-05-10
