In [None]:
import json
import pandas as pd
import os

## Data Loading

In [10]:
# Directory containing the raw JSON export files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
folder_path = '/home/ivan/code/kostovI/facebook_analyser/raw_data'

# os.listdir() returns entries in arbitrary order, so sort them: this keeps the
# order of `messages_raw`, and the export left bound to `message_dict` after the
# loop (read later for the chat title and participants), stable between runs.
files = sorted(
    f for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f))
)

# Messages from every export file, concatenated in file order.
messages_raw = []

for file in files:
    with open(os.path.join(folder_path, file), encoding='utf-8') as f:
        message_dict = json.load(f)
    messages_raw.extend(message_dict['messages'])

In [11]:
def cyrillic_decoder(bad_decoding):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1 (mojibake).

    The string is turned back into its original bytes by encoding it as
    Latin-1, and those bytes are then decoded as UTF-8, which restores the
    Cyrillic characters.

    Args:
        bad_decoding: The string to repair.

    Returns:
        The repaired string. If the string cannot be round-tripped (it contains
        characters outside Latin-1, i.e. it is already correctly decoded, or
        its Latin-1 bytes are not valid UTF-8), it is returned unchanged
        instead of raising.
    """
    try:
        return bad_decoding.encode('latin1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError: the text is
        # not mojibake of the expected kind, so leave it as it is.
        return bad_decoding

In [12]:
# Chat-level metadata, read from the export currently bound to `message_dict`.
name_chat = cyrillic_decoder(message_dict['title'])
members = list(
    map(
        lambda participant: cyrillic_decoder(participant['name']),
        message_dict['participants'],
    )
)
messages_bad = message_dict['messages']

## Preprocessing 

### 1. Decode badly encoded characters to Cyrillic
### 2. Identify the language of each text (English, Bulgarian, 6liokavica). 6liokavica will be identified as Bulgarian with a low score
### 3. Convert 6liokavica to Cyrillic (transliteration from Latin script to Cyrillic)

In [24]:
# Keep only the raw messages that have a 'content' field and repair the
# mis-encoded sender name and text of each one.
#
# Membership is tested with `in` rather than a sentinel default, so a message
# whose text happens to equal the sentinel is not dropped. A comprehension is
# used so that no loop variable rebinds `message_dict`, which earlier cells use
# for the loaded export (title / participants).
messages = [
    {
        'sender_name': cyrillic_decoder(message['sender_name']),
        'timestamp_ms': message['timestamp_ms'],
        'content': cyrillic_decoder(message['content']),
    }
    for message in messages_raw
    if 'content' in message
]


In [25]:
# Build the message table in one chain:
#   1. convert the millisecond epoch values in 'timestamp_ms' to datetimes,
#   2. derive a calendar 'date' column from them,
#   3. order the rows newest-first.
messages_df = (
    pd.DataFrame.from_dict(messages)
    .assign(timestamp_ms=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'))
    .assign(date=lambda frame: frame['timestamp_ms'].dt.date)
    .sort_values(by='timestamp_ms', ascending=False)
)

In [None]:
# Preview the first 20 rows; the frame is sorted by timestamp in descending
# order, so these are the most recent messages.
messages_df.head(20)