In [2]:
import os
import pandas as pd
import glob
import json

In [None]:
# Input locations: the directory scanned for dialog "*.csv" files and the
# directory scanned for dialog metadata "*.json" files.
# The values below are placeholders and must be pointed at real data.
DIALOGS_DATA_PATH = "path/to/dialogs/data"
DIALOGS_META_DATA_PATH = "path/to/dialogs/meta/data"

# Output directory that receives the merged CSV files written by this notebook.
MERGED_DATA_PATH = "path/to/merged/data"

In [None]:
# List the contents of the collector's data directory via the shell.
# NOTE(review): this path is hard-coded and independent of the *_PATH constants
# defined in the configuration cell — confirm it is the directory you expect.
!ls ../telegram-data-collection-master/data

In [13]:
# Collect the input files for both merges.
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# lists are sorted to keep the merge order (and therefore the row order of the
# merged outputs) reproducible across runs and machines.
dialogs_data_files = sorted(glob.glob(f"{DIALOGS_DATA_PATH}/*.csv"))
dialogs_meta_data_files = sorted(glob.glob(f"{DIALOGS_META_DATA_PATH}/*.json"))

In [None]:
# Sanity check of the discovery step: the number of metadata files found,
# followed by the full list of their paths on the next line.
print(
    f"Found {len(dialogs_meta_data_files)} JSON files.",
    dialogs_meta_data_files,
    sep="\n",
)


## Merging all dialog data (CSV files) into one DataFrame

In [None]:
# Merge every dialog CSV into a single DataFrame, tagging each row with the
# dialog it came from.
df_array = []

for d in dialogs_data_files:
    try:
        # dtype=str reads every column as text, so no value is coerced to a number.
        local_df = pd.read_csv(d, dtype=str)
        # The dialog id is the file name without its extension. splitext() strips
        # only the final ".csv", so names containing extra dots stay intact
        # (split(".")[0] would truncate them at the first dot).
        local_df["dialog_id"] = os.path.splitext(os.path.basename(d))[0]
        df_array.append(local_df)
    except Exception as e:
        # Best effort: report the file that could not be loaded and keep
        # merging the remaining ones.
        print(f"Error reading {d}: {e}")

if not df_array:
    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
    # same exception type with a message that says what to check.
    raise ValueError(
        f"No dialog CSV files could be loaded from {DIALOGS_DATA_PATH!r}; "
        "check DIALOGS_DATA_PATH and any read errors printed above."
    )

# ignore_index=True renumbers the rows 0..n-1 across all source files.
df = pd.concat(df_array, ignore_index=True)

print(df.head())

In [None]:
# (rows, columns) of the merged dialog frame.
df.shape

In [None]:

# Make sure the output directory exists; exist_ok=True turns this into a no-op
# when the directory is already there.
os.makedirs(MERGED_DATA_PATH, exist_ok=True)

# Remove the "Unnamed: 0" column when it is present (presumably a saved index
# from the source CSVs — verify); errors="ignore" makes the call a no-op otherwise.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Persist the merged frame without the pandas row index.
df.to_csv(f"{MERGED_DATA_PATH}/dialogs_data_all.csv", index=False)

In [None]:
# (rows, columns) of df again; the preceding export cell drops the
# "Unnamed: 0" column when it exists, so the column count may have changed.
df.shape

In [None]:
# Preview the first 10 rows of the merged dialog frame.
df.head(10)

In [None]:
# Smallest and largest "date" values in the merged frame.
# Missing dates are dropped first: the CSVs are read with dtype=str, so empty
# cells are float NaN, and the builtin min()/max() cannot order NaN against
# strings (TypeError).
# NOTE(review): the values are compared as text, which matches chronological
# order only for ISO-like timestamps — confirm the date format.
date_values = df["date"].dropna()
date_values.min(), date_values.max()

In [None]:
# Number of rows for each distinct value of the "type" column.
df.groupby("type")["type"].count()

In [None]:
# Total "duration" for each value of "type".
# Every column of the merged frame is text (the CSVs are read with dtype=str),
# so summing "duration" directly would concatenate strings instead of adding
# numbers. Convert to numeric first; values that cannot be parsed become NaN
# and are skipped by sum().
pd.to_numeric(df["duration"], errors="coerce").groupby(df["type"]).sum()

## Merging all dialog metadata (JSON files) into one DataFrame

In [None]:
# Flatten every dialog metadata JSON file into one row per (dialog, user) pair
# and export the result as CSV.

# Dialog-level keys in the output, and the per-user keys copied from each entry
# of the "users" list.
dialog_columns = ['dialog_id', 'name', 'type']
user_fields = ['user_id', 'first_name', 'last_name', 'username', 'phone']

df_array = []

print(f"Found {len(dialogs_meta_data_files)} JSON files.")

for d in dialogs_meta_data_files:
    try:
        with open(d, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError) as e:
        # OSError: the file cannot be opened or read.
        # ValueError: malformed JSON (json.JSONDecodeError) or invalid UTF-8.
        # Report the file and keep going so one bad file does not abort the merge.
        print(f"Error reading {d}: {e}")
        continue

    if not isinstance(data, dict):
        # The lookups below need a JSON object at the top level.
        print(f"Error reading {d}: expected a JSON object, got {type(data).__name__}")
        continue

    dialog_row = {
        'dialog_id': data.get('id'),
        'name': data.get('name'),
        'type': data.get('type'),
    }

    # A dialog with no users (missing, null or empty list) still produces one
    # row: the empty placeholder dict makes every user field come out as None.
    users = data.get('users') or [{}]
    for user in users:
        df_array.append({
            **dialog_row,
            **{field: user.get(field) for field in user_fields},
        })

# Passing the columns explicitly keeps the schema stable even when no file could
# be loaded; otherwise the 'type' lookup below fails on an empty frame.
df_expanded = pd.DataFrame(df_array, columns=dialog_columns + user_fields)

# The output directory may not exist yet if the dialog-data export cell was skipped.
os.makedirs(MERGED_DATA_PATH, exist_ok=True)
df_expanded.to_csv(f"{MERGED_DATA_PATH}/dialogs_users_all_expanded.csv", index=False)


print("Final type counts in expanded DataFrame:")
print(df_expanded['type'].value_counts())


In [20]:
# Write the expanded dialog/user frame to dialogs_users_all.csv (no row index).
# NOTE(review): the merge cell already writes the same frame to
# dialogs_users_all_expanded.csv — confirm that both files are needed.
df_expanded.to_csv(f"{MERGED_DATA_PATH}/dialogs_users_all.csv", index=False)

In [None]:
# (rows, columns) of the expanded dialog/user frame.
df_expanded.shape

In [None]:
# Preview the first 10 rows of the expanded dialog/user frame.
df_expanded.head(10)

In [None]:
# Number of rows in the expanded frame for each distinct "type" value.
df_expanded.groupby("type")["type"].count()

In [None]:
# One row per distinct (name, dialog_id) pair.
# df_expanded holds one row per dialog user, so without drop_duplicates() every
# dialog would be listed once per member instead of once overall.
unique_groups = (
    df_expanded[['name', 'dialog_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("Унікальні назви груп та їхні dialog_id:")
print(unique_groups)

# Keep the file name in one place so the confirmation message always names the
# file that was actually written.
unique_groups_file = 'unique_groups4.csv'
unique_groups.to_csv(unique_groups_file, index=False)
print(f"Список унікальних назв груп і їхніх ID збережено у файл '{unique_groups_file}'")