# LionAGI Introduction 2: Data Processing

LionAGI makes handling data efficient and intuitive.

In [35]:
# import libraries
from pathlib import Path
from timeit import default_timer as timer

import pandas as pd
import lionagi as li

In [36]:
# record the starting time so that the total runtime of the notebook
# can be reported in the last cell (`timer` is imported in the imports cell)
start = timer()

### 1. Parameters

In [37]:
# adjust these settings to point at your own copy of the repository
project_name = 'metagpt'
repo_name = 'MetaGPT-main'
required_exts = ['.py']

# sub-folders of the project that will be read
folder_names = [
    "actions", "document_store", "learn", "management", "memory",
    "prompts", "provider", "roles", "tools", "utils", "mains",
]

# <current working directory>/gitrepo/<repo_name>/<project_name>
source_dir = Path.cwd().joinpath('gitrepo', repo_name, project_name)

### 2. Read directories

In [38]:
# build the path of every source folder with l_return,
# a handler for list inputs: here it yields one path per folder name
sources = li.l_return(folder_names, lambda folder: source_dir / folder)

# preview the first 5 paths
sources[:5]

[PosixPath('/Users/lion/Documents/GitHub/gitco/data/gitrepo/MetaGPT-main/metagpt/actions'),
 PosixPath('/Users/lion/Documents/GitHub/gitco/data/gitrepo/MetaGPT-main/metagpt/document_store'),
 PosixPath('/Users/lion/Documents/GitHub/gitco/data/gitrepo/MetaGPT-main/metagpt/learn'),
 PosixPath('/Users/lion/Documents/GitHub/gitco/data/gitrepo/MetaGPT-main/metagpt/management'),
 PosixPath('/Users/lion/Documents/GitHub/gitco/data/gitrepo/MetaGPT-main/metagpt/memory')]

In [39]:
# read the files with the required extensions from the source directories;
# with to_csv=True all files are also saved together in a csv file
files = li.dir_to_files(
    _dir=sources,
    _ext=required_exts,
    to_csv=True,
    _project=project_name,
    timestamp=True,
    verbose=True,
)

116 logs saved to data/logs/sources/2023_10_21_19_18_30_metagpt_sources.csv


In [40]:
# let's take a look at what a single file record looks like
test = files[25]

# show the record's type and keys, then the first 100 characters of its content
print(f"Files are read into {type(test)} type")
print(f"By default files include {test.keys()}\n\n---------------------Sample---------------------")
print(test['content'][:100])

Files are read into <class 'dict'> type
By default files include dict_keys(['project', 'folder', 'file', 'content'])

---------------------Sample---------------------
from dataclasses import dataclass
from typing import List

from qdrant_client import QdrantClient
fr


In [41]:
# basic statistics on the content length (in characters) of the files
lens = li.l_return(files, lambda file: len(file['content']))
min_ = min(lens)
max_ = max(lens)
avg_ = sum(lens) / len(lens)

print(f"Minimum length of content in files is {min_} in characters")
print(f"Maximum length of content in files is {max_} in characters")
print(f"Average length of content in files is {int(avg_)} in characters")

Minimum length of content in files is 122 in characters
Maximum length of content in files is 10383 in characters
Average length of content in files is 2910 in characters


### 3. Split to chunks

In [42]:
# the files are fairly uneven in length, which could cause problems in our
# subsequent analysis, so we standardize them into chunks

# one convenient way to do this is the file_to_chunks function,
# which breaks the files into organized chunks
def f(x):
    """Split the 'content' field of one file record into overlapping chunks."""
    return li.file_to_chunks(
        x,
        field='content',
        chunk_size=1000,   # the main chunk size, in characters
        overlap=0.2,       # total overlap as a fraction of chunk_size, i.e. 10% on each side
        threshold=200,     # the minimum length for the last chunk to be kept separate
    )


chunks = li.l_return(files, f)

# here are some statistics about the chunks, which now look more uniform
lens = li.l_return(li.to_lst(chunks, flat=True), lambda x: len(x["chunk_content"]))
min_, max_, avg_ = min(lens), max(lens), sum(lens)/len(lens)

print(f"Minimum length of content in chunk is {min_} characters")
print(f"Maximum length of content in chunk is {max_} characters")
print(f"Average length of content in chunk is {int(avg_)} characters")

Minimum length of content in chunk is 122 characters
Maximum length of content in chunk is 1398 characters
Average length of content in chunk is 1032 characters


In [43]:
# explain how the actual length of each chunk follows from
# chunk_size, overlap and threshold
print("""
Though the chunk_size is set to be 1000 in this case, the actual chunk_size depends on a number of factors:
- if the file is originally shorter than 1000, we will keep whole file as a chunk
- we will chunk the files by 1000 characters, additionally
    - we add overlap for each chunk with neighbor. For example, if
        - first chunk would have one side of neighbor, it will be 1000 + 1000 * 0.2/2 = 1100
        - second chunk would have two sides of neighbor, it will be 1000 + 1000 * 0.2 = 1200
    - last chunk if longer than threshold, it will be 1000*0.2/2 + remaining length
    - if the remaining length is shorter than threshold, we will merge it with the preceeding chunk
""")


Though the chunk_size is set to be 1000 in this case, the actual chunk_size depends on a number of factors:
- if the file is originally shorter than 1000, we will keep whole file as a chunk
- we will chunk the files by 1000 characters, additionally
    - we add overlap for each chunk with neighbor. For example, if
        - first chunk would have one side of neighbor, it will be 1000 + 1000 * 0.2/2 = 1100
        - second chunk would have two sides of neighbor, it will be 1000 + 1000 * 0.2 = 1200
    - last chunk if longer than threshold, it will be 1000*0.2/2 + remaining length
    - if the remaining length is shorter than threshold, we will merge it with the preceeding chunk



In [44]:
# chunks is a nested list: its first level has one entry per file
n_files = len(chunks)
print(f"There are in total {n_files} files")

# its second level holds the chunks of each file; flattening the list
# with to_lst and flat=True lets us count the chunks of all files together
n_chunks = len(li.to_lst(chunks, flat=True))
print(f"There are in total {n_chunks} chunks")

There are in total 116 files
There are in total 379 chunks


In [45]:
# you can check the structure of chunks by converting it to a dataframe:
# each row is a file and each cell holds one chunk (a dict) of that file;
# rows of files with fewer chunks are left empty in the remaining columns

pd.DataFrame(chunks).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,"{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...",,,,,,,,
1,"{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...",
2,"{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...",,,,,,,,
3,"{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...",,,,,,,,,
4,"{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...","{'project': 'metagpt', 'folder': 'actions', 'f...",,


In [46]:
# you can also save the chunks into a csv file;
# the nested list is flattened first so that all chunks are passed as one flat list
li.to_csv(li.to_lst(chunks, flat=True), "metagpt_chunks.csv")

### 4. Aggregate into bins

In [47]:
# chunks can also be put into bins for aggregation;
# start by loading the chunk data we saved earlier into a dataframe,
# discarding the 'Unnamed: 0' column (presumably the saved row index)

df = pd.read_csv("metagpt_chunks.csv").drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,project,folder,file,chunk_overlap,chunk_threshold,file_chunks,chunk_id,chunk_size,chunk_content
0,metagpt,actions,write_code.py,0.2,200,4,1,1100,#!/usr/bin/env python\n# -*- coding: utf-8 -*-...
1,metagpt,actions,write_code.py,0.2,200,4,2,1200,"I. IF NO API, IMPLEMENT IT.\n2. Requirement: B..."
2,metagpt,actions,write_code.py,0.2,200,4,3,1338,"return any(i in filename for i in [""mp3"", ""w..."
3,metagpt,actions,research.py,0.2,200,11,1,1100,#!/usr/bin/env python\n\nfrom __future__ impor...
4,metagpt,actions,research.py,0.2,200,11,2,1200,rovide up to 2 necessary keywords related to y...


In [48]:
# Let's say you conducted some llm analysis or a similar data transformation on the chunks
# and now you want to put the results into groups (bins) within a certain range of length

inputs = li.to_lst(df.chunk_content, dropna=True, flat=True)
print(f"There are in total {len(inputs)} inputs")

There are in total 379 inputs


In [49]:
# we can set an upper limit on how long the content of each bin can be;
# the smaller the limit, the more bins are needed
for upper_limit in (8000, 4000, 2000):
    bins = li.get_bins(inputs, upper=upper_limit)
    print(f"There are in total {len(bins)} bins")

There are in total 53 bins
There are in total 113 bins
There are in total 303 bins


In [50]:
# time elapsed (in seconds) since `start` was recorded
elapse = timer() - start

In [51]:
# report the total runtime with three decimal places
print(f"Total runtime: {elapse:.03f} seconds")

Total runtime: 0.070 seconds
