# Large CDR synthetic record generation

### 1. Create the initial data: 1000M rows

In [29]:
import random
import datetime
import pandas as pd
from google.cloud import bigquery
import concurrent.futures
import multiprocessing
import time

# Destination table and sizing for the synthetic CDR load.
project_id = ""  # project passed to the BigQuery client; fill in before running
dataset_id = "telco"
table_id = "cdr"
num_rows_batch = 100  # rows generated per batch (1000000 for 1M-row batches)
num_cycles = 1  # number of batches to generate and load
cpu_factor = 1  # worker threads per CPU

In [3]:
def generate_wom_cdr_data(num_records, reference_time=None):
    """Build a DataFrame of synthetic call detail records (CDRs) for WOM.

    Every record is a call placed from a WOM mobile number to a randomly
    chosen Chilean area-code or mobile number. Each call starts at a random
    offset (up to 365 days, 23 hours, 59 minutes and 59 seconds) before
    ``reference_time`` and lasts between 10 and 3000 seconds.

    Args:
        num_records: Number of records to generate.
        reference_time: Naive ``datetime.datetime`` that start times are
            offset back from. Defaults to ``datetime.datetime.now()``,
            captured once per call.

    Returns:
        A ``pandas.DataFrame`` with one row per record and the columns
        CALLING_COMPANY, CALLING_NUM, CALLED_COMPANY, CALLED_NUMBER,
        START_TIME, END_TIME, CHARGE and CALL_RESULT. The columns are present
        even when ``num_records`` is 0.
    """
    # Possible values for the CDR fields.
    call_results = ["Answered", "Busy", "No Answer", "Failed"]
    called_companies = ["Entel", "Movistar", "Claro", "WOM"]  # Chilean telcos, WOM included
    chilean_area_codes = [
        "2", "32", "33", "34", "35", "41", "42", "43", "45", "51", "52", "53",
        "55", "57", "58", "61", "63", "64", "65", "67", "71", "72", "73", "75",
    ]
    mobile_prefixes = ["9"]  # Chilean mobile numbers typically start with 9
    columns = [
        "CALLING_COMPANY",
        "CALLING_NUM",
        "CALLED_COMPANY",
        "CALLED_NUMBER",
        "START_TIME",
        "END_TIME",
        "CHARGE",
        "CALL_RESULT",
    ]

    # Capture "now" once so every record in the batch shares one reference.
    if reference_time is None:
        reference_time = datetime.datetime.now()

    records = []
    for _ in range(num_records):
        start_time = reference_time - datetime.timedelta(
            days=random.randint(0, 365),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            # Full 0-59 range; the previous 0-1 range clustered every call in
            # the first two seconds of a minute.
            seconds=random.randint(0, 59),
        )
        call_duration = random.randint(10, 3000)  # Duration in seconds
        end_time = start_time + datetime.timedelta(seconds=call_duration)

        # Even odds between an area-code prefix and a mobile prefix.
        called_prefix = random.choice(
            [random.choice(chilean_area_codes), random.choice(mobile_prefixes)]
        )

        records.append(
            {
                "CALLING_COMPANY": "WOM",
                "CALLING_NUM": f"+56{random.choice(mobile_prefixes)}{random.randint(10000000, 99999999)}",  # WOM mobile
                "CALLED_COMPANY": random.choice(called_companies),
                "CALLED_NUMBER": f"+56{called_prefix}{random.randint(1000000, 99999999)}",
                "START_TIME": start_time,
                "END_TIME": end_time,
                "CHARGE": round(random.uniform(0.05, 2.50), 2),  # Sample charges
                "CALL_RESULT": random.choice(call_results),
            }
        )

    # Passing the column list keeps the schema stable for an empty batch.
    return pd.DataFrame(records, columns=columns)

In [4]:
def insert_into_bigquery(project_id, dataset_id, table_id, df):
    """Append a DataFrame to a BigQuery table and wait for the load to finish.

    Args:
        project_id: Project used to create the BigQuery client.
        dataset_id: Dataset that contains the destination table.
        table_id: Name of the destination table.
        df: ``pandas.DataFrame`` holding the rows to load.

    Returns:
        The number of rows in ``df``.

    Raises:
        Whatever ``job.result()`` raises when the load job fails.
    """
    client = bigquery.Client(project=project_id)

    # Client.dataset() is deprecated, so build the reference explicitly.
    # client.project is the same project Client.dataset() defaulted to.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        # Append to the existing table (alternatives: WRITE_TRUNCATE, WRITE_EMPTY).
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        source_format=bigquery.SourceFormat.CSV,
    )

    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)

    # Block until the load job completes.
    job.result()

    return len(df)

In [7]:
def create_insert_cdr(i):
    """Generate one batch of synthetic CDR rows and load it into BigQuery.

    Args:
        i: Round number, used only in the progress message.
    """
    batch = generate_wom_cdr_data(num_rows_batch)
    print(f"Generated {len(batch)} synthetic rows")

    loaded = insert_into_bigquery(project_id, dataset_id, table_id, batch)
    destination = f"{project_id}.{dataset_id}.{table_id}"
    print(f"******* Round {i}/{num_cycles}: Loaded {loaded} rows into {destination}")

In [8]:
# Wall-clock reference used later to compute execution_time.
start_time = time.time()

# Size the thread pool as the CPU count scaled by cpu_factor.
cpus = multiprocessing.cpu_count()
num_threads = cpus  * cpu_factor
print(f"######### Running using {num_threads} threads for {cpus} cpus")

######### Running using 4 threads for 4 cpus


In [12]:
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # executor.map() only re-raises a worker's exception when its result is
    # retrieved. Draining the iterator makes a failed round raise here
    # instead of being silently dropped.
    list(executor.map(create_insert_cdr, range(1, num_cycles + 1)))

Generated 1000000 synthetic rows
******* Round 1/1000: Loaded 1000000 rows into lgbaeza-apps-customer.telco.cdr


In [14]:
# Elapsed wall-clock seconds since start_time was recorded.
end_time = time.time()
execution_time = end_time - start_time

In [17]:
# Report the configured volume (in millions of rows) and the elapsed minutes.
rows_in_millions = num_cycles * num_rows_batch / 1000000
elapsed_minutes = execution_time / 60
print(f"######### Inserted {rows_in_millions:,} M rows in {elapsed_minutes} minutes")

######### Inserted 100.0 M rows in 1.124746568997701 minutes


### 2. Periodically run a Cloud Function to copy existing rows in BQ

In [58]:
# import functions_framework # Required in Cloud Functions
import datetime
from google.cloud import bigquery
import concurrent.futures
import multiprocessing
import time

# Target table and sizing for the row-copy job; fill in the IDs before running.
project_id = ""
dataset_id = ""
table_id = ""
num_cycles = 10  # number of copy statements to run
rows_to_copy = 10000000  # 10M rows requested per copy statement
cpu_factor = 1  # worker threads per CPU
cpus = multiprocessing.cpu_count()
num_threads = cpus * cpu_factor

In [59]:
def copy_rows(i):
    """Append up to ``rows_to_copy`` existing rows of the table back onto itself.

    Runs a single ``INSERT INTO ... SELECT * ... LIMIT`` statement against
    ``project_id.dataset_id.table_id``, waits for it to finish, and prints the
    number of rows the statement inserted.

    Args:
        i: Cycle index supplied by the caller; not used by the function.
    """
    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Source and destination are the same table.
    query = f"""
        INSERT INTO `{project_id}.{dataset_id}.{table_id}`
        SELECT * FROM `{project_id}.{dataset_id}.{table_id}` LIMIT {rows_to_copy}
    """
    print(query)

    # Start the query
    query_job = client.query(query)

    # Wait for the query to finish; the row iterator it returns is not needed.
    query_job.result()

    print(f"{query_job.num_dml_affected_rows} rows inserted")

### Main method

In [56]:
start_time = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # executor.map() only re-raises a worker's exception when its result is
    # retrieved. Draining the iterator makes a failed copy raise here
    # instead of being silently dropped.
    list(executor.map(copy_rows, range(1, num_cycles + 1)))

# Elapsed wall-clock seconds for all copy cycles.
end_time = time.time()
execution_time = end_time - start_time


        INSERT INTO `lgbaeza-apps-customer.telco.cdr`
        SELECT * FROM `lgbaeza-apps-customer.telco.cdr` LIMIT 100
    
100 rows inserted


In [60]:
# Summarise the requested volume (in millions of rows) and the elapsed minutes.
millions_copied = num_cycles * rows_to_copy / 1000000
elapsed_minutes = execution_time / 60
result = f"######### Copied {millions_copied:,}M rows in BQ in {elapsed_minutes} minutes"
print(result)


######### Copied 100.0M rows in BQ in 0.051648275057474775 minutes


In [None]:
# return result  # Required in Cloud Functions