#### ALGORITMO - ANÁLISE ESTATÍSTICA:

O objetivo deste algoritmo é realizar análises e transformações de dados obtidos através de raspagem no site BusinessOfApps após processos de ETL.

A diretoria da BusinessOfApps concedeu autorização para este projeto de raspagem de dados/ETL.



In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

In [3]:
def _age_bounds(label):
    """Return ``(start, end)`` parsed from an age label such as ``'18-24'`` or ``'65+'``."""
    if '+' in label:
        # Open-ended bucket: no upper bound is given, so both ends collapse
        # to the lower bound (and the midpoint equals that bound).
        start = int(label.split('+')[0])
        return start, start
    parts = label.split('-')
    return int(parts[0]), int(parts[1])


def _round_to_total(shares, total=100):
    """Round ``shares`` to whole numbers that sum exactly to ``total``.

    Uses the largest-remainder method: every share is floored, then the
    missing units are handed out one by one to the shares with the largest
    fractional parts. ``shares`` is expected to already sum to ``total`` up
    to floating-point error. The result is a float array so the column keeps
    a floating-point dtype.
    """
    shares = np.asarray(shares, dtype=float)
    floors = np.floor(shares)
    shortfall = int(round(total - floors.sum()))
    # A stable sort keeps ties in their original order, so the result is
    # deterministic when two shares have the same fractional part.
    by_remainder = np.argsort(-(shares - floors), kind='stable')
    floors[by_remainder[:shortfall]] += 1
    return floors


def interpolate_data(file_name, num_customers):
    """Resample an age distribution onto seven age buckets and save it as CSV.

    The input CSV must have an ``idade`` column holding age labels of the
    form ``'A-B'`` or ``'A+'`` and a ``percentual`` column holding the share
    of each label in percent. The shares are converted to customer counts,
    linearly interpolated over the label midpoints, converted back to
    percentages, rescaled to a total of 100 and rounded to whole numbers
    that sum to exactly 100.

    The result is written next to the input file as ``Interp_<stem>.csv``
    with the columns ``idade`` and ``percentual``.

    Args:
        file_name: Path of the input CSV (string or ``pathlib.Path``).
        num_customers: Total number of customers used to turn percentages
            into counts. Must not be zero.

    Raises:
        ValueError: If ``num_customers`` is zero or the interpolated
            distribution has no positive total to normalise by.
    """
    if num_customers == 0:
        raise ValueError('num_customers must not be zero')

    df = pd.read_csv(file_name)

    # Labels of the output buckets.
    new_age_ranges = ['13-17', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']

    # Midpoint of every source bucket, used as the x coordinate.
    midpoints = [
        (start + end) / 2
        for start, end in (_age_bounds(label) for label in df['idade'])
    ]

    # Customer count per source bucket (truncated to a whole customer).
    customers_per_range = [
        int(num_customers * share / 100) for share in df['percentual']
    ]

    interpolator = interp1d(midpoints, customers_per_range, kind='linear')
    # NOTE(review): the samples are evenly spaced between the smallest and
    # largest source midpoint; they are not the midpoints of the labels in
    # new_age_ranges, so each label is paired with a sample by position only.
    sample_ages = np.linspace(min(midpoints), max(midpoints), len(new_age_ranges))
    new_customers = interpolator(sample_ages)

    # Back to percentages, kept at two decimals before normalising.
    new_percentages = np.round(new_customers / num_customers * 100, 2)

    total_percentage = new_percentages.sum()
    if not total_percentage > 0:  # also catches NaN
        raise ValueError('interpolated percentages have no positive total')

    # Rescale to 100 and round so that the rounded values still add up to
    # exactly 100 (independent rounding can yield 99 or 101).
    new_percentages = _round_to_total(new_percentages * (100 / total_percentage))

    df_interp = pd.DataFrame({'idade': new_age_ranges, 'percentual': new_percentages})

    # Derive the output name from the file stem only, so directories and
    # additional dots in the input path are preserved correctly.
    source = Path(file_name)
    df_interp.to_csv(source.with_name(f'Interp_{source.stem}.csv'), index=False)
# Run the interpolation once per age-distribution CSV; the second item of
# each pair is passed as ``num_customers``.
for csv_name, total_customers in (
    ('dIdadeInstagram.csv', 2_335_000_000),
    ('dIdadeTikTok.csv', 1_534_000_000),
):
    interpolate_data(csv_name, total_customers)