In [6]:
import pandas as pd

# Load the per-contributor summary CSVs for both groups (LAPPIS first, then EPS/MDS).
lappis_data, eps_data = map(
    pd.read_csv,
    (
        '../extract_contributions/contributors_summary_lappis.csv',
        '../extract_contributions/contributors_summary_eps_mds.csv',
    ),
)

def remove_outliers(df, column, factor=1.5):
    """Drop rows whose ``column`` value lies above the upper IQR fence.

    The fence is ``Q3 + factor * IQR`` where ``IQR = Q3 - Q1`` of ``column``.
    Only the upper tail is trimmed: rows with unusually low values are kept.
    Rows where ``column`` is NaN are dropped as well, because the ``<=``
    comparison evaluates to False for them.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the numeric column used to detect outliers.
        factor: Multiplier applied to the IQR to position the fence.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        A new DataFrame (an independent copy) holding the rows at or below
        the fence, with the original index labels preserved.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    upper_bound = q3 + factor * (q3 - q1)
    # Return an explicit copy so later column assignments on the result never
    # raise SettingWithCopyWarning or write back into the caller's frame.
    return df.loc[values <= upper_bound].copy()

# Trim the high-contribution outliers from each group using the same column.
lappis_cleaned, eps_cleaned = (
    remove_outliers(frame, 'contributions') for frame in (lappis_data, eps_data)
)

# Cell output: both cleaned frames as a tuple.
(lappis_cleaned, eps_cleaned)


(                  user  contributions  repositories  primary_language  \
 0                 User              1            11                 C   
 1          davidCarlos             22            24            Python   
 2        StrangeUnit28            278            15            Python   
 3          MaiconMares            441            41        JavaScript   
 4   PauloGoncalvesLima            202            26            Python   
 ..                 ...            ...           ...               ...   
 74          RochaCarla             71            36              HTML   
 75          laurapinos             36            21            Python   
 76       ArturVinicius             44            26            Python   
 77             edudsan             37            17              HTML   
 78        leomichalski            250            64  Jupyter Notebook   
 
                                 monthly_contributions  \
 0   {'2023-06': 0, '2023-07': 0, '2023-08': 0, '20.

In [12]:
import pandas as pd
from sklearn.utils import resample

# Assumes lappis_cleaned and eps_cleaned were already produced by an earlier cell.

# lappis_cleaned is the smaller frame, so its row count is the target sample size.
lappis_size = len(lappis_cleaned)

# Randomly down-sample eps_cleaned so both groups have the same number of rows.
eps_balanced = resample(
    eps_cleaned,
    n_samples=lappis_size,  # match the size of lappis_cleaned
    replace=False,          # draw without replacement
    random_state=42,        # fixed seed for reproducibility
)

# Report the group sizes before and after balancing.
print(f"Tamanho original do EPS Cleaned: {len(eps_cleaned)}")
print(f"Tamanho original do LAPPIS Cleaned: {lappis_size}")
print(f"Tamanho do EPS Balanceado: {len(eps_balanced)}")

# Cell output: the LAPPIS frame.
lappis_cleaned


Tamanho original do EPS Cleaned: 453
Tamanho original do LAPPIS Cleaned: 66
Tamanho do EPS Balanceado: 66


Unnamed: 0,user,contributions,repositories,primary_language,monthly_contributions,contribution_types
0,User,1,11,C,"{'2023-06': 0, '2023-07': 0, '2023-08': 0, '20...","{'commits': 1, 'pull_requests': 0, 'issues': 0..."
1,davidCarlos,22,24,Python,"{'2023-06': 0, '2023-07': 0, '2023-08': 0, '20...","{'commits': 22, 'pull_requests': 0, 'issues': ..."
2,StrangeUnit28,278,15,Python,"{'2023-06': 105, '2023-07': 57, '2023-08': 1, ...","{'commits': 207, 'pull_requests': 20, 'issues'..."
3,MaiconMares,441,41,JavaScript,"{'2023-06': 34, '2023-07': 74, '2023-08': 30, ...","{'commits': 106, 'pull_requests': 16, 'issues'..."
4,PauloGoncalvesLima,202,26,Python,"{'2023-06': 39, '2023-07': 34, '2023-08': 4, '...","{'commits': 79, 'pull_requests': 2, 'issues': ..."
...,...,...,...,...,...,...
74,RochaCarla,71,36,HTML,"{'2023-06': 3, '2023-07': 0, '2023-08': 3, '20...","{'commits': 40, 'pull_requests': 1, 'issues': ..."
75,laurapinos,36,21,Python,"{'2023-06': 0, '2023-07': 1, '2023-08': 0, '20...","{'commits': 29, 'pull_requests': 4, 'issues': ..."
76,ArturVinicius,44,26,Python,"{'2023-06': 0, '2023-07': 0, '2023-08': 2, '20...","{'commits': 37, 'pull_requests': 4, 'issues': ..."
77,edudsan,37,17,HTML,"{'2023-06': 21, '2023-07': 1, '2023-08': 0, '2...","{'commits': 34, 'pull_requests': 2, 'issues': ..."


In [10]:
# Summary statistics for each cleaned group (EPS here is the full cleaned
# frame, not the down-sampled one).
lappis_summary, eps_summary = (
    frame.describe() for frame in (lappis_cleaned, eps_cleaned)
)

display(lappis_summary, eps_summary)

Unnamed: 0,contributions,repositories
count,66.0,66.0
mean,148.984848,30.136364
std,140.143936,24.602618
min,0.0,0.0
25%,31.5,11.25
50%,108.0,25.5
75%,229.75,38.5
max,633.0,111.0


Unnamed: 0,contributions,repositories
count,453.0,453.0
mean,200.200883,26.275938
std,184.379699,19.215851
min,0.0,0.0
25%,62.0,11.0
50%,149.0,23.0
75%,275.0,36.0
max,783.0,138.0


In [11]:
import pandas as pd

# Função para calcular as estatísticas e criar um DataFrame com os resultados
def calculate_statistics(df, group_name):
    """Build a two-column table of summary statistics for one group.

    Args:
        df: DataFrame with 'contributions' and 'repositories' columns.
        group_name: Label used as the name of the value column.

    Returns:
        A DataFrame with a 'Metric' column (row labels) and a ``group_name``
        column holding the row count followed by the mean, standard
        deviation, median and maximum of contributions, then the same four
        statistics for repositories.
    """
    contributions = df['contributions']
    repositories = df['repositories']

    metric_labels = [
        'Number of Students',
        'Average Contributions',
        'Standard Deviation of Contributions',
        'Median Contributions',
        'Maximum Contributions',
        'Average Number of Repositories',
        'Standard Deviation of Repositories',
        'Median Number of Repositories',
        'Maximum Number of Repositories',
    ]

    # Values are listed in the same order as metric_labels.
    metric_values = [
        len(df),
        contributions.mean(),
        contributions.std(),
        contributions.median(),
        contributions.max(),
        repositories.mean(),
        repositories.std(),
        repositories.median(),
        repositories.max(),
    ]

    return pd.DataFrame({'Metric': metric_labels, group_name: metric_values})

# NOTE(review): the LAPPIS frame is labelled 'Regular Students' and the EPS
# frame 'Laboratory Students' — confirm these labels are not swapped.
lappis_stats, eps_stats = (
    calculate_statistics(lappis_cleaned, 'Regular Students'),
    calculate_statistics(eps_balanced, 'Laboratory Students'),
)

# Put both groups side by side, aligned on the shared 'Metric' column.
final_stats = lappis_stats.merge(eps_stats, on='Metric')

print(final_stats)


                                Metric  Regular Students  Laboratory Students
0                   Number of Students         66.000000            66.000000
1                Average Contributions        148.984848           149.272727
2  Standard Deviation of Contributions        140.143936           138.138007
3                 Median Contributions        108.000000           114.000000
4                Maximum Contributions        633.000000           518.000000
5       Average Number of Repositories         30.136364            22.621212
6   Standard Deviation of Repositories         24.602618            13.815003
7        Median Number of Repositories         25.500000            20.000000
8       Maximum Number of Repositories        111.000000            59.000000
