## Model 1

In [7]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from tools.to_read import *
from tools.to_do import *
from tools.to_plot import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Platform types and index categories
platforms = ['reddit', 'usenet', 'voat', 'gab', 'facebook', 'twitter']

# Create an empty list to store data for each platform
all_data = []

# Iterate over the different platform types and index types
for platform in tqdm(platforms):
    # Load and preprocess data for the platform
    df = read_and_rename(platform, root)
    df.dropna(subset=['user_id', 'post_id'], inplace=True)
    
    # Calcolare il numero di utenti per ogni post
    user_count_per_post = df.groupby('post_id')['user_id'].nunique().reset_index()
    user_count_per_post.columns = ['post_id', 'user_count']

    # Aggiungere la colonna 'user_count' al DataFrame originale
    df = df.merge(user_count_per_post, on='post_id', how='left')

    # Identificare le coppie di utenti e post duplicati e rinominare 'reentry' in 'phi'
    df['1-alpha'] = df.duplicated(subset=['user_id', 'post_id'], keep=False)

    # Contare quante volte si ripete ciascuna coppia di 'user_id' e 'post_id'
    df['K'] = df.groupby(['user_id', 'post_id'])['user_id'].transform('count')

    # Spezzare 'user_count' in categorie (0, 30, 90, 150)
    bin_start = 10
    bin_end = 500

    bins = np.logspace(np.log10(bin_start), np.log10(bin_end), num=13)
    labels = [f'{int(bins[i]):,}-{int(bins[i+1]):,}' for i in range(len(bins)-1)]
    df['user_count_binned'] = pd.cut(df['user_count'], bins=bins, labels=labels, right=False)

    # Selezionare un campione casuale di 100.000 righe per piattaforma
    platform_data = df[['user_count_binned', '1-alpha', 'K']]  # random_state per riproducibilità
    platform_data['platform']= platform  
    all_data.append(platform_data)


# Concatenare tutti i dati in un unico DataFrame
final_data = pd.concat(all_data, ignore_index=True)

# Esportare i dati finali in un file CSV
final_data.to_csv(root + 'src/output/model_crowd/k_vs_crowd.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  platform_data['platform']= platform
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  platform_data['platform']= platform
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  platform_data['platform']= platform
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

... continue on r ...

In [None]:
# Carica il pacchetto necessario
library(dplyr)

# Leggi i dati dal file CSV
file_path <- "/home/jacoponudo/Documents/from_niche_to_mainstream/src/output/model_crowd/k_vs_crowd.csv"
data <- read.csv(file_path)

data$user_count_binned=fill_na(data$user_count_binned, "0-10")
# Installare il pacchetto pscl se non è già installato
if (!require(pscl)) {
  install.packages("pscl")
}

# Caricare il pacchetto pscl
library(pscl)

# Prepara la colonna Y
data$Y = data$k - 1

# Per ogni piattaforma, crea un modello ZIP e stampa il summary
platforms <- unique(data$platform)
data$user_count_binned <- factor(data$user_count_binned, 
                                 levels = c("0-30", "30-90", "90-150", "150+"))  # Cambia questi livelli se necessario
for (platform in platforms) {
  platform_data <- data[data$platform == platform, ]
  print(mean(platform_data$k))
  
  # Creare il modello ZIP per ogni piattaforma
  model_zip <- zeroinfl(Y ~ user_count_binned, data = platform_data, dist = "poisson")
  
  # Stampa il riassunto del modello
  cat("\nSummary for platform:", platform, "\n")
  print(summary(model_zip))
}



## Model 2

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm
from tools.to_read import *
from tools.to_do import *
from tools.to_plot import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import numpy as np


# Platform types and index categories
platforms = ['reddit', 'usenet', 'voat', 'gab', 'facebook', 'twitter']

# Parameters for data filtering and processing
ignore_under = 50  # Minimum outreach threshold to avoid U-shaped trends
time_window = 12  # Time window for smoothing the time series (weeks)
correction = 10  # Maximum value of interaction count for corrections

# Create an empty list to store data for each platform
all_data = []

# Iterate over the different platform types and index types
for platform in tqdm(platforms):
        # Load and preprocess data for the platform
        df = read_and_rename(platform, root)
        df['timestamp'] = pd.to_datetime(df['timestamp'])  # Convert timestamp to datetime
        df['week'] = df['timestamp'].dt.to_period('W')  # Extract the week from timestamp

        # Group by 'page_id' and 'week' to count unique users
        weekly_unique_users = df.groupby(['page_id', 'week'])['user_id'].nunique().reset_index()
        weekly_unique_users.rename(columns={'user_id': 'unique_users_count'}, inplace=True)
        weekly_unique_users = weekly_unique_users.sort_values(by=['page_id', 'week'])
        
        # Apply a moving average for smoothing
        weekly_unique_users['smoothed_users_count'] = (
            weekly_unique_users.groupby('page_id')['unique_users_count']
            .rolling(window=time_window, min_periods=1)
            .mean().reset_index(level=0, drop=True)
        )
        # Create a new 'reentry' column to flag duplicates for each user_id and post_id combination
        df['reentry'] = df.duplicated(subset=['user_id', 'post_id'], keep=False).astype(int)

        # Now drop duplicates based on 'user_id' and 'post_id'
        df_no_duplicates = df.drop_duplicates(subset=['user_id', 'post_id'])
        merged_data = pd.merge(weekly_unique_users, df_no_duplicates, on=['page_id', 'week'], how='right')
        X = (merged_data[['smoothed_users_count']])
        y = merged_data['reentry']

        # Aggiungi un termine di intercetta per il modello
        X = sm.add_constant(X)

        model = sm.Logit(y, X)

        # Allenare il modello
        result_model = model.fit()

        # Mostra il summary del modello
        print(f"Platform: {platform}")
        print(result_model.summary())



        # Aggiungi una colonna 'platform' per ogni piattaforma
        merged_data['platform'] = platform
        
        # Aggiungi i dati della piattaforma corrente alla lista
        all_data.append(merged_data)

import pandas as pd

# Numero di righe da prendere da ogni data
num_rows = 1220136

# Lista per memorizzare i campioni
sampled_data = []

for data in all_data:
    # Se il dataset ha più righe del numero desiderato, estrai un campione casuale
    if len(data) > num_rows:
        sampled_data.append(data.sample(n=num_rows, random_state=42))
    else:
        # Altrimenti, prendi tutte le righe
        sampled_data.append(data)

# Numero di righe da prendere da ogni data
num_rows = 1220136

# Lista per memorizzare i campioni
sampled_data = []

for data in all_data:
    # Se il dataset ha più righe del numero desiderato, estrai un campione casuale
    if len(data) > num_rows:
        sampled_data.append(data.sample(n=num_rows, random_state=42))
    else:
        # Altrimenti, prendi tutte le righe
        sampled_data.append(data)

# Concatenare tutti i campioni
final_data = pd.concat(sampled_data, ignore_index=True)

final_data.to_csv(root + 'src/output/model_2/reentry_vs_outreach.csv', index=False)

  df['week'] = df['timestamp'].dt.to_period('W')  # Extract the week from timestamp
  df['week'] = df['timestamp'].dt.to_period('W')  # Extract the week from timestamp
  df['week'] = df['timestamp'].dt.to_period('W')  # Extract the week from timestamp
  df['week'] = df['timestamp'].dt.to_period('W')  # Extract the week from timestamp
  df['week'] = df['timestamp'].dt.to_period('W')  # Extract the week from timestamp
100%|██████████| 6/6 [05:10<00:00, 51.69s/it]


KeyboardInterrupt: 

continue on model2.r


In [None]:
# Carica il pacchetto necessario
library(dplyr)

# Leggi i dati dal file CSV
file_path <- "/home/jacoponudo/Documents/from_niche_to_mainstream/src/output/model_outreach/reentry_vs_outreach.csv"
data <- read.csv(file_path)

# Crea un campione casuale del 10% dei dati (se necessario)
set.seed(42)  # Imposta il seme per la riproducibilità
sampled_data <- sample_frac(data, 1)  # Rimuovi questa linea se vuoi usare l'intero dataset

# Visualizza le prime righe del campione per verificarne il contenuto
head(sampled_data)

# Assicurati che 'reentry' sia una variabile bina
sampled_data$reentry <- as.factor(sampled_data$reentry)

# Crea il modello di regressione logistica
log_model <- glm(reentry ~ platform *smoothed_users_count, 
                 data = sampled_data, 
                 family = binomial())

# Sommario del modello
summary(log_model)
library(ggplot2)

# Calcola gli odds ratio
odds_ratios <- exp(coef(log_model))

# Crea un data frame per la visualizzazione
odds_ratios_df <- data.frame(
  Variable = names(odds_ratios),
  Odds_Ratio = odds_ratios
)

# Plot dei coefficienti (Odds Ratio)
ggplot(odds_ratios_df, aes(x = reorder(Variable, Odds_Ratio), y = Odds_Ratio)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Ruota il grafico per visualizzare meglio i nomi delle variabili
  labs(title = "Odds Ratios dei Coefficienti del Modello",
       x = "Variabile",
       y = "Odds Ratio") +
  theme_minimal()


# Suddividi il dataset per piattaforma
platforms <- unique(sampled_data$platform)

# Crea una lista per memorizzare i risultati
results_list <- list()

# Applica il modello di regressione logistica per ogni piattaforma
for (platform in platforms) {
  # Filtro il dataset per la piattaforma corrente
  platform_data <- sampled_data[sampled_data$platform == platform, ]
  
  # Crea il modello di regressione logistica
  log_model <- glm(reentry ~ smoothed_users_count, 
                   data = platform_data, 
                   family = binomial())
  
  # Estrai i risultati del modello
  model_summary <- summary(log_model)
  coefficients <- model_summary$coefficients
  p_values <- coefficients[, 4]  # Estrai i p-value
  
  # Estrai altre informazioni utili
  deviance <- log_model$deviance
  aic <- log_model$aic
  z_values <- coefficients[, 3]  # Z-values
  
  # Salva i risultati in un data frame
  results <- data.frame(
    platform = platform,
    coefficient = coefficients[, 1],
    std_error = coefficients[, 2],
    z_value = z_values,
    p_value = p_values,
    deviance = deviance,
    aic = aic
  )
  
  # Aggiungi i risultati alla lista
  results_list[[platform]] <- results
}

# Combina tutti i risultati in un unico data frame
final_results <- do.call(rbind, results_list)

# Visualizza la tabella finale con tutte le informazioni
print(final_results)
