In [99]:
#!pip install unidecode

In [100]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from collections import Counter

import re
import string
from unidecode import unidecode
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


stop_words = set(stopwords.words('portuguese'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [101]:


def init_liwc_from_dic(file_path):
    liwc = {}
    category_names = {}
    with open(file_path, 'r', encoding='latin1') as file:
        for line in file:
            if line.strip():  # Verificando se a linha não está em branco
                parts = line.strip().split('\t')
                word = parts[0]
                categories = parts[1:]
                liwc[word] = categories
                category_names[word] = categories
    return liwc, category_names


In [102]:
liwc_file_path = '../datasets/LIWC2007_Portugues_win.dic'
#liwc_file_path = '/content/LIWC2007_Portugues_win.dic'

# Inicializando o LIWC
liwc, category_name = init_liwc_from_dic(liwc_file_path)


In [103]:

def preprocess(texto):
    if isinstance(texto, str):
        tokens = []
        for token in word_tokenize(texto.lower(), language='portuguese'):
            if token not in stop_words and token not in punctuation:
                tokens.append(token)
        return tokens
    else:
        return []


In [104]:
def init_liwc_from_dic_category(file_path):
    liwc = {}
    category_names = {}  # Dicionário para mapear códigos de categoria para nomes de categoria
    with open(file_path, 'r', encoding='latin1') as file:
        for line in file:
            if line.strip() and not line.startswith('%'):  # Verifica se a linha não está em branco e não é um comentário
                parts = line.strip().split('\t')
                code = parts[0]
                category = parts[1]
                liwc[code] = category
                category_names[code] = category
    return liwc, category_names


# Inicialize o LIWC a partir do arquivo .dic
liwc_category, category_names = init_liwc_from_dic_category(liwc_file_path)

In [105]:
# Função para contar palavras em cada categoria
def count_words_in_categories(texto, liwc, category_names):
    word_counts = Counter()
    total_words = 0

    tokens = preprocess(texto)
    total_words += len(tokens)

    for token in tokens:
        if token in liwc:
            categories = liwc[token]
            for code in categories:
                category_name = category_names.get(code, 'Unknown')
                word_counts[category_name] += 1

    return word_counts, total_words

In [106]:


def preprocess_text(text):

    # Função para extrair e substituir o domínio da URL
    def substituir_dominios(texto):
        # Função para extrair e substituir o domínio da URL
        def extrair_dominio(url):
            # Remove o protocolo (http://, https://, etc.) e o "www." se presente
            dominio = re.sub(r'^https?://(?:www\.)?|www\.', '', url)
            # Remove o caminho e parâmetros da URL
            dominio = re.split(r'[/?#]', dominio)[0]
            # Retorna a parte principal do domínio (antes do primeiro ponto)
            return dominio.split('.')[0]

        # Substitui URLs por seus domínios principais
        return re.sub(r'https?://(?:www\.)?\S+|www\.\S+', lambda match: extrair_dominio(match.group(0)), texto)

    # Substituir domínios
    text = substituir_dominios(text)

    # Converte para minúsculas
    text = text.lower()

    # Remove acentos
    text = unidecode(text)

    #Remover Pontuação
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove URLs e menções
    #text = re.sub(r'http\S+|www\S+|https\S+|@\w+', '', text)

    # Substitui emojis repetidos por apenas um
    text = re.sub(r'([\U00010000-\U0010FFFF])\1+', r'\1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F700-\U0001F77F]|[\U0001F780-\U0001F7FF]|[\U0001F800-\U0001F8FF]|[\U0001F900-\U0001F9FF]|[\U0001FA00-\U0001FA6F]|[\U0001FA70-\U0001FAFF])\1+', r'\1', text)


    # Remove espaços em branco extras (início ou final) e múltiplos espaços no meio do texto
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove pontuações e caracteres especiais
    #text = re.sub(r'[^\w\s]', '', text)

    # Ajusta risadas "kkk" ou mais para "kk"
    text = re.sub(r'k{2,}|K{2,}', 'kk', text)

    # Ajusta risadas "haha" ou mais para "haha"
    text = re.sub(r'(ha){2,}', 'haha', text, flags=re.IGNORECASE)

    # Ajusta risadas "kaka" ou mais para "kaka"
    text = re.sub(r'(ka){2,}', 'kaka', text, flags=re.IGNORECASE)

    # Remove as stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])


    return text


# Sem filtro

## Telegram

In [None]:
df_telegram = pd.read_csv('../datasets/fakeTelegram.BR_2022.csv')

In [108]:
df_telegram.shape

(557586, 20)

In [109]:

# Removendo linhas com valores NaN na coluna 'pre_processed_text'
df_telegram = df_telegram.dropna(subset=['pre_processed_text'])

# Removendo trava_zap
df_telegram = df_telegram[df_telegram['trava_zap'] == False]


In [110]:
df_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5,pre_processed_text
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,5,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,,então é fato renato o áudio que eu ouvi no wha...
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,,saiu no youtube do presidente a 8 horas atrás ...
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,,é isso nossa parte já foi quase toda feita no ...
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,5,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb,gente achei eles em uma seita maçonárica
5,2022-10-05 06:28:30,,b52442a5fbc459ae590dca0d215e32f9,,,,False,False,False,5,2022-10-05 06:29:29.046694,0.7003,0.197813,2735,Wanda Silva,Texto,telegram,,,kķkkk to rindo até agora quem disse q ia fazer...


In [111]:
df_telegram.shape

(444201, 20)

In [112]:

df_geral_telegram = df_telegram.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_telegram['text_processed'] = df_geral_telegram['pre_processed_text'].apply(preprocess_text)
df_geral_telegram = df_geral_telegram.dropna(subset=['text_processed'])


In [141]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_geral_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 2049804, 'relativ': 1432057, 'verb': 1251141, 'funct': 1121706, 'affect': 1060610, 'social': 831036, 'swear': 792786, 'humans': 732211, 'posemo': 681012, 'achieve': 670401, 'ingest': 655275, 'percept': 647123, 'insight': 646822, 'motion': 642751, 'present': 638142, 'time': 626555, 'cause': 489418, 'inhib': 448949, 'space': 404711, 'bio': 367912, 'tentat': 356985, 'certain': 354995, 'work': 344224, 'negemo': 336322, 'past': 331576, 'quant': 294044, 'money': 280899, 'feel': 268376, 'auxverb': 244005, 'incl': 242954, 'Unknown': 231055, 'leisure': 220126, 'discrep': 209507, 'adverb': 194283, 'see': 173748, 'anger': 169681, 'hear': 156979, 'pronoun': 143409, 'relig': 138782, 'ipron': 133738, 'preps': 127109, 'body': 121016, 'health': 101612, 'excl': 101611, 'sad': 100691, 'future': 95308, 'sexual': 83240, 'friend': 71762, 'death': 66863, 'conj': 54383, 'anx': 45119, 'ppron': 42747, 'home': 41039, 'number': 40579, 'family': 34143, '

In [142]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],  # Substitua 100 pelo total percentual de palavras em cada categoria
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: funct
Statistically significant.
1.7241380536592925e-27
------------------------------------
Testing for: pronoun
Statistically significant.
7.400361284010439e-43
------------------------------------
Testing for: ipron
Statistically significant.
5.008602252695843e-43
------------------------------------
Testing for: cogmech
Statistically significant.
3.62762747679346e-16
------------------------------------
Testing for: certain
Statistically significant.
3.104826348634266e-39
------------------------------------
Testing for: past
Statistically significant.
1.2565733331677177e-39
------------------------------------
Testing for: social
Statistically significant.
1.0940859405918303e-31
------------------------------------
Testing for: insight
Statistically significant.
1.650785100476636e-34
------------------------------------
Testing for: percept
Statistically significant.
1.668793464807156e-34
------------------------------------
Testing for: hear
Statistically significant

## WhatsApp

In [None]:
df_wpp = pd.read_csv('../datasets/fakeWhatsApp.BR_2022.csv')
df_wpp.shape

(598971, 20)

In [116]:

# Removendo linhas com valores NaN na coluna 'pre_processed_text'
df_wpp = df_wpp.dropna(subset=['pre_processed_text'])

# Removendo trava_zap
df_wpp = df_wpp[df_wpp['trava_zap'] == False]

df_wpp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5,pre_processed_text
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,,️ hoje que eu fique sabendo do canal lula fli...
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,5,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,mais pra que isso não aconteça nois temos quê ...
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,‎acesse este link para entrar no meu grupo do ...
7,2022-10-10 16:49:48,325720ed3339a91b8076df12c1e95c45,0e345813dcb62b0fe4d8537f311af0f1,,,https://m.kwai.com/photo/150000006567403/52240...,False,True,False,5,2022-10-10 16:49:50.051126,0.0,,F18BADED5AFA8FB0C33FE36625872DB8,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,fortes palavras da pastora damares kwai-video com
8,2022-10-10 16:49:53,8ed44a70a011285622e2b8919c2c8c3e,3b3e64e81c3e3f3bfc5ba76ddb90fb2d,0d6f2896e5941ef1933e69bbd7a32f69.jpeg,image/jpeg,,True,False,False,5,2022-10-10 16:49:54.713831,0.0,,724C233591C3A62A5231B47FF22ADE4B,558594228826.0:12@s.whatsapp.net,Imagem,whatsapp,,0d6f2896e5941ef1933e69bbd7a32f69,


In [117]:
df_wpp.shape

(255589, 20)

In [118]:

df_geral_whatsapp = df_wpp.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_whatsapp['text_processed'] = df_geral_whatsapp['pre_processed_text'].apply(preprocess_text)
df_geral_whatsapp = df_geral_whatsapp.dropna(subset=['text_processed'])


In [143]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_geral_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 1351114, 'relativ': 974090, 'verb': 865812, 'funct': 727493, 'affect': 705364, 'social': 584032, 'swear': 548899, 'humans': 513934, 'percept': 469082, 'ingest': 458834, 'present': 447577, 'motion': 445721, 'insight': 438154, 'posemo': 431618, 'achieve': 415300, 'time': 406203, 'cause': 315265, 'inhib': 303082, 'space': 271768, 'bio': 266628, 'work': 257695, 'negemo': 239890, 'tentat': 230565, 'certain': 222519, 'past': 216804, 'feel': 192035, 'quant': 186954, 'leisure': 184924, 'Unknown': 183411, 'incl': 170766, 'auxverb': 163860, 'money': 162996, 'discrep': 139780, 'see': 138772, 'anger': 121306, 'hear': 116172, 'adverb': 107474, 'pronoun': 97135, 'relig': 95201, 'ipron': 90340, 'body': 89283, 'health': 75708, 'preps': 71954, 'excl': 71856, 'sad': 71600, 'future': 57616, 'sexual': 56899, 'friend': 52753, 'death': 49679, 'conj': 38042, 'home': 33697, 'number': 33285, 'family': 31746, 'anx': 31357, 'ppron': 28595, 'we': 21588, 

In [144]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: relativ
Statistically significant.
4.0770150892627195e-26
------------------------------------
Testing for: time
Statistically significant.
1.0796485764628843e-36
------------------------------------
Testing for: present
Statistically significant.
7.362805452701915e-36
------------------------------------
Testing for: cogmech
Statistically significant.
3.9915796692040566e-20
------------------------------------
Testing for: inhib
Statistically significant.
8.16526329924539e-39
------------------------------------
Testing for: incl
Statistically significant.
1.257946596286711e-41
------------------------------------
Testing for: bio
Statistically significant.
1.4037963879174944e-39
------------------------------------
Testing for: verb
Statistically significant.
5.468043596151101e-28
------------------------------------
Testing for: sexual
Statistically significant.
3.9635440776118733e-44
------------------------------------
Testing for: social
Statistically significant.
3.

## Comparação

In [145]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: funct (Telegram: 11.592452596196328, Whatsapp: 9.430502362185331, Diff: 2.1619502340109964)
Statistically significant.
p-value: 0.0
Difference: 2.1619502340109964
------------------------------------
Testing for: pronoun (Telegram: 1.4820835712458693, Whatsapp: 1.2591624207392678, Diff: 0.22292115050660155)
Statistically significant.
p-value: 0.0
Difference: 0.22292115050660155
------------------------------------
Testing for: ipron (Telegram: 1.3821370531227473, Whatsapp: 1.1710787367023776, Diff: 0.21105831642036965)
Statistically significant.
p-value: 0.0
Difference: 0.21105831642036965
------------------------------------
Testing for: cogmech (Telegram: 21.18403191343687, Whatsapp: 17.514510474439852, Diff: 3.669521438997016)
Statistically significant.
p-value: 0.0
Difference: 3.669521438997016
------------------------------------
Testing for: certain (Telegram: 3.668753407208944, Whatsapp: 2.884517040206734, Diff: 0.7842363670022099)
Statistically significant.
p-value

### Intervalo de confiança para a diferença entre proporções

In [None]:
from statsmodels.stats.proportion import proportions_ztest, confint_proportions_2indep

# comparando apenas uma única categoria por vez
# Exemplo: "quantas vezes a categoria funct aparece no Telegram" vs "quantas vezes aparece no WhatsApp".
count = np.array([total_telegram, total_whatsapp])
nobs = np.array([total_word_count_telegram, total_word_count_whatsapp])

z_stat, p_value = proportions_ztest(count, nobs)
ci_low, ci_upp = confint_proportions_2indep(
    count1=count[0], nobs1=nobs[0],
    count2=count[1], nobs2=nobs[1],
    method='wald'
)

print(f"Z = {z_stat:.4f}, p = {p_value:.4f}")
print(f"IC 95%: ({ci_low:.4f}, {ci_upp:.4f})")

if p_value < 0.05:
    print(">> Diferença estatisticamente significativa.")
else:
    print(">> Diferença **não** significativa.")


Z = -0.1614, p = 0.8718
IC 95%: (-0.0000, 0.0000)
>> Diferença **não** significativa.


# Filtro religioso

## Telegram

In [123]:
palavras_religiosas = [
    "deus", "jesus", "misericordia", "davi",
    "salomao", "reino", "templo", "conservador",
    "pentecostal", "rcc", "renovacao", "carismatic",
    "paulo ricardo", "bernardo kuster", "herege", "ateu",
    "jerico", "heresia"
]


In [124]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_telegram = df_geral_telegram[df_geral_telegram['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [125]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_religiao_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 418801, 'relativ': 270637, 'verb': 248102, 'funct': 240143, 'affect': 229671, 'swear': 182732, 'social': 180119, 'humans': 160554, 'posemo': 146478, 'percept': 139035, 'insight': 134400, 'achieve': 127033, 'ingest': 124888, 'time': 122273, 'present': 120278, 'motion': 117143, 'cause': 96766, 'inhib': 94355, 'tentat': 79459, 'certain': 78036, 'space': 76411, 'bio': 74121, 'past': 73453, 'quant': 72721, 'negemo': 71858, 'work': 66735, 'relig': 64334, 'feel': 58928, 'incl': 51797, 'auxverb': 48409, 'discrep': 45202, 'money': 42491, 'adverb': 40377, 'anger': 36749, 'pronoun': 35996, 'leisure': 35926, 'hear': 34363, 'see': 33783, 'ipron': 32327, 'Unknown': 30674, 'health': 22526, 'sad': 22331, 'preps': 22257, 'friend': 20933, 'body': 19551, 'excl': 18822, 'future': 17896, 'sexual': 17199, 'death': 16092, 'conj': 13638, 'anx': 11570, 'family': 9531, 'ppron': 9446, 'number': 7529, 'negate': 7505, 'home': 6976, 'nonfl': 6433, 'we': 56

In [126]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: funct
Statistically significant.
4.9956297737472374e-24
------------------------------------
Testing for: present
Statistically significant.
8.024340163230831e-34
------------------------------------
Testing for: quant
Statistically significant.
3.4236302315349527e-38
------------------------------------
Testing for: swear
Statistically significant.
1.6801342919863447e-28
------------------------------------
Testing for: social
Statistically significant.
1.0285502848426017e-28
------------------------------------
Testing for: affect
Statistically significant.
8.174965970528418e-25
------------------------------------
Testing for: posemo
Statistically significant.
1.564330312787003e-31
------------------------------------
Testing for: negemo
Statistically significant.
2.8355971328937067e-38
------------------------------------
Testing for: anger
Statistically significant.
1.1122610577461148e-41
------------------------------------
Testing for: verb
Statistically significant

## WhatsApp

In [127]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [128]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_religiao_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 383917, 'relativ': 278809, 'verb': 236790, 'funct': 222912, 'affect': 209609, 'social': 165601, 'swear': 159170, 'humans': 150248, 'posemo': 130663, 'percept': 129155, 'ingest': 128821, 'time': 125574, 'achieve': 120421, 'motion': 118413, 'present': 114076, 'insight': 113963, 'cause': 94357, 'inhib': 84786, 'space': 83464, 'bio': 73894, 'certain': 70129, 'negemo': 69543, 'tentat': 68280, 'past': 67218, 'work': 62778, 'quant': 57530, 'Unknown': 55246, 'feel': 53198, 'relig': 50927, 'incl': 47847, 'money': 46920, 'auxverb': 46067, 'discrep': 39351, 'leisure': 38679, 'anger': 36527, 'pronoun': 33156, 'adverb': 33109, 'see': 32217, 'hear': 30778, 'ipron': 29890, 'preps': 24991, 'health': 23691, 'sad': 22709, 'body': 21181, 'excl': 19096, 'sexual': 17879, 'friend': 17682, 'future': 17535, 'death': 16631, 'conj': 12926, 'family': 11642, 'number': 10471, 'anx': 9083, 'home': 8540, 'ppron': 8235, 'negate': 5934, 'we': 4943, 'assent': 

In [129]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: time
Statistically significant.
2.0409299466436287e-34
------------------------------------
Testing for: funct
Statistically significant.
5.596184300417009e-27
------------------------------------
Testing for: past
Statistically significant.
2.5012300986063462e-39
------------------------------------
Testing for: social
Statistically significant.
3.038418631358738e-31
------------------------------------
Testing for: affect
Statistically significant.
6.123017681788203e-28
------------------------------------
Testing for: negemo
Statistically significant.
3.984141666542258e-39
------------------------------------
Testing for: anger
Statistically significant.
4.772663409233093e-42
------------------------------------
Testing for: verb
Statistically significant.
5.390132524335926e-26
------------------------------------
Testing for: ingest
Statistically significant.
3.742438320432624e-34
------------------------------------
Testing for: relativ
Statistically significant.
3.92

## Comparação

In [130]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: funct (Telegram: 14.256251947936299, Whatsapp: 11.974331509436597, Diff: 2.281920438499702)
Statistically significant.
p-value: 0.0
Difference: 2.281920438499702
------------------------------------
Testing for: present (Telegram: 7.1403849864201, Whatsapp: 6.127906264671661, Diff: 1.0124787217484394)
Statistically significant.
p-value: 0.0
Difference: 1.0124787217484394
------------------------------------
Testing for: quant (Telegram: 4.317131450451921, Whatsapp: 3.090382266266004, Diff: 1.226749184185917)
Statistically significant.
p-value: 0.0
Difference: 1.226749184185917
------------------------------------
Testing for: swear (Telegram: 10.848009023583014, Whatsapp: 8.55025456842621, Diff: 2.297754455156804)
Statistically significant.
p-value: 0.0
Difference: 2.297754455156804
------------------------------------
Testing for: social (Telegram: 10.692886507665593, Whatsapp: 8.895713430834634, Diff: 1.7971730768309584)
Statistically significant.
p-value: 0.0
Difference

In [131]:
from statsmodels.stats.proportion import proportions_ztest, confint_proportions_2indep

# comparando apenas uma única categoria por vez
# Exemplo: "quantas vezes a categoria funct aparece no Telegram" vs "quantas vezes aparece no WhatsApp".
count = np.array([total_telegram, total_whatsapp])
nobs = np.array([total_word_count_telegram, total_word_count_whatsapp])

z_stat, p_value = proportions_ztest(count, nobs)
ci_low, ci_upp = confint_proportions_2indep(
    count1=count[0], nobs1=nobs[0],
    count2=count[1], nobs2=nobs[1],
    method='wald'
)

print(f"Z = {z_stat:.4f}, p = {p_value:.4f}")
print(f"IC 95%: ({ci_low:.4f}, {ci_upp:.4f})")

if p_value < 0.05:
    print(">> Diferença estatisticamente significativa.")
else:
    print(">> Diferença **não** significativa.")


Z = -0.0974, p = 0.9224
IC 95%: (-0.0000, 0.0000)
>> Diferença **não** significativa.


# Filtro religioso com exclusão de palavras politicas

## Telegram

In [132]:
palavras_politicas = [ "lula", "bolsonaro", "pt", "pl", "stf", "patria", "55", "22", "13", "senadores", "lulaladrao",
                       "urnas", "alexandre", "moraes", "comunismo", "eleicao", "eleicoes", "esquerda", "direita",
                         "presidente", "tse", "fraude", "voto", "turno", "ministro"]

In [133]:
def retirar_mensagens_com_palavras_politicas(word):
    word_lower = word.lower()
    palavras_politica_lower = [palavra.lower() for palavra in palavras_politicas]

    return any(palavra in word_lower for palavra in palavras_politica_lower)


df_politico_telegram = df_religiao_telegram[~df_religiao_telegram['text_processed'].apply(lambda x: retirar_mensagens_com_palavras_politicas(x))]



In [134]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_politico_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 57069, 'relativ': 38175, 'verb': 36341, 'funct': 35066, 'affect': 32521, 'social': 26329, 'humans': 24106, 'swear': 23523, 'posemo': 21896, 'percept': 20390, 'present': 18178, 'relig': 17982, 'motion': 17573, 'insight': 17531, 'time': 17301, 'ingest': 16715, 'achieve': 16580, 'cause': 13336, 'inhib': 12974, 'tentat': 12938, 'bio': 11481, 'certain': 11138, 'negemo': 9619, 'past': 9094, 'auxverb': 8812, 'space': 8802, 'quant': 8513, 'work': 7732, 'feel': 7316, 'incl': 6721, 'money': 6284, 'hear': 6183, 'pronoun': 6055, 'discrep': 5924, 'adverb': 5814, 'leisure': 5476, 'ipron': 5120, 'anger': 5084, 'see': 4632, 'health': 3560, 'sexual': 3525, 'sad': 3099, 'body': 3002, 'future': 2992, 'friend': 2893, 'death': 2638, 'preps': 2596, 'excl': 2468, 'Unknown': 2327, 'conj': 1950, 'ppron': 1760, 'anx': 1548, 'family': 1473, 'negate': 1186, 'home': 1012, 'we': 813, 'assent': 725, 'nonfl': 642, 'number': 599, 'i': 512, 'shehe': 459, 'you'

In [135]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: relativ
Statistically significant.
2.0917444676791435e-20
------------------------------------
Testing for: time
Statistically significant.
8.414507621588082e-33
------------------------------------
Testing for: funct
Statistically significant.
4.665392927581178e-22
------------------------------------
Testing for: auxverb
Statistically significant.
1.0104540631882882e-38
------------------------------------
Testing for: present
Statistically significant.
3.218575277355279e-32
------------------------------------
Testing for: future
Statistically significant.
4.478301436897885e-43
------------------------------------
Testing for: cogmech
Statistically significant.
7.918340242764761e-12
------------------------------------
Testing for: verb
Statistically significant.
2.2617431319889685e-21
------------------------------------
Testing for: cause
Statistically significant.
1.6713959788917496e-35
------------------------------------
Testing for: motion
Statistically significan

## WhatsApp

In [136]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_politico_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [137]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_politico_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 383917, 'relativ': 278809, 'verb': 236790, 'funct': 222912, 'affect': 209609, 'social': 165601, 'swear': 159170, 'humans': 150248, 'posemo': 130663, 'percept': 129155, 'ingest': 128821, 'time': 125574, 'achieve': 120421, 'motion': 118413, 'present': 114076, 'insight': 113963, 'cause': 94357, 'inhib': 84786, 'space': 83464, 'bio': 73894, 'certain': 70129, 'negemo': 69543, 'tentat': 68280, 'past': 67218, 'work': 62778, 'quant': 57530, 'Unknown': 55246, 'feel': 53198, 'relig': 50927, 'incl': 47847, 'money': 46920, 'auxverb': 46067, 'discrep': 39351, 'leisure': 38679, 'anger': 36527, 'pronoun': 33156, 'adverb': 33109, 'see': 32217, 'hear': 30778, 'ipron': 29890, 'preps': 24991, 'health': 23691, 'sad': 22709, 'body': 21181, 'excl': 19096, 'sexual': 17879, 'friend': 17682, 'future': 17535, 'death': 16631, 'conj': 12926, 'family': 11642, 'number': 10471, 'anx': 9083, 'home': 8540, 'ppron': 8235, 'negate': 5934, 'we': 4943, 'assent': 

In [138]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: time
Statistically significant.
2.0409299466436287e-34
------------------------------------
Testing for: funct
Statistically significant.
5.596184300417009e-27
------------------------------------
Testing for: past
Statistically significant.
2.5012300986063462e-39
------------------------------------
Testing for: social
Statistically significant.
3.038418631358738e-31
------------------------------------
Testing for: affect
Statistically significant.
6.123017681788203e-28
------------------------------------
Testing for: negemo
Statistically significant.
3.984141666542258e-39
------------------------------------
Testing for: anger
Statistically significant.
4.772663409233093e-42
------------------------------------
Testing for: verb
Statistically significant.
5.390132524335926e-26
------------------------------------
Testing for: ingest
Statistically significant.
3.742438320432624e-34
------------------------------------
Testing for: relativ
Statistically significant.
3.92

## Comparação

In [139]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: relativ (Telegram: 17.269614074452733, Whatsapp: 14.976992686865257, Diff: 2.2926213875874755)
Statistically significant.
p-value: 4.763007559869611e-177
Difference: 2.2926213875874755
------------------------------------
Testing for: time (Telegram: 7.826629812759836, Whatsapp: 6.745552975909737, Diff: 1.081076836850099)
Statistically significant.
p-value: 1.388283483879946e-80
Difference: 1.081076836850099
------------------------------------
Testing for: funct (Telegram: 15.863164037583747, Whatsapp: 11.974331509436597, Diff: 3.88883252814715)
Statistically significant.
p-value: 0.0
Difference: 3.88883252814715
------------------------------------
Testing for: auxverb (Telegram: 3.9863743084237715, Whatsapp: 2.4746156763440985, Diff: 1.511758632079673)
Statistically significant.
p-value: 0.0
Difference: 1.511758632079673
------------------------------------
Testing for: present (Telegram: 8.223367246768873, Whatsapp: 6.127906264671661, Diff: 2.0954609820972125)
Statisti

In [140]:

# comparando apenas uma única categoria por vez
# Exemplo: "quantas vezes a categoria funct aparece no Telegram" vs "quantas vezes aparece no WhatsApp".
count = np.array([total_telegram, total_whatsapp])
nobs = np.array([total_word_count_telegram, total_word_count_whatsapp])

z_stat, p_value = proportions_ztest(count, nobs)
ci_low, ci_upp = confint_proportions_2indep(
    count1=count[0], nobs1=nobs[0],
    count2=count[1], nobs2=nobs[1],
    method='wald'
)

print(f"Z = {z_stat:.4f}, p = {p_value:.4f}")
print(f"IC 95%: ({ci_low:.4f}, {ci_upp:.4f})")

if p_value < 0.05:
    print(">> Diferença estatisticamente significativa.")
else:
    print(">> Diferença **não** significativa.")


Z = 0.1732, p = 0.8625
IC 95%: (-0.0000, 0.0000)
>> Diferença **não** significativa.
