In [124]:
#!pip install unidecode

In [125]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, norm
from collections import Counter

import re
import string
from unidecode import unidecode
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from math import sqrt
import scipy.stats as stats


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


stop_words = set(stopwords.words('portuguese'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:


def init_liwc_from_dic(file_path):
    liwc = {}
    category_names = {}
    with open(file_path, 'r', encoding='latin1') as file:
        for line in file:
            if line.strip():  # Verificando se a linha não está em branco
                parts = line.strip().split('\t')
                word = parts[0]
                categories = parts[1:]
                liwc[word] = categories
                category_names[word] = categories
    return liwc, category_names


In [127]:
liwc_file_path = '../datasets/LIWC2007_Portugues_win.dic'

# Inicializando o LIWC
liwc, category_name = init_liwc_from_dic(liwc_file_path)


In [128]:

def preprocess(texto):
    if isinstance(texto, str):
        tokens = []
        for token in word_tokenize(texto.lower(), language='portuguese'):
            if token not in stop_words and token not in punctuation:
                tokens.append(token)
        return tokens
    else:
        return []


In [129]:
def init_liwc_from_dic_category(file_path):
    liwc = {}
    category_names = {}  # Dicionário para mapear códigos de categoria para nomes de categoria
    with open(file_path, 'r', encoding='latin1') as file:
        for line in file:
            if line.strip() and not line.startswith('%'):  # Verifica se a linha não está em branco e não é um comentário
                parts = line.strip().split('\t')
                code = parts[0]
                category = parts[1]
                liwc[code] = category
                category_names[code] = category
    return liwc, category_names


# Inicialize o LIWC a partir do arquivo .dic
liwc_category, category_names = init_liwc_from_dic_category(liwc_file_path)

In [130]:
# Função para contar palavras em cada categoria
def count_words_in_categories(texto, liwc, category_names):
    word_counts = Counter()
    total_words = 0

    tokens = preprocess(texto)
    total_words += len(tokens)

    for token in tokens:
        if token in liwc:
            categories = liwc[token]
            for code in categories:
                category_name = category_names.get(code, 'Unknown')
                word_counts[category_name] += 1

    return word_counts, total_words

In [131]:



custom_stopwords = {
    't', 'be', 'nao', 'youtu', 'vai', 'av', 'ja', 'to', 'the', 'this', 'i', 'and', 
    'you', 'y', 'www', 'sao', 'pois', 'contra', 'user', 'ai', 'so', 'gente', 'voce', 'of', 
    'ola', 'gift', 'card', 'kwaivideo', 'r', 'bom', 'q', 'vc', 'vcs', 'pra','ta', 'phone', 'ok', 'la',
    'sera', 'ha', 'aqui', 'ate', 'dia', 'mc', 'im', 'tmj', 'pix', 'g', 'diz', 'ti', 'etc', 'tudo', 
    'todo', 'toda', 'youtube', 'g1', 'm', 'instagram', 'fb', 'in', 'link', 'was', 'blocked', 'kk'
}

stop_words.update(custom_stopwords)

def preprocess_text(text):

    # Função para extrair e substituir o domínio da URL
    def substituir_dominios(texto):
        # Função para extrair e substituir o domínio da URL
        def extrair_dominio(url):
            # Remove o protocolo (http://, https://, etc.) e o "www." se presente
            dominio = re.sub(r'^https?://(?:www\.)?|www\.', '', url)
            # Remove o caminho e parâmetros da URL
            dominio = re.split(r'[/?#]', dominio)[0]
            # Retorna a parte principal do domínio (antes do primeiro ponto)
            return dominio.split('.')[0]

        # Substitui URLs por seus domínios principais
        return re.sub(r'https?://(?:www\.)?\S+|www\.\S+', lambda match: extrair_dominio(match.group(0)), texto)

    # Substituir domínios
    text = substituir_dominios(text)

    # Converte para minúsculas
    text = text.lower()

    # Remove acentos
    text = unidecode(text)

    #Remover Pontuação
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove URLs e menções
    #text = re.sub(r'http\S+|www\S+|https\S+|@\w+', '', text)

    # Substitui emojis repetidos por apenas um
    text = re.sub(r'([\U00010000-\U0010FFFF])\1+', r'\1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F700-\U0001F77F]|[\U0001F780-\U0001F7FF]|[\U0001F800-\U0001F8FF]|[\U0001F900-\U0001F9FF]|[\U0001FA00-\U0001FA6F]|[\U0001FA70-\U0001FAFF])\1+', r'\1', text)


    # Remove espaços em branco extras (início ou final) e múltiplos espaços no meio do texto
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove pontuações e caracteres especiais
    #text = re.sub(r'[^\w\s]', '', text)

    # Ajusta risadas "kkk" ou mais para "kk"
    text = re.sub(r'k{2,}|K{2,}', 'kk', text)

    # Ajusta risadas "haha" ou mais para "haha"
    text = re.sub(r'(ha){2,}', 'haha', text, flags=re.IGNORECASE)

    # Ajusta risadas "kaka" ou mais para "kaka"
    text = re.sub(r'(ka){2,}', 'kaka', text, flags=re.IGNORECASE)

    # Remove as stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])


    return text


# Sem filtro

## Telegram

In [132]:
df_telegram = pd.read_csv('../datasets/fakeTelegram.BR_2022.csv')
df_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,5,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,5,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb
4,2022-10-05 06:27:44,cd6979b0b5265f08468fa1689b6300ce,e56ec342fc599ebb4ed89655eb6f03aa,5ad5c8bbe9da93a37fecf3e5aa5b0637.jpg,image/jpg,,True,False,False,,5,2022-10-05 06:28:29.316325,,,507185,Wanda Silva,Imagem,telegram,,5ad5c8bbe9da93a37fecf3e5aa5b0637


In [133]:
df_telegram.shape

(557586, 20)

In [134]:

# Removendo linhas com valores NaN na coluna 'text_content_anonymous'
df_telegram = df_telegram.dropna(subset=['text_content_anonymous'])

# Removendo trava_zap
df_telegram = df_telegram[df_telegram['trava_zap'] == False]


In [135]:
df_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,5,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,5,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb
5,2022-10-05 06:28:30,,b52442a5fbc459ae590dca0d215e32f9,,,,False,False,False,Kķkkkkk to rindo até agora....Quem disse q ia ...,5,2022-10-05 06:29:29.046694,0.7003,0.197813,2735,Wanda Silva,Texto,telegram,,


In [136]:
df_telegram.shape

(444201, 20)

In [137]:

df_geral_telegram = df_telegram.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_telegram['text_processed'] = df_geral_telegram['text_content_anonymous'].apply(preprocess_text)
df_geral_telegram = df_geral_telegram.dropna(subset=['text_processed'])


In [138]:
df_geral_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,...,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5,text_processed
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,...,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,,entao fato renato audio ouvi whatsapp ocorreu ...
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",...,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,,saiu presidente 8 horas atras infelizmente con...
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",...,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,,parte quase feita segundo turno completamos pa...
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,...,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb,achei seita maconarica
5,2022-10-05 06:28:30,,b52442a5fbc459ae590dca0d215e32f9,,,,False,False,False,Kķkkkkk to rindo até agora....Quem disse q ia ...,...,2022-10-05 06:29:29.046694,0.7003,0.197813,2735,Wanda Silva,Texto,telegram,,,rindo agoraquem disse ia fazer acordo diabo pr...


In [139]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_geral_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 1936121, 'relativ': 1356169, 'verb': 1202701, 'affect': 1024437, 'funct': 1012885, 'social': 793149, 'swear': 746886, 'humans': 705244, 'achieve': 666326, 'posemo': 658066, 'insight': 643018, 'percept': 626199, 'motion': 604734, 'time': 602658, 'ingest': 597161, 'present': 592340, 'cause': 486698, 'inhib': 425485, 'space': 401743, 'bio': 366033, 'work': 342138, 'tentat': 327783, 'past': 324966, 'negemo': 323446, 'certain': 314796, 'money': 279275, 'feel': 258027, 'quant': 254139, 'auxverb': 243063, 'incl': 236792, 'leisure': 217836, 'discrep': 208386, 'see': 172645, 'anger': 168628, 'adverb': 163028, 'Unknown': 155957, 'hear': 148290, 'relig': 137126, 'pronoun': 121742, 'body': 120769, 'ipron': 113236, 'preps': 107529, 'health': 100937, 'future': 94976, 'sad': 89243, 'sexual': 82421, 'excl': 81717, 'friend': 71100, 'death': 66325, 'anx': 44868, 'conj': 43905, 'ppron': 41022, 'home': 40724, 'number': 40204, 'family': 33809, 'we

In [140]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],  # Substitua 100 pelo total percentual de palavras em cada categoria
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: funct
Statistically significant.
1.2590495097157846e-26
------------------------------------
Testing for: pronoun
Statistically significant.
7.090187305426212e-43
------------------------------------
Testing for: ipron
Statistically significant.
4.745283729997817e-43
------------------------------------
Testing for: cogmech
Statistically significant.
5.279740056108126e-14
------------------------------------
Testing for: certain
Statistically significant.
5.135642746329521e-39
------------------------------------
Testing for: past
Statistically significant.
8.10439776821461e-39
------------------------------------
Testing for: social
Statistically significant.
2.9174345650659117e-30
------------------------------------
Testing for: insight
Statistically significant.
6.944021826875393e-33
------------------------------------
Testing for: percept
Statistically significant.
3.472500137916604e-33
------------------------------------
Testing for: hear
Statistically significant.

## WhatsApp

In [141]:
df_wpp = pd.read_csv('../datasets/fakeWhatsApp.BR_2022.csv')
df_wpp.shape

(598971, 20)

In [142]:
df_wpp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,5,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
2,2022-10-11 00:39:31,c882172d447798d74915973ac83eba68,b84dfe2d1599b82768dcdecce7e6bb23,d2e0ec59ffd9f84764f5b147725d7196.oga,audio/ogg; codecs=opus,,True,False,False,,5,2022-10-11 00:39:33.445125,,,737948BE86D450A426470794F91BC80D,558594228826.0:12@s.whatsapp.net,Audio,whatsapp,,d2e0ec59ffd9f84764f5b147725d7196
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
4,2022-10-10 23:40:12,77c1a8a31fee269db258a028a61f0b88,a5831b109d4d95fae8ee0ce464e48a6a,bb80cd530332bb6b95c34e719992d249.f4v,video/mp4,,True,False,False,,5,2022-10-10 23:40:14.397495,,,092203A082AC3DFB2A4933F60453AEB8,558594228826.0:12@s.whatsapp.net,Video,whatsapp,,bb80cd530332bb6b95c34e719992d249


In [143]:

# Removendo linhas com valores NaN na coluna 'text_content_anonymous'
df_wpp = df_wpp.dropna(subset=['text_content_anonymous'])

# Removendo trava_zap
df_wpp = df_wpp[df_wpp['trava_zap'] == False]

df_wpp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,5,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
7,2022-10-10 16:49:48,325720ed3339a91b8076df12c1e95c45,0e345813dcb62b0fe4d8537f311af0f1,,,https://m.kwai.com/photo/150000006567403/52240...,False,True,False,Fortes palavras da Pastora Damares!\nhttps://k...,5,2022-10-10 16:49:50.051126,0.0,,F18BADED5AFA8FB0C33FE36625872DB8,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
8,2022-10-10 16:49:53,8ed44a70a011285622e2b8919c2c8c3e,3b3e64e81c3e3f3bfc5ba76ddb90fb2d,0d6f2896e5941ef1933e69bbd7a32f69.jpeg,image/jpeg,,True,False,False,🤣🤣🤣🤣🤣🤣🤣🤣,5,2022-10-10 16:49:54.713831,0.0,,724C233591C3A62A5231B47FF22ADE4B,558594228826.0:12@s.whatsapp.net,Imagem,whatsapp,,0d6f2896e5941ef1933e69bbd7a32f69


In [144]:
df_wpp.shape

(255589, 20)

In [145]:

df_geral_whatsapp = df_wpp.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_whatsapp['text_processed'] = df_geral_whatsapp['text_content_anonymous'].apply(preprocess_text)
df_geral_whatsapp = df_geral_whatsapp.dropna(subset=['text_processed'])


In [146]:
df_geral_whatsapp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,...,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5,text_processed
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,...,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,,hoje fique sabendo canal lula flix vi pt entro...
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,...,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,aconteca nois fazer anossa parte
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,...,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,acesse entrar grupo whatsapp chat
7,2022-10-10 16:49:48,325720ed3339a91b8076df12c1e95c45,0e345813dcb62b0fe4d8537f311af0f1,,,https://m.kwai.com/photo/150000006567403/52240...,False,True,False,Fortes palavras da Pastora Damares!\nhttps://k...,...,2022-10-10 16:49:50.051126,0.0,,F18BADED5AFA8FB0C33FE36625872DB8,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,fortes palavras pastora damares
8,2022-10-10 16:49:53,8ed44a70a011285622e2b8919c2c8c3e,3b3e64e81c3e3f3bfc5ba76ddb90fb2d,0d6f2896e5941ef1933e69bbd7a32f69.jpeg,image/jpeg,,True,False,False,🤣🤣🤣🤣🤣🤣🤣🤣,...,2022-10-10 16:49:54.713831,0.0,,724C233591C3A62A5231B47FF22ADE4B,558594228826.0:12@s.whatsapp.net,Imagem,whatsapp,,0d6f2896e5941ef1933e69bbd7a32f69,


In [147]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_geral_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 1274771, 'relativ': 916338, 'verb': 829388, 'affect': 681810, 'funct': 663531, 'social': 551233, 'swear': 510740, 'humans': 492322, 'percept': 451544, 'insight': 436150, 'motion': 419707, 'posemo': 417284, 'ingest': 413361, 'achieve': 413097, 'present': 412781, 'time': 384650, 'cause': 313480, 'inhib': 285846, 'space': 269356, 'bio': 265247, 'negemo': 230999, 'work': 225698, 'past': 215837, 'tentat': 209728, 'certain': 193848, 'feel': 185051, 'incl': 170086, 'auxverb': 163272, 'money': 162330, 'quant': 158476, 'leisure': 152984, 'discrep': 139222, 'see': 137949, 'Unknown': 135005, 'anger': 120465, 'hear': 106626, 'adverb': 98270, 'relig': 94073, 'body': 88954, 'pronoun': 81623, 'ipron': 75874, 'health': 75271, 'sad': 63762, 'preps': 57734, 'excl': 57583, 'future': 57424, 'sexual': 56498, 'friend': 52350, 'death': 49349, 'home': 33528, 'number': 33184, 'family': 31476, 'conj': 31395, 'anx': 31216, 'ppron': 27287, 'we': 21327, '

In [148]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: relativ
Statistically significant.
2.289884962320822e-24
------------------------------------
Testing for: time
Statistically significant.
9.422524885657864e-36
------------------------------------
Testing for: present
Statistically significant.
4.3002938779305865e-35
------------------------------------
Testing for: cogmech
Statistically significant.
5.694167541454899e-18
------------------------------------
Testing for: inhib
Statistically significant.
4.0546973773597913e-38
------------------------------------
Testing for: incl
Statistically significant.
5.441623064461292e-41
------------------------------------
Testing for: bio
Statistically significant.
1.2727602855992238e-38
------------------------------------
Testing for: verb
Statistically significant.
4.499304955111294e-26
------------------------------------
Testing for: sexual
Statistically significant.
6.494067109282451e-44
------------------------------------
Testing for: social
Statistically significant.
6.1

## Comparação

In [149]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: funct (Telegram: 12.239543337071671, Whatsapp: 10.12818152760949, Diff: 2.1113618094621813)
Statistically significant.
p-value: 0.0
Difference: 2.1113618094621813
------------------------------------
Testing for: pronoun (Telegram: 1.47111121691187, Whatsapp: 1.2458989268445175, Diff: 0.22521229006735255)
Statistically significant.
p-value: 4.255128619663561e-300
Difference: 0.22521229006735255
------------------------------------
Testing for: ipron (Telegram: 1.368326048185774, Whatsapp: 1.1581458066402963, Diff: 0.21018024154547765)
Statistically significant.
p-value: 6.091335848536537e-281
Difference: 0.21018024154547765
------------------------------------
Testing for: cogmech (Telegram: 23.395782231264697, Whatsapp: 19.458189736624632, Diff: 3.9375924946400644)
Statistically significant.
p-value: 0.0
Difference: 3.9375924946400644
------------------------------------
Testing for: certain (Telegram: 3.8039454472490104, Whatsapp: 2.958908826812982, Diff: 0.8450366204360

### Calculando intervalo de Confiança

In [150]:

sig_common = []
not_sig_common = []

z = norm.ppf(0.975)  # 95% de confiança
results = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name] / 100
        percentage_whatsapp = percentages_whatsapp[category_name] / 100

        n_telegram = total_word_count_telegram
        n_whatsapp = total_word_count_whatsapp

        p1 = percentage_telegram
        p2 = percentage_whatsapp
        diff = p1 - p2

        se = sqrt((p1 * (1 - p1) / n_telegram) + (p2 * (1 - p2) / n_whatsapp))
        ci_low = diff - z * se
        ci_high = diff + z * se

        count_telegram = p1 * n_telegram
        count_whatsapp = p2 * n_whatsapp

        obs = np.array([
            [count_telegram, n_telegram - count_telegram],
            [count_whatsapp, n_whatsapp - count_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)
        is_significant = p <= 0.05

        if is_significant:
            sig_common.append(category_name)
        else:
            not_sig_common.append(category_name)

        # Armazenar resultado
        results.append({
            "categoria": category_name,
            "proporcao_telegram": round(p1, 4),
            "proporcao_whatsapp": round(p2, 4),
            "diferenca": round(diff, 4),
            "ic_95_inf": round(ci_low, 4),
            "ic_95_sup": round(ci_high, 4),
            "p_valor": round(p, 4),
            "significativo": "Sim" if is_significant else "Não"
        })

# === Exportar para CSV ===
df_resultados = pd.DataFrame(results)
df_resultados.to_csv("./resultados_liwc/comparacao_proporcoes_categorias_Semfiltro.csv", index=False, sep=';')

# === Resumo no terminal ===
print("\n=== Resumo Final ===")
print(f"Total de categorias comparadas: {len(results)}")
print(f"Categorias com diferença significativa (p <= 0.05): {len(sig_common)}")
print(f"Categorias sem diferença significativa: {len(not_sig_common)}")
print(f"Intervalo de confiança [", ci_low, ci_high , "]")


=== Resumo Final ===
Total de categorias comparadas: 67
Categorias com diferença significativa (p <= 0.05): 65
Categorias sem diferença significativa: 2
Intervalo de confiança [ -8.742922070074364e-07 2.222525759968265e-06 ]


# Filtro religioso

## Telegram

In [151]:
palavras_religiosas = [
    "deus", "jesus", "misericordia", "davi",
    "salomao", "reino", "templo", "conservador",
    "pentecostal", "rcc", "renovacao", "carismatic",
    "paulo ricardo", "bernardo kuster", "herege", "ateu",
    "jerico", "heresia"
]


In [152]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_telegram = df_geral_telegram[df_geral_telegram['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [153]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_religiao_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 391845, 'relativ': 253808, 'verb': 237400, 'affect': 223393, 'funct': 217206, 'social': 172598, 'swear': 172099, 'humans': 155646, 'posemo': 141808, 'percept': 134279, 'insight': 133899, 'achieve': 126550, 'time': 116801, 'ingest': 111518, 'present': 109807, 'motion': 108964, 'cause': 96413, 'inhib': 90588, 'space': 76159, 'bio': 73771, 'tentat': 72694, 'negemo': 70282, 'past': 70214, 'certain': 67620, 'work': 66427, 'relig': 63535, 'quant': 62409, 'feel': 56683, 'incl': 48664, 'auxverb': 48318, 'discrep': 45067, 'money': 42336, 'adverb': 37027, 'anger': 36535, 'leisure': 35723, 'see': 33728, 'hear': 32065, 'pronoun': 29538, 'ipron': 26585, 'health': 22398, 'sad': 20990, 'friend': 20767, 'Unknown': 20734, 'body': 19456, 'preps': 18384, 'future': 17877, 'sexual': 17077, 'death': 15922, 'excl': 14908, 'anx': 11525, 'conj': 10582, 'family': 9474, 'ppron': 8646, 'number': 7462, 'negate': 7459, 'home': 6905, 'nonfl': 6361, 'we': 56

In [154]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: funct
Statistically significant.
9.606686822322708e-24
------------------------------------
Testing for: present
Statistically significant.
1.500096666780847e-33
------------------------------------
Testing for: quant
Statistically significant.
1.9300000166004725e-38
------------------------------------
Testing for: swear
Statistically significant.
1.2001376556460403e-27
------------------------------------
Testing for: social
Statistically significant.
1.330812642665827e-27
------------------------------------
Testing for: affect
Statistically significant.
3.1165467016778656e-23
------------------------------------
Testing for: posemo
Statistically significant.
1.918789532430515e-30
------------------------------------
Testing for: negemo
Statistically significant.
1.3236999711659119e-37
------------------------------------
Testing for: anger
Statistically significant.
2.953128966010713e-41
------------------------------------
Testing for: verb
Statistically significant.


## WhatsApp

In [155]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [156]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_religiao_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 359728, 'relativ': 261447, 'verb': 226949, 'affect': 203414, 'funct': 201316, 'social': 158238, 'swear': 148254, 'humans': 145659, 'posemo': 125932, 'percept': 123497, 'achieve': 119860, 'time': 117971, 'ingest': 115311, 'insight': 113557, 'motion': 112123, 'present': 104606, 'cause': 93874, 'space': 83055, 'inhib': 80753, 'bio': 73387, 'negemo': 68143, 'past': 66952, 'work': 62558, 'tentat': 61360, 'certain': 59662, 'feel': 50967, 'relig': 50332, 'incl': 47685, 'quant': 47116, 'money': 46746, 'auxverb': 45924, 'Unknown': 41715, 'discrep': 39222, 'leisure': 38527, 'anger': 36152, 'see': 32106, 'adverb': 31298, 'hear': 27558, 'pronoun': 26651, 'ipron': 24236, 'health': 23509, 'sad': 21690, 'body': 21061, 'preps': 20060, 'sexual': 17673, 'friend': 17544, 'future': 17498, 'death': 16505, 'excl': 14141, 'family': 11602, 'conj': 10904, 'number': 10470, 'anx': 9037, 'home': 8490, 'ppron': 7321, 'negate': 5927, 'we': 4880, 'nonfl': 3

In [157]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: time
Statistically significant.
7.527336490437916e-34
------------------------------------
Testing for: funct
Statistically significant.
9.680334523729117e-27
------------------------------------
Testing for: past
Statistically significant.
1.2407860721838117e-38
------------------------------------
Testing for: social
Statistically significant.
2.6326476996786773e-30
------------------------------------
Testing for: affect
Statistically significant.
1.424248364499519e-26
------------------------------------
Testing for: negemo
Statistically significant.
1.6184000084097491e-38
------------------------------------
Testing for: anger
Statistically significant.
1.1159187359757767e-41
------------------------------------
Testing for: verb
Statistically significant.
9.928769553323461e-25
------------------------------------
Testing for: ingest
Statistically significant.
4.318810294427552e-34
------------------------------------
Testing for: relativ
Statistically significant.
3.

## Comparação

In [158]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")


Testing for: funct (Telegram: 14.48346083805711, Whatsapp: 12.153372670185009, Diff: 2.330088167872102)
Statistically significant.
p-value: 0.0
Difference: 2.330088167872102
------------------------------------
Testing for: present (Telegram: 7.322014052303054, Whatsapp: 6.315025638982361, Diff: 1.006988413320693)
Statistically significant.
p-value: 4.402290839398139e-276
Difference: 1.006988413320693
------------------------------------
Testing for: quant (Telegram: 4.161479459325737, Whatsapp: 2.84437554257206, Diff: 1.3171039167536773)
Statistically significant.
p-value: 0.0
Difference: 1.3171039167536773
------------------------------------
Testing for: swear (Telegram: 11.475691862880355, Whatsapp: 8.950039300629896, Diff: 2.5256525622504586)
Statistically significant.
p-value: 0.0
Difference: 2.5256525622504586
------------------------------------
Testing for: social (Telegram: 11.50896556138864, Whatsapp: 9.552769698308804, Diff: 1.9561958630798362)
Statistically significant.
p-

## Calculando intervalo de confiança

In [159]:



sig_common = []
not_sig_common = []

z = norm.ppf(0.975)  # 95% de confiança
results = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name] / 100
        percentage_whatsapp = percentages_whatsapp[category_name] / 100

        n_telegram = total_word_count_telegram
        n_whatsapp = total_word_count_whatsapp

        p1 = percentage_telegram
        p2 = percentage_whatsapp
        diff = p1 - p2

        se = sqrt((p1 * (1 - p1) / n_telegram) + (p2 * (1 - p2) / n_whatsapp))
        ci_low = diff - z * se
        ci_high = diff + z * se

        count_telegram = p1 * n_telegram
        count_whatsapp = p2 * n_whatsapp

        obs = np.array([
            [count_telegram, n_telegram - count_telegram],
            [count_whatsapp, n_whatsapp - count_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)
        is_significant = p <= 0.05

        if is_significant:
            sig_common.append(category_name)
        else:
            not_sig_common.append(category_name)

        # Armazenar resultado
        results.append({
            "categoria": category_name,
            "proporcao_telegram": round(p1, 4),
            "proporcao_whatsapp": round(p2, 4),
            "diferenca": round(diff, 4),
            "ic_95_inf": round(ci_low, 4),
            "ic_95_sup": round(ci_high, 4),
            "p_valor": round(p, 4),
            "significativo": "Sim" if is_significant else "Não"
        })

# === Exportar para CSV ===
df_resultados = pd.DataFrame(results)
df_resultados.to_csv("./resultados_liwc/comparacao_proporcoes_categorias_filtroReligioso.csv", index=False, sep=';')

# === Resumo no terminal ===
print("\n=== Resumo Final ===")
print(f"Total de categorias comparadas: {len(results)}")
print(f"Categorias com diferença significativa (p <= 0.05): {len(sig_common)}")
print(f"Categorias sem diferença significativa: {len(not_sig_common)}")
print(f"Intervalo de confiança [", ci_low, ci_high , "]")


=== Resumo Final ===
Total de categorias comparadas: 66
Categorias com diferença significativa (p <= 0.05): 62
Categorias sem diferença significativa: 4
Intervalo de confiança [ -8.549066510213168e-06 -9.838519311136483e-07 ]


# Filtro religioso com exclusão de palavras politicas

## Telegram

In [160]:
palavras_politicas = [ "lula", "bolsonaro", "pt", "pl", "stf", "patria", "55", "22", "13", "senadores", "lulaladrao",
                       "urnas", "alexandre", "moraes", "comunismo", "eleicao", "eleicoes", "esquerda", "direita",
                         "presidente", "tse", "fraude", "voto", "turno", "ministro"]

In [161]:
def retirar_mensagens_com_palavras_politicas(word):
    word_lower = word.lower()
    palavras_politica_lower = [palavra.lower() for palavra in palavras_politicas]

    return any(palavra in word_lower for palavra in palavras_politica_lower)


df_politico_telegram = df_religiao_telegram[~df_religiao_telegram['text_processed'].apply(lambda x: retirar_mensagens_com_palavras_politicas(x))]



In [162]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_politico_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 51690, 'relativ': 35042, 'verb': 34398, 'affect': 30631, 'funct': 30437, 'social': 24508, 'humans': 22397, 'swear': 21106, 'posemo': 20456, 'percept': 19041, 'relig': 17568, 'insight': 17075, 'present': 16611, 'motion': 16293, 'achieve': 16201, 'time': 16122, 'ingest': 14481, 'cause': 13033, 'inhib': 12115, 'bio': 11252, 'tentat': 11228, 'negemo': 9208, 'certain': 8928, 'past': 8815, 'auxverb': 8624, 'space': 8612, 'work': 7572, 'incl': 6593, 'feel': 6531, 'quant': 6383, 'money': 6187, 'discrep': 5801, 'hear': 5798, 'leisure': 5357, 'anger': 4990, 'adverb': 4990, 'pronoun': 4628, 'see': 4535, 'ipron': 3864, 'health': 3477, 'sexual': 3458, 'body': 2934, 'future': 2910, 'sad': 2792, 'friend': 2768, 'death': 2548, 'preps': 2153, 'excl': 1998, 'ppron': 1564, 'anx': 1520, 'family': 1422, 'conj': 1390, 'Unknown': 1169, 'negate': 1165, 'home': 940, 'we': 788, 'nonfl': 635, 'number': 584, 'i': 510, 'shehe': 453, 'assent': 415, 'filler

In [163]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: relativ
Statistically significant.
3.9265806780755145e-19
------------------------------------
Testing for: time
Statistically significant.
7.205232632322371e-32
------------------------------------
Testing for: funct
Statistically significant.
6.4078267273564035e-22
------------------------------------
Testing for: auxverb
Statistically significant.
7.400620137282743e-38
------------------------------------
Testing for: present
Statistically significant.
1.6967562465149554e-31
------------------------------------
Testing for: future
Statistically significant.
8.839479904453172e-43
------------------------------------
Testing for: cogmech
Statistically significant.
9.991583633190339e-11
------------------------------------
Testing for: verb
Statistically significant.
1.6454809077587692e-19
------------------------------------
Testing for: cause
Statistically significant.
2.8533912752342594e-34
------------------------------------
Testing for: motion
Statistically significa

## WhatsApp

In [164]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_politico_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [165]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_politico_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 359728, 'relativ': 261447, 'verb': 226949, 'affect': 203414, 'funct': 201316, 'social': 158238, 'swear': 148254, 'humans': 145659, 'posemo': 125932, 'percept': 123497, 'achieve': 119860, 'time': 117971, 'ingest': 115311, 'insight': 113557, 'motion': 112123, 'present': 104606, 'cause': 93874, 'space': 83055, 'inhib': 80753, 'bio': 73387, 'negemo': 68143, 'past': 66952, 'work': 62558, 'tentat': 61360, 'certain': 59662, 'feel': 50967, 'relig': 50332, 'incl': 47685, 'quant': 47116, 'money': 46746, 'auxverb': 45924, 'Unknown': 41715, 'discrep': 39222, 'leisure': 38527, 'anger': 36152, 'see': 32106, 'adverb': 31298, 'hear': 27558, 'pronoun': 26651, 'ipron': 24236, 'health': 23509, 'sad': 21690, 'body': 21061, 'preps': 20060, 'sexual': 17673, 'friend': 17544, 'future': 17498, 'death': 16505, 'excl': 14141, 'family': 11602, 'conj': 10904, 'number': 10470, 'anx': 9037, 'home': 8490, 'ppron': 7321, 'negate': 5927, 'we': 4880, 'nonfl': 3

In [166]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: time
Statistically significant.
7.527336490437916e-34
------------------------------------
Testing for: funct
Statistically significant.
9.680334523729117e-27
------------------------------------
Testing for: past
Statistically significant.
1.2407860721838117e-38
------------------------------------
Testing for: social
Statistically significant.
2.6326476996786773e-30
------------------------------------
Testing for: affect
Statistically significant.
1.424248364499519e-26
------------------------------------
Testing for: negemo
Statistically significant.
1.6184000084097491e-38
------------------------------------
Testing for: anger
Statistically significant.
1.1159187359757767e-41
------------------------------------
Testing for: verb
Statistically significant.
9.928769553323461e-25
------------------------------------
Testing for: ingest
Statistically significant.
4.318810294427552e-34
------------------------------------
Testing for: relativ
Statistically significant.
3.

## Comparação

In [167]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: relativ (Telegram: 18.395812881583716, Whatsapp: 15.783458962535816, Diff: 2.6123539190478997)
Statistically significant.
p-value: 3.66248433248032e-190
Difference: 2.6123539190478997
------------------------------------
Testing for: time (Telegram: 8.46348083091412, Whatsapp: 7.1218657596733275, Diff: 1.3416150712407928)
Statistically significant.
p-value: 2.5066203450993038e-101
Difference: 1.3416150712407928
------------------------------------
Testing for: funct (Telegram: 15.978350455931839, Whatsapp: 12.153372670185009, Diff: 3.82497778574683)
Statistically significant.
p-value: 0.0
Difference: 3.82497778574683
------------------------------------
Testing for: auxverb (Telegram: 4.527295539374977, Whatsapp: 2.772414942208152, Diff: 1.7548805971668253)
Statistically significant.
p-value: 0.0
Difference: 1.7548805971668253
------------------------------------
Testing for: present (Telegram: 8.720188567318848, Whatsapp: 6.315025638982361, Diff: 2.4051629283364875)
Stati

### Calculando Intervalo de confiança

In [168]:

sig_common = []
not_sig_common = []

z = norm.ppf(0.975)  # 95% de confiança
results = []
diferencas = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name] / 100
        percentage_whatsapp = percentages_whatsapp[category_name] / 100

        n_telegram = total_word_count_telegram
        n_whatsapp = total_word_count_whatsapp

        p1 = percentage_telegram
        p2 = percentage_whatsapp
        diff = p1 - p2
        diferencas.append(diff)

        se = sqrt((p1 * (1 - p1) / n_telegram) + (p2 * (1 - p2) / n_whatsapp))
        ci_low = diff - z * se
        ci_high = diff + z * se

        count_telegram = p1 * n_telegram
        count_whatsapp = p2 * n_whatsapp

        obs = np.array([
            [count_telegram, n_telegram - count_telegram],
            [count_whatsapp, n_whatsapp - count_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)
        is_significant = p <= 0.05

        if is_significant:
            sig_common.append(category_name)
        else:
            not_sig_common.append(category_name)

        # Armazenar resultado
        results.append({
            "categoria": category_name,
            "proporcao_telegram": round(p1, 4),
            "proporcao_whatsapp": round(p2, 4),
            "diferenca": round(diff, 4),
            "ic_95_inf": round(ci_low, 4),
            "ic_95_sup": round(ci_high, 4),
            "p_valor": round(p, 4),
            "significativo": "Sim" if is_significant else "Não"
        })

# === Intervalo de confiança geral das diferenças ===
media_diff = np.mean(diferencas)
std_diff = np.std(diferencas, ddof=1)  # amostral
n = len(diferencas)
se_media = std_diff / sqrt(n)
z = norm.ppf(0.975)
ic_inf = media_diff - z * se_media
ic_sup = media_diff + z * se_media

# === Exportar para CSV ===
df_resultados = pd.DataFrame(results)
df_resultados.to_csv("./resultados_liwc/comparacao_proporcoes_categorias_filtroPolitico.csv", index=False, sep=';')

# === Resumo no terminal ===
print("\n=== Resumo Final ===")
print(f"Total de categorias comparadas: {len(results)}")
print(f"Categorias com diferença significativa (p <= 0.05): {len(sig_common)}")
print(f"Categorias sem diferença significativa: {len(not_sig_common)}")
print(f"IC 95%: [{ic_inf:.6f}, {ic_sup:.6f}]")


=== Resumo Final ===
Total de categorias comparadas: 64
Categorias com diferença significativa (p <= 0.05): 62
Categorias sem diferença significativa: 2
IC 95%: [0.007380, 0.014444]
