In [17]:
#!pip install unidecode

In [18]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, norm
from collections import Counter

import re
import string
from unidecode import unidecode
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from math import sqrt
import scipy.stats as stats


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


stop_words = set(stopwords.words('portuguese'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Melissa
[nltk_data]     Felipe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:


def init_liwc_from_dic(file_path):
    liwc = {}
    category_names = {}
    with open(file_path, 'r', encoding='latin1') as file:
        for line in file:
            if line.strip():  # Verificando se a linha não está em branco
                parts = line.strip().split('\t')
                word = parts[0]
                categories = parts[1:]
                liwc[word] = categories
                category_names[word] = categories
    return liwc, category_names


In [20]:
liwc_file_path = '../datasets/LIWC2007_Portugues_win.dic'

# Inicializando o LIWC
liwc, category_name = init_liwc_from_dic(liwc_file_path)


In [21]:

def preprocess(texto):
    if isinstance(texto, str):
        tokens = []
        for token in word_tokenize(texto.lower(), language='portuguese'):
            if token not in stop_words and token not in punctuation:
                tokens.append(token)
        return tokens
    else:
        return []


In [22]:
def init_liwc_from_dic_category(file_path):
    liwc = {}
    category_names = {}  # Dicionário para mapear códigos de categoria para nomes de categoria
    with open(file_path, 'r', encoding='latin1') as file:
        for line in file:
            if line.strip() and not line.startswith('%'):  # Verifica se a linha não está em branco e não é um comentário
                parts = line.strip().split('\t')
                code = parts[0]
                category = parts[1]
                liwc[code] = category
                category_names[code] = category
    return liwc, category_names


# Inicialize o LIWC a partir do arquivo .dic
liwc_category, category_names = init_liwc_from_dic_category(liwc_file_path)

In [23]:
# Função para contar palavras em cada categoria
def count_words_in_categories(texto, liwc, category_names):
    word_counts = Counter()
    total_words = 0

    tokens = preprocess(texto)
    total_words += len(tokens)

    for token in tokens:
        if token in liwc:
            categories = liwc[token]
            for code in categories:
                category_name = category_names.get(code, 'Unknown')
                word_counts[category_name] += 1

    return word_counts, total_words

In [24]:



custom_stopwords = {
    't', 'be', 'nao', 'youtu', 'vai', 'av', 'ja', 'to', 'the', 'this', 'i', 'and', 
    'you', 'y', 'www', 'sao', 'pois', 'contra', 'user', 'ai', 'so', 'gente', 'voce', 'of', 
    'ola', 'gift', 'card', 'kwaivideo', 'r', 'bom', 'q', 'vc', 'vcs', 'pra','ta', 'phone', 'ok', 'la',
    'sera', 'ha', 'aqui', 'ate', 'dia', 'mc', 'im', 'tmj', 'pix', 'g', 'diz', 'ti', 'etc', 'tudo', 
    'todo', 'toda', 'youtube', 'g1', 'm', 'instagram', 'fb', 'in', 'link', 'was', 'blocked', 'kk'
}

stop_words.update(custom_stopwords)

def preprocess_text(text):

    # Função para extrair e substituir o domínio da URL
    def substituir_dominios(texto):
        # Função para extrair e substituir o domínio da URL
        def extrair_dominio(url):
            # Remove o protocolo (http://, https://, etc.) e o "www." se presente
            dominio = re.sub(r'^https?://(?:www\.)?|www\.', '', url)
            # Remove o caminho e parâmetros da URL
            dominio = re.split(r'[/?#]', dominio)[0]
            # Retorna a parte principal do domínio (antes do primeiro ponto)
            return dominio.split('.')[0]

        # Substitui URLs por seus domínios principais
        return re.sub(r'https?://(?:www\.)?\S+|www\.\S+', lambda match: extrair_dominio(match.group(0)), texto)

    # Substituir domínios
    text = substituir_dominios(text)

    # Converte para minúsculas
    text = text.lower()

    # Remove acentos
    text = unidecode(text)

    #Remover Pontuação
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove URLs e menções
    #text = re.sub(r'http\S+|www\S+|https\S+|@\w+', '', text)

    # Substitui emojis repetidos por apenas um
    text = re.sub(r'([\U00010000-\U0010FFFF])\1+', r'\1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]|[\U0001F700-\U0001F77F]|[\U0001F780-\U0001F7FF]|[\U0001F800-\U0001F8FF]|[\U0001F900-\U0001F9FF]|[\U0001FA00-\U0001FA6F]|[\U0001FA70-\U0001FAFF])\1+', r'\1', text)


    # Remove espaços em branco extras (início ou final) e múltiplos espaços no meio do texto
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove pontuações e caracteres especiais
    #text = re.sub(r'[^\w\s]', '', text)

    # Ajusta risadas "kkk" ou mais para "kk"
    text = re.sub(r'k{2,}|K{2,}', 'kk', text)

    # Ajusta risadas "haha" ou mais para "haha"
    text = re.sub(r'(ha){2,}', 'haha', text, flags=re.IGNORECASE)

    # Ajusta risadas "kaka" ou mais para "kaka"
    text = re.sub(r'(ka){2,}', 'kaka', text, flags=re.IGNORECASE)

    # Remove as stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])


    return text


# Sem filtro

## Telegram

In [25]:
df_telegram = pd.read_csv('../datasets/fakeTelegram.BR_2022.csv')
df_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,5,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,5,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb
4,2022-10-05 06:27:44,cd6979b0b5265f08468fa1689b6300ce,e56ec342fc599ebb4ed89655eb6f03aa,5ad5c8bbe9da93a37fecf3e5aa5b0637.jpg,image/jpg,,True,False,False,,5,2022-10-05 06:28:29.316325,,,507185,Wanda Silva,Imagem,telegram,,5ad5c8bbe9da93a37fecf3e5aa5b0637


In [26]:
df_telegram.shape

(557586, 20)

In [27]:

# Removendo linhas com valores NaN na coluna 'text_content_anonymous'
df_telegram = df_telegram.dropna(subset=['text_content_anonymous'])

# Removendo trava_zap
df_telegram = df_telegram[df_telegram['trava_zap'] == False]


In [28]:
df_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,5,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",5,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",5,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,5,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb
5,2022-10-05 06:28:30,,b52442a5fbc459ae590dca0d215e32f9,,,,False,False,False,Kķkkkkk to rindo até agora....Quem disse q ia ...,5,2022-10-05 06:29:29.046694,0.7003,0.197813,2735,Wanda Silva,Texto,telegram,,


In [29]:
df_telegram.shape

(444201, 20)

In [30]:

df_geral_telegram = df_telegram.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_telegram['text_processed'] = df_geral_telegram['text_content_anonymous'].apply(preprocess_text)
df_geral_telegram = df_geral_telegram.dropna(subset=['text_processed'])


In [31]:
df_geral_telegram.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,...,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5,text_processed
0,2022-10-05 06:25:04,1078cc958f0febe28f4d03207660715f,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,Então é Fato Renato o áudio que eu ouvi no wha...,...,2022-10-05 06:25:28.863641,0.0,,16385,Wanda Silva,Texto,telegram,,,entao fato renato audio ouvi whatsapp ocorreu ...
1,2022-10-05 06:25:08,,12283e08a2eb5789201e105b34489ee7,,,,False,False,False,"Saiu no YouTube do presidente a 8 horas atrás,...",...,2022-10-05 06:25:28.926311,0.0644,,16386,Wanda Silva,Texto,telegram,,,saiu presidente 8 horas atras infelizmente con...
2,2022-10-05 06:26:28,92a2d8fd7144074f659d1d29dc3751da,9f2d7394334eb224c061c9740b5748fc,,,,False,False,False,"É isso, nossa parte já foi quase toda feita. N...",...,2022-10-05 06:26:29.361949,-0.3551,0.157242,16366,Wanda Silva,Texto,telegram,,,parte quase feita segundo turno completamos pa...
3,2022-10-05 06:27:28,d60aa38f62b4977426b70944af4aff72,c8f2de56550ed0bf85249608b7ead93d,94dca4cda503100ebfda7ce2bcc060eb.jpg,image/jpg,,True,False,False,GENTE ACHEI ELES EM UMA SEITA MAÇONÁRICA,...,2022-10-05 06:27:29.935624,0.0,,19281,Wanda Silva,Imagem,telegram,,94dca4cda503100ebfda7ce2bcc060eb,achei seita maconarica
5,2022-10-05 06:28:30,,b52442a5fbc459ae590dca0d215e32f9,,,,False,False,False,Kķkkkkk to rindo até agora....Quem disse q ia ...,...,2022-10-05 06:29:29.046694,0.7003,0.197813,2735,Wanda Silva,Texto,telegram,,,rindo agoraquem disse ia fazer acordo diabo pr...


In [32]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_geral_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 1936130, 'relativ': 1356178, 'verb': 1202709, 'affect': 1024446, 'funct': 1012891, 'social': 793156, 'swear': 746894, 'humans': 705253, 'achieve': 666330, 'posemo': 658069, 'insight': 643019, 'percept': 626206, 'motion': 604737, 'time': 602664, 'ingest': 597165, 'present': 592345, 'cause': 486700, 'inhib': 425487, 'space': 401747, 'bio': 366039, 'work': 342139, 'tentat': 327784, 'past': 324970, 'negemo': 323451, 'certain': 314797, 'money': 279278, 'feel': 258029, 'quant': 254140, 'auxverb': 243066, 'incl': 236793, 'leisure': 217836, 'discrep': 208388, 'see': 172646, 'anger': 168629, 'adverb': 163028, 'Unknown': 155958, 'hear': 148291, 'relig': 137126, 'pronoun': 121743, 'body': 120769, 'ipron': 113237, 'preps': 107529, 'health': 100939, 'future': 94978, 'sad': 89246, 'sexual': 82421, 'excl': 81717, 'friend': 71101, 'death': 66325, 'anx': 44868, 'conj': 43905, 'ppron': 41022, 'home': 40724, 'number': 40204, 'family': 33813, 'we

In [33]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],  # Substitua 100 pelo total percentual de palavras em cada categoria
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: funct
Statistically significant.
1.2699982337229533e-26
------------------------------------
Testing for: pronoun
Statistically significant.
7.0997385359795396e-43
------------------------------------
Testing for: ipron
Statistically significant.
4.751257380905148e-43
------------------------------------
Testing for: cogmech
Statistically significant.
5.341839881351528e-14
------------------------------------
Testing for: certain
Statistically significant.
5.152330492376576e-39
------------------------------------
Testing for: past
Statistically significant.
8.132594121323714e-39
------------------------------------
Testing for: social
Statistically significant.
2.9389265858494075e-30
------------------------------------
Testing for: insight
Statistically significant.
6.985961537815689e-33
------------------------------------
Testing for: percept
Statistically significant.
3.493889665062881e-33
------------------------------------
Testing for: hear
Statistically significan

## WhatsApp

In [34]:
df_wpp = pd.read_csv('../datasets/fakeWhatsApp.BR_2022.csv')
df_wpp.shape

(598971, 20)

In [35]:
df_wpp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,5,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
2,2022-10-11 00:39:31,c882172d447798d74915973ac83eba68,b84dfe2d1599b82768dcdecce7e6bb23,d2e0ec59ffd9f84764f5b147725d7196.oga,audio/ogg; codecs=opus,,True,False,False,,5,2022-10-11 00:39:33.445125,,,737948BE86D450A426470794F91BC80D,558594228826.0:12@s.whatsapp.net,Audio,whatsapp,,d2e0ec59ffd9f84764f5b147725d7196
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
4,2022-10-10 23:40:12,77c1a8a31fee269db258a028a61f0b88,a5831b109d4d95fae8ee0ce464e48a6a,bb80cd530332bb6b95c34e719992d249.f4v,video/mp4,,True,False,False,,5,2022-10-10 23:40:14.397495,,,092203A082AC3DFB2A4933F60453AEB8,558594228826.0:12@s.whatsapp.net,Video,whatsapp,,bb80cd530332bb6b95c34e719992d249


In [36]:

# Removendo linhas com valores NaN na coluna 'text_content_anonymous'
df_wpp = df_wpp.dropna(subset=['text_content_anonymous'])

# Removendo trava_zap
df_wpp = df_wpp[df_wpp['trava_zap'] == False]

df_wpp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,5,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,5,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,5,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
7,2022-10-10 16:49:48,325720ed3339a91b8076df12c1e95c45,0e345813dcb62b0fe4d8537f311af0f1,,,https://m.kwai.com/photo/150000006567403/52240...,False,True,False,Fortes palavras da Pastora Damares!\nhttps://k...,5,2022-10-10 16:49:50.051126,0.0,,F18BADED5AFA8FB0C33FE36625872DB8,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,
8,2022-10-10 16:49:53,8ed44a70a011285622e2b8919c2c8c3e,3b3e64e81c3e3f3bfc5ba76ddb90fb2d,0d6f2896e5941ef1933e69bbd7a32f69.jpeg,image/jpeg,,True,False,False,🤣🤣🤣🤣🤣🤣🤣🤣,5,2022-10-10 16:49:54.713831,0.0,,724C233591C3A62A5231B47FF22ADE4B,558594228826.0:12@s.whatsapp.net,Imagem,whatsapp,,0d6f2896e5941ef1933e69bbd7a32f69


In [37]:
df_wpp.shape

(255589, 20)

In [38]:

df_geral_whatsapp = df_wpp.copy()

# Aplicar o pré-processamento à coluna de texto
df_geral_whatsapp['text_processed'] = df_geral_whatsapp['text_content_anonymous'].apply(preprocess_text)
df_geral_whatsapp = df_geral_whatsapp.dropna(subset=['text_processed'])


In [39]:
df_geral_whatsapp.head()

Unnamed: 0,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,trava_zap,text_content_anonymous,...,date_system,score_sentiment,score_misinformation,id_message,id_persona,message_type,messenger,media_name,media_md5,text_processed
0,2022-10-10 18:20:24,9d737b3c9387855139bbad2311cc5709,0638569ee76dac58f59dcac20463c955,,,,False,False,False,☝️\nHoje que eu fique sabendo do CANAL LULA FL...,...,2022-10-10 18:20:25.000937,-0.7003,0.843775,F7023FFB06C429A2C166922849A35ED8,558594228826.0:12@s.whatsapp.net,Texto,whatsapp,,,hoje fique sabendo canal lula flix vi pt entro...
1,2022-10-10 22:02:58,1660a60f661754d2802ca53296e25be8,a5910d5cc1c830ade9eb4dd00f15ff6a,,,,False,False,False,Mais pra que isso não aconteça nois temos quê ...,...,2022-10-10 22:02:58.4682,-0.296,,A9FAC78070C144890D181EF415B90CAD,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,aconteca nois fazer anossa parte
3,2022-10-10 23:36:19,91e3c22c08b24ba01ac4524d77bcb1da,addb88a34374d43aa9ecd4df7359ce39,,,,False,False,False,‎Acesse este link para entrar no meu grupo do ...,...,2022-10-10 23:36:19.724987,-0.1531,,439A91ADD8F355CD23C4BB107A5E88BB,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,acesse entrar grupo whatsapp chat
7,2022-10-10 16:49:48,325720ed3339a91b8076df12c1e95c45,0e345813dcb62b0fe4d8537f311af0f1,,,https://m.kwai.com/photo/150000006567403/52240...,False,True,False,Fortes palavras da Pastora Damares!\nhttps://k...,...,2022-10-10 16:49:50.051126,0.0,,F18BADED5AFA8FB0C33FE36625872DB8,558594228826.0:12@s.whatsapp.net,TextoExtendido,whatsapp,,,fortes palavras pastora damares
8,2022-10-10 16:49:53,8ed44a70a011285622e2b8919c2c8c3e,3b3e64e81c3e3f3bfc5ba76ddb90fb2d,0d6f2896e5941ef1933e69bbd7a32f69.jpeg,image/jpeg,,True,False,False,🤣🤣🤣🤣🤣🤣🤣🤣,...,2022-10-10 16:49:54.713831,0.0,,724C233591C3A62A5231B47FF22ADE4B,558594228826.0:12@s.whatsapp.net,Imagem,whatsapp,,0d6f2896e5941ef1933e69bbd7a32f69,


In [40]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_geral_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 1274772, 'relativ': 916338, 'verb': 829389, 'affect': 681815, 'funct': 663532, 'social': 551240, 'swear': 510741, 'humans': 492324, 'percept': 451545, 'insight': 436150, 'motion': 419707, 'posemo': 417286, 'ingest': 413361, 'achieve': 413099, 'present': 412782, 'time': 384650, 'cause': 313481, 'inhib': 285846, 'space': 269356, 'bio': 265249, 'negemo': 230999, 'work': 225701, 'past': 215837, 'tentat': 209728, 'certain': 193848, 'feel': 185051, 'incl': 170086, 'auxverb': 163272, 'money': 162330, 'quant': 158476, 'leisure': 152984, 'discrep': 139222, 'see': 137949, 'Unknown': 135005, 'anger': 120465, 'hear': 106626, 'adverb': 98271, 'relig': 94073, 'body': 88954, 'pronoun': 81623, 'ipron': 75874, 'health': 75271, 'sad': 63762, 'preps': 57734, 'excl': 57583, 'future': 57424, 'sexual': 56499, 'friend': 52350, 'death': 49349, 'home': 33528, 'number': 33184, 'family': 31476, 'conj': 31395, 'anx': 31216, 'ppron': 27287, 'we': 21327, '

In [41]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: relativ
Statistically significant.
2.294396782891576e-24
------------------------------------
Testing for: time
Statistically significant.
9.432040282181946e-36
------------------------------------
Testing for: present
Statistically significant.
4.3051407401552187e-35
------------------------------------
Testing for: cogmech
Statistically significant.
5.70766949628206e-18
------------------------------------
Testing for: inhib
Statistically significant.
4.0578426096331844e-38
------------------------------------
Testing for: incl
Statistically significant.
5.444230463569712e-41
------------------------------------
Testing for: bio
Statistically significant.
1.2738264245751104e-38
------------------------------------
Testing for: verb
Statistically significant.
4.507826072177068e-26
------------------------------------
Testing for: sexual
Statistically significant.
6.495529719886013e-44
------------------------------------
Testing for: social
Statistically significant.
6.11

## Comparação

In [42]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: funct (Telegram: 12.242385184467153, Whatsapp: 10.12868688937916, Diff: 2.1136982950879926)
Statistically significant.
p-value: 0.0
Difference: 2.1136982950879926
------------------------------------
Testing for: pronoun (Telegram: 1.4714561581775183, Whatsapp: 1.2459592151875043, Diff: 0.225496942990014)
Statistically significant.
p-value: 9.08197241598167e-301
Difference: 0.225496942990014
------------------------------------
Testing for: ipron (Telegram: 1.3686477332047644, Whatsapp: 1.1582018486595287, Diff: 0.21044588454523572)
Statistically significant.
p-value: 1.436304919948257e-281
Difference: 0.21044588454523572
------------------------------------
Testing for: cogmech (Telegram: 23.401184557077105, Whatsapp: 19.45914657220398, Diff: 3.9420379848731244)
Statistically significant.
p-value: 0.0
Difference: 3.9420379848731244
------------------------------------
Testing for: certain (Telegram: 3.8048182172758036, Whatsapp: 2.959052006734222, Diff: 0.8457662105415817

### Calculando intervalo de Confiança

In [43]:

sig_common = []
not_sig_common = []

z = norm.ppf(0.975)  # 95% de confiança
results = []
diferencas = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name] / 100
        percentage_whatsapp = percentages_whatsapp[category_name] / 100

        n_telegram = total_word_count_telegram
        n_whatsapp = total_word_count_whatsapp

        p1 = percentage_telegram
        p2 = percentage_whatsapp
        diff = p1 - p2
        diferencas.append(diff)

        se = sqrt((p1 * (1 - p1) / n_telegram) + (p2 * (1 - p2) / n_whatsapp))
        ci_low = diff - z * se
        ci_high = diff + z * se

        count_telegram = p1 * n_telegram
        count_whatsapp = p2 * n_whatsapp

        obs = np.array([
            [count_telegram, n_telegram - count_telegram],
            [count_whatsapp, n_whatsapp - count_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)
        is_significant = p <= 0.05

        if is_significant:
            sig_common.append(category_name)
        else:
            not_sig_common.append(category_name)

        # Armazenar resultado
        results.append({
            "categoria": category_name,
            "proporcao_telegram": round(p1, 4),
            "proporcao_whatsapp": round(p2, 4),
            "diferenca": round(diff, 4),
            "ic_95_inf": round(ci_low, 4),
            "ic_95_sup": round(ci_high, 4),
            "p_valor": round(p, 4),
            "significativo": "Sim" if is_significant else "Não"
        })


# === Exportar para CSV ===
df_resultados = pd.DataFrame(results)
df_resultados.to_csv("../resultados/resultados_liwc/comparacao_proporcoes_categorias_Semfiltro.csv", index=False, sep=';')

menor_valor = df_resultados['ic_95_inf'].min()
maior_valor = df_resultados['ic_95_sup'].max()

print(f"Menor valor inferior dos ICs: {menor_valor:.6f}")
print(f"Maior valor superior dos ICs: {maior_valor:.6f}")

# === Resumo no terminal ===
print("\n=== Resumo Final ===")
print(f"Total de categorias comparadas: {len(results)}")
print(f"Categorias com diferença significativa (p <= 0.05): {len(sig_common)}")
print(f"Categorias sem diferença significativa: {len(not_sig_common)}")


Menor valor inferior dos ICs: -0.001900
Maior valor superior dos ICs: 0.039800

=== Resumo Final ===
Total de categorias comparadas: 67
Categorias com diferença significativa (p <= 0.05): 65
Categorias sem diferença significativa: 2


# Filtro religioso

## Telegram

In [44]:
palavras_religiosas = [
    "deus", "jesus", "misericordia", "davi",
    "salomao", "reino", "templo", "conservador",
    "pentecostal", "rcc", "renovacao", "carismatic",
    "paulo ricardo", "bernardo kuster", "herege", "ateu",
    "jerico", "heresia"
]


In [45]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_telegram = df_geral_telegram[df_geral_telegram['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [46]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_religiao_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 391847, 'relativ': 253808, 'verb': 237402, 'affect': 223394, 'funct': 217208, 'social': 172598, 'swear': 172099, 'humans': 155646, 'posemo': 141808, 'percept': 134279, 'insight': 133899, 'achieve': 126550, 'time': 116801, 'ingest': 111518, 'present': 109809, 'motion': 108964, 'cause': 96413, 'inhib': 90588, 'space': 76159, 'bio': 73771, 'tentat': 72694, 'negemo': 70283, 'past': 70216, 'certain': 67620, 'work': 66427, 'relig': 63535, 'quant': 62409, 'feel': 56683, 'incl': 48664, 'auxverb': 48320, 'discrep': 45069, 'money': 42338, 'adverb': 37027, 'anger': 36535, 'leisure': 35723, 'see': 33728, 'hear': 32065, 'pronoun': 29538, 'ipron': 26585, 'health': 22398, 'sad': 20990, 'friend': 20767, 'Unknown': 20734, 'body': 19456, 'preps': 18384, 'future': 17879, 'sexual': 17077, 'death': 15922, 'excl': 14908, 'anx': 11525, 'conj': 10582, 'family': 9474, 'ppron': 8646, 'number': 7462, 'negate': 7459, 'home': 6905, 'nonfl': 6361, 'we': 56

In [47]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: funct
Statistically significant.
9.646915385622124e-24
------------------------------------
Testing for: present
Statistically significant.
1.5042393983616758e-33
------------------------------------
Testing for: quant
Statistically significant.
1.9327085581914706e-38
------------------------------------
Testing for: swear
Statistically significant.
1.2040540031469304e-27
------------------------------------
Testing for: social
Statistically significant.
1.3351643203974845e-27
------------------------------------
Testing for: affect
Statistically significant.
3.1291910585376463e-23
------------------------------------
Testing for: posemo
Statistically significant.
1.9242140100880413e-30
------------------------------------
Testing for: negemo
Statistically significant.
1.326090930345559e-37
------------------------------------
Testing for: anger
Statistically significant.
2.955644735132297e-41
------------------------------------
Testing for: verb
Statistically significant

## WhatsApp

In [48]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_religiao_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [49]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_religiao_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 359728, 'relativ': 261447, 'verb': 226949, 'affect': 203414, 'funct': 201316, 'social': 158240, 'swear': 148254, 'humans': 145659, 'posemo': 125932, 'percept': 123497, 'achieve': 119860, 'time': 117971, 'ingest': 115311, 'insight': 113557, 'motion': 112123, 'present': 104606, 'cause': 93874, 'space': 83055, 'inhib': 80753, 'bio': 73387, 'negemo': 68143, 'past': 66952, 'work': 62558, 'tentat': 61360, 'certain': 59662, 'feel': 50967, 'relig': 50332, 'incl': 47685, 'quant': 47116, 'money': 46746, 'auxverb': 45924, 'Unknown': 41715, 'discrep': 39222, 'leisure': 38527, 'anger': 36152, 'see': 32106, 'adverb': 31298, 'hear': 27558, 'pronoun': 26651, 'ipron': 24236, 'health': 23509, 'sad': 21690, 'body': 21061, 'preps': 20060, 'sexual': 17673, 'friend': 17544, 'future': 17498, 'death': 16505, 'excl': 14141, 'family': 11602, 'conj': 10904, 'number': 10470, 'anx': 9037, 'home': 8490, 'ppron': 7321, 'negate': 5927, 'we': 4880, 'nonfl': 3

In [50]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: time
Statistically significant.
7.542328257208883e-34
------------------------------------
Testing for: funct
Statistically significant.
9.709443040243663e-27
------------------------------------
Testing for: past
Statistically significant.
1.2422874297789254e-38
------------------------------------
Testing for: social
Statistically significant.
2.640328261223431e-30
------------------------------------
Testing for: affect
Statistically significant.
1.4285614806973799e-26
------------------------------------
Testing for: negemo
Statistically significant.
1.6203900533753187e-38
------------------------------------
Testing for: anger
Statistically significant.
1.1166768039888234e-41
------------------------------------
Testing for: verb
Statistically significant.
9.961077780853801e-25
------------------------------------
Testing for: ingest
Statistically significant.
4.32724879223847e-34
------------------------------------
Testing for: relativ
Statistically significant.
3.7

## Comparação

In [51]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")


Testing for: funct (Telegram: 14.484917435010331, Whatsapp: 12.15435590052212, Diff: 2.3305615344882114)
Statistically significant.
p-value: 0.0
Difference: 2.3305615344882114
------------------------------------
Testing for: present (Telegram: 7.322816372422053, Whatsapp: 6.3155365362416145, Diff: 1.0072798361804383)
Statistically significant.
p-value: 3.4117010053618372e-276
Difference: 1.0072798361804383
------------------------------------
Testing for: quant (Telegram: 4.161859656189273, Whatsapp: 2.8446056578165675, Diff: 1.3172539983727058)
Statistically significant.
p-value: 0.0
Difference: 1.3172539983727058
------------------------------------
Testing for: swear (Telegram: 11.476740293395467, Whatsapp: 8.950763375370094, Diff: 2.525976918025373)
Statistically significant.
p-value: 0.0
Difference: 2.525976918025373
------------------------------------
Testing for: social (Telegram: 11.510017031821631, Whatsapp: 9.553663284083829, Diff: 1.9563537477378024)
Statistically signific

## Calculando intervalo de confiança

In [52]:

sig_common = []
not_sig_common = []

z = norm.ppf(0.975)  # 95% de confiança
results = []
diferencas = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name] / 100
        percentage_whatsapp = percentages_whatsapp[category_name] / 100

        n_telegram = total_word_count_telegram
        n_whatsapp = total_word_count_whatsapp

        p1 = percentage_telegram
        p2 = percentage_whatsapp
        diff = p1 - p2
        diferencas.append(diff)

        se = sqrt((p1 * (1 - p1) / n_telegram) + (p2 * (1 - p2) / n_whatsapp))
        ci_low = diff - z * se
        ci_high = diff + z * se

        count_telegram = p1 * n_telegram
        count_whatsapp = p2 * n_whatsapp

        obs = np.array([
            [count_telegram, n_telegram - count_telegram],
            [count_whatsapp, n_whatsapp - count_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)
        is_significant = p <= 0.05

        if is_significant:
            sig_common.append(category_name)
        else:
            not_sig_common.append(category_name)

        # Armazenar resultado
        results.append({
            "categoria": category_name,
            "proporcao_telegram": round(p1, 4),
            "proporcao_whatsapp": round(p2, 4),
            "diferenca": round(diff, 4),
            "ic_95_inf": round(ci_low, 4),
            "ic_95_sup": round(ci_high, 4),
            "p_valor": round(p, 4),
            "significativo": "Sim" if is_significant else "Não"
        })


# === Exportar para CSV ===
df_resultados = pd.DataFrame(results)
df_resultados.to_csv("../resultados/resultados_liwc/comparacao_proporcoes_categorias_filtroReligioso.csv", index=False, sep=';')

menor_valor = df_resultados['ic_95_inf'].min()
maior_valor = df_resultados['ic_95_sup'].max()

print(f"Menor valor inferior dos ICs: {menor_valor:.6f}")
print(f"Maior valor superior dos ICs: {maior_valor:.6f}")

# === Resumo no terminal ===
print("\n=== Resumo Final ===")
print(f"Total de categorias comparadas: {len(results)}")
print(f"Categorias com diferença significativa (p <= 0.05): {len(sig_common)}")
print(f"Categorias sem diferença significativa: {len(not_sig_common)}")


Menor valor inferior dos ICs: -0.011700
Maior valor superior dos ICs: 0.045100

=== Resumo Final ===
Total de categorias comparadas: 66
Categorias com diferença significativa (p <= 0.05): 62
Categorias sem diferença significativa: 4


# Filtro religioso com exclusão de palavras politicas

## Telegram

In [53]:
palavras_politicas = [ "lula", "bolsonaro", "pt", "pl", "stf", "patria", "55", "22", "13", "senadores", "lulaladrao",
                       "urnas", "alexandre", "moraes", "comunismo", "eleicao", "eleicoes", "esquerda", "direita",
                         "presidente", "tse", "fraude", "voto", "turno", "ministro"]

In [54]:
def retirar_mensagens_com_palavras_politicas(word):
    word_lower = word.lower()
    palavras_politica_lower = [palavra.lower() for palavra in palavras_politicas]

    return any(palavra in word_lower for palavra in palavras_politica_lower)


df_politico_telegram = df_religiao_telegram[~df_religiao_telegram['text_processed'].apply(lambda x: retirar_mensagens_com_palavras_politicas(x))]



In [55]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_telegram = Counter()
total_word_count_telegram = 0

for index, row in df_politico_telegram.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_telegram += word_counts
    total_word_count_telegram += total_words

if total_word_count_telegram > 0:
    percentages_telegram = {category: (count / total_word_count_telegram) * 100 for category, count in category_counts_telegram.items()}
else:
    percentages_telegram = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_telegram)


print("\nPercentual de palavras em cada categoria:")
print(percentages_telegram)


Contagem de palavras em cada categoria:
Counter({'cogmech': 51690, 'relativ': 35042, 'verb': 34398, 'affect': 30631, 'funct': 30437, 'social': 24508, 'humans': 22397, 'swear': 21106, 'posemo': 20456, 'percept': 19041, 'relig': 17568, 'insight': 17075, 'present': 16611, 'motion': 16293, 'achieve': 16201, 'time': 16122, 'ingest': 14481, 'cause': 13033, 'inhib': 12115, 'bio': 11252, 'tentat': 11228, 'negemo': 9208, 'certain': 8928, 'past': 8815, 'auxverb': 8624, 'space': 8612, 'work': 7572, 'incl': 6593, 'feel': 6531, 'quant': 6383, 'money': 6187, 'discrep': 5801, 'hear': 5798, 'leisure': 5357, 'anger': 4990, 'adverb': 4990, 'pronoun': 4628, 'see': 4535, 'ipron': 3864, 'health': 3477, 'sexual': 3458, 'body': 2934, 'future': 2910, 'sad': 2792, 'friend': 2768, 'death': 2548, 'preps': 2153, 'excl': 1998, 'ppron': 1564, 'anx': 1520, 'family': 1422, 'conj': 1390, 'Unknown': 1169, 'negate': 1165, 'home': 940, 'we': 788, 'nonfl': 635, 'number': 584, 'i': 510, 'shehe': 453, 'assent': 415, 'filler

In [56]:

sig = []
not_sig = []

for category_name, percentage in percentages_telegram.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: relativ
Statistically significant.
3.952873106818812e-19
------------------------------------
Testing for: time
Statistically significant.
7.23422141810929e-32
------------------------------------
Testing for: funct
Statistically significant.
6.44787212499517e-22
------------------------------------
Testing for: auxverb
Statistically significant.
7.418022053353301e-38
------------------------------------
Testing for: present
Statistically significant.
1.7037473317756462e-31
------------------------------------
Testing for: future
Statistically significant.
8.846945289720321e-43
------------------------------------
Testing for: cogmech
Statistically significant.
1.0063749940214861e-10
------------------------------------
Testing for: verb
Statistically significant.
1.6564092725645282e-19
------------------------------------
Testing for: cause
Statistically significant.
2.863025980922799e-34
------------------------------------
Testing for: motion
Statistically significant.


## WhatsApp

In [57]:
# Função para verificar se uma palavra está relacionada à religião
def relacionada_religiao(word):
    word_lower = word.lower()
    palavras_religiosas_lower = [palavra.lower() for palavra in palavras_religiosas]

    # Verificando se alguma palavra da lista de palavras religiosas está presente
    return any(palavra in word_lower for palavra in palavras_religiosas_lower)


df_politico_whatsapp = df_geral_whatsapp[df_geral_whatsapp['text_processed'].apply(lambda x: relacionada_religiao(x))]



In [58]:
# Calculando contagem de palavras em categorias e total de palavras
category_counts_whatsapp = Counter()
total_word_count_whatsapp = 0

for index, row in df_politico_whatsapp.iterrows():
    word_counts, total_words = count_words_in_categories(row['text_processed'], liwc, category_names)
    category_counts_whatsapp += word_counts
    total_word_count_whatsapp += total_words

if total_word_count_whatsapp > 0:
    percentages_whatsapp = {category: (count / total_word_count_whatsapp) * 100 for category, count in category_counts_whatsapp.items()}
else:
    percentages_whatsapp = {}

print("Contagem de palavras em cada categoria:")
print(category_counts_whatsapp)


print("\nPercentual de palavras em cada categoria:")
print(percentages_whatsapp)


Contagem de palavras em cada categoria:
Counter({'cogmech': 359728, 'relativ': 261447, 'verb': 226949, 'affect': 203414, 'funct': 201316, 'social': 158240, 'swear': 148254, 'humans': 145659, 'posemo': 125932, 'percept': 123497, 'achieve': 119860, 'time': 117971, 'ingest': 115311, 'insight': 113557, 'motion': 112123, 'present': 104606, 'cause': 93874, 'space': 83055, 'inhib': 80753, 'bio': 73387, 'negemo': 68143, 'past': 66952, 'work': 62558, 'tentat': 61360, 'certain': 59662, 'feel': 50967, 'relig': 50332, 'incl': 47685, 'quant': 47116, 'money': 46746, 'auxverb': 45924, 'Unknown': 41715, 'discrep': 39222, 'leisure': 38527, 'anger': 36152, 'see': 32106, 'adverb': 31298, 'hear': 27558, 'pronoun': 26651, 'ipron': 24236, 'health': 23509, 'sad': 21690, 'body': 21061, 'preps': 20060, 'sexual': 17673, 'friend': 17544, 'future': 17498, 'death': 16505, 'excl': 14141, 'family': 11602, 'conj': 10904, 'number': 10470, 'anx': 9037, 'home': 8490, 'ppron': 7321, 'negate': 5927, 'we': 4880, 'nonfl': 3

In [59]:

sig = []
not_sig = []

for category_name, percentage in percentages_whatsapp.items():
    print("Testing for:", category_name)

    # Contagem de observações
    obs = np.array([
        [percentage, 100 - percentage],
        [100 - percentage, percentage]
    ])

    chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    if p <= 0.05:
        print("Statistically significant.")
        sig.append(category_name)
    else:
        print("No significant difference.")
        not_sig.append(category_name)

    print(p)
    print("------------------------------------")


Testing for: time
Statistically significant.
7.542328257208883e-34
------------------------------------
Testing for: funct
Statistically significant.
9.709443040243663e-27
------------------------------------
Testing for: past
Statistically significant.
1.2422874297789254e-38
------------------------------------
Testing for: social
Statistically significant.
2.640328261223431e-30
------------------------------------
Testing for: affect
Statistically significant.
1.4285614806973799e-26
------------------------------------
Testing for: negemo
Statistically significant.
1.6203900533753187e-38
------------------------------------
Testing for: anger
Statistically significant.
1.1166768039888234e-41
------------------------------------
Testing for: verb
Statistically significant.
9.961077780853801e-25
------------------------------------
Testing for: ingest
Statistically significant.
4.32724879223847e-34
------------------------------------
Testing for: relativ
Statistically significant.
3.7

## Comparação

In [60]:
# Calculando a diferença de proporções e significância estatística para categorias em comum
# Exemplo: "A proporção de palavras da categoria 'funct' é independente da plataforma (Telegram vs WhatsApp)".
sig_common = []
not_sig_common = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name]
        percentage_whatsapp = percentages_whatsapp[category_name]

        diff = abs(percentage_telegram - percentage_whatsapp)
        print(f"Testing for: {category_name} (Telegram: {percentage_telegram}, Whatsapp: {percentage_whatsapp}, Diff: {diff})")

        # Contagem de observações
        total_telegram = total_word_count_telegram * percentage_telegram / 100
        total_whatsapp = total_word_count_whatsapp * percentage_whatsapp / 100
        obs = np.array([
            [total_telegram, total_word_count_telegram - total_telegram],
            [total_whatsapp, total_word_count_whatsapp - total_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False) # comparando proporções categoria por categoria

        if p <= 0.05:
            print("Statistically significant.")
            sig_common.append(category_name)
        else:
            print("No significant difference.")
            not_sig_common.append(category_name)

        print("p-value:", p)
        print("Difference:", diff)
        print("------------------------------------")
    else:
        print(f"Category {category_name} not found in WPP dataset.")

# Exemplo de saída esperada para uma categoria
print("Categorias com diferenças estatisticamente significativas entre Telegram e WPP:")
for category in sig_common:
    print(category)

# Saídas de teste para verificar o cálculo da diferença
print(f"Differences calculated for categories: {sig_common}")
print(f"Not significant differences for categories: {not_sig_common}")

Testing for: relativ (Telegram: 18.398420682340834, Whatsapp: 15.784735873570934, Diff: 2.6136848087699)
Statistically significant.
p-value: 2.575489322382164e-190
Difference: 2.6136848087699
------------------------------------
Testing for: time (Telegram: 8.464680618706094, Whatsapp: 7.122441931791287, Diff: 1.3422386869148069)
Statistically significant.
p-value: 2.1288199179149796e-101
Difference: 1.3422386869148069
------------------------------------
Testing for: funct (Telegram: 15.980615555858913, Whatsapp: 12.15435590052212, Diff: 3.826259655336793)
Statistically significant.
p-value: 0.0
Difference: 3.826259655336793
------------------------------------
Testing for: auxverb (Telegram: 4.527937331331184, Whatsapp: 2.7726392357069374, Diff: 1.755298095624247)
Statistically significant.
p-value: 0.0
Difference: 1.755298095624247
------------------------------------
Testing for: present (Telegram: 8.721424746143587, Whatsapp: 6.3155365362416145, Diff: 2.4058882099019723)
Statistic

### Calculando Intervalo de confiança

In [61]:

sig_common = []
not_sig_common = []

z = norm.ppf(0.975)  # 95% de confiança
results = []
diferencas = []

for category_name in percentages_telegram:
    if category_name in percentages_whatsapp:
        percentage_telegram = percentages_telegram[category_name] / 100
        percentage_whatsapp = percentages_whatsapp[category_name] / 100

        n_telegram = total_word_count_telegram
        n_whatsapp = total_word_count_whatsapp

        p1 = percentage_telegram
        p2 = percentage_whatsapp
        diff = p1 - p2
        diferencas.append(diff)

        se = sqrt((p1 * (1 - p1) / n_telegram) + (p2 * (1 - p2) / n_whatsapp))
        ci_low = diff - z * se
        ci_high = diff + z * se

        count_telegram = p1 * n_telegram
        count_whatsapp = p2 * n_whatsapp

        obs = np.array([
            [count_telegram, n_telegram - count_telegram],
            [count_whatsapp, n_whatsapp - count_whatsapp]
        ])

        chi2, p, dof, ex = chi2_contingency(obs, correction=False)
        is_significant = p <= 0.05

        if is_significant:
            sig_common.append(category_name)
        else:
            not_sig_common.append(category_name)

        # Armazenar resultado
        results.append({
            "categoria": category_name,
            "proporcao_telegram": round(p1, 4),
            "proporcao_whatsapp": round(p2, 4),
            "diferenca": round(diff, 4),
            "ic_95_inf": round(ci_low, 4),
            "ic_95_sup": round(ci_high, 4),
            "p_valor": round(p, 4),
            "significativo": "Sim" if is_significant else "Não"
        })


# === Exportar para CSV ===
df_resultados = pd.DataFrame(results)
df_resultados.to_csv("../resultados/resultados_liwc/comparacao_proporcoes_categorias_filtroPolitico.csv", index=False, sep=';')

menor_valor = df_resultados['ic_95_inf'].min()
maior_valor = df_resultados['ic_95_sup'].max()

print(f"Menor valor inferior dos ICs: {menor_valor:.6f}")
print(f"Maior valor superior dos ICs: {maior_valor:.6f}")

# === Resumo no terminal ===
print("\n=== Resumo Final ===")
print(f"Total de categorias comparadas: {len(results)}")
print(f"Categorias com diferença significativa (p <= 0.05): {len(sig_common)}")
print(f"Categorias sem diferença significativa: {len(not_sig_common)}")



Menor valor inferior dos ICs: -0.019500
Maior valor superior dos ICs: 0.063200

=== Resumo Final ===
Total de categorias comparadas: 64
Categorias com diferença significativa (p <= 0.05): 62
Categorias sem diferença significativa: 2
