In [None]:
#!pip install pysentimiento
#!pip install transformers

In [1]:
import pandas as pd
import os
import re
import json
import time # to call the .sleep() method to include a pause that respects potential respect API limits
import matplotlib.pyplot as plt
import transformers

from pysentimiento import create_analyzer
from pysentimiento.preprocessing import preprocess_tweet

In [2]:
from datetime import datetime

class DateTimeEncoder(json.JSONEncoder):
    """Custom encoder for datetime objects."""
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()  # Convert datetime to ISO format string
        # Let the base class default method raise the TypeError
        return super().default(obj)

In [3]:
transformers.logging.set_verbosity(transformers.logging.ERROR)

In [4]:
sentiment_analyzer = create_analyzer(task="sentiment", lang="es")

In [5]:
hate_speech_analyzer = create_analyzer(task="hate_speech", lang="es")

In [6]:
emotion_analyzer = create_analyzer(task="emotion", lang="es")

In [None]:
#preprocess_tweet??

In [7]:
comment = "CAGASTI A LA CARCEL POR GENOCIDIO,  EL PODER JUDICIAL DEBE PRONUNCIARSE AL RESPECTO, LOS FISCALES DE OFICIO DEBEN PROCESARLO POR INEPTO, la vacuna Rusa era gratis, y no podia coimear,  por eso prefirio comprar los placebos Chinos,  porque ahi si cutrea de lo lindo,  tipo corrupto, es para arrastrarlo de las barbas  por palacio de gobierno hasta echarlo a las calles como el despojo humano que es."

In [8]:
preprocess_tweet(comment)

'CAGASTI A LA CARCEL POR GENOCIDIO,  EL PODER JUDICIAL DEBE PRONUNCIARSE AL RESPECTO, LOS FISCALES DE OFICIO DEBEN PROCESARLO POR INEPTO, la vacuna Rusa era gratis, y no podia coimear,  por eso prefirio comprar los placebos Chinos,  porque ahi si cutrea de lo lindo,  tipo corrupto, es para arrastrarlo de las barbas  por palacio de gobierno hasta echarlo a las calles como el despojo humano que es.'

In [9]:
print(sentiment_analyzer.predict(comment))
print(hate_speech_analyzer.predict(comment))
print(emotion_analyzer.predict(comment))

AnalyzerOutput(output=NEG, probas={NEG: 0.959, NEU: 0.031, POS: 0.009})
AnalyzerOutput(output=[], probas={hateful: 0.106, targeted: 0.020, aggressive: 0.086})
AnalyzerOutput(output=anger, probas={anger: 0.749, disgust: 0.243, sadness: 0.002, others: 0.002, fear: 0.002, surprise: 0.001, joy: 0.001})


In [10]:
comment_2 = "SIN DUDA ES EL MEJOR NOTICIERO"

print(sentiment_analyzer.predict(comment_2))

print(hate_speech_analyzer.predict(comment_2))

print(emotion_analyzer.predict(comment_2))


AnalyzerOutput(output=POS, probas={POS: 0.958, NEU: 0.038, NEG: 0.004})
AnalyzerOutput(output=[], probas={hateful: 0.011, targeted: 0.011, aggressive: 0.009})
AnalyzerOutput(output=joy, probas={joy: 0.517, others: 0.433, surprise: 0.044, disgust: 0.003, anger: 0.002, fear: 0.002, sadness: 0.001})


In [12]:
comment_3 = "SAGASTI Y EL MITOMANO DE VIZCARRA SON GENOSIDAS DEBEN SER FUSILADOS EN PUBLICO CUANTAS FAMILIAS LLORAN SUS MUERTOS POR QUE SE CIERRAN CON LAS VACUNAS CHINAS QUE NO SIRVE NI DESINFECTANTE."

print(sentiment_analyzer.predict(comment_3))

print(hate_speech_analyzer.predict(comment_3))

print(emotion_analyzer.predict(comment_3))

AnalyzerOutput(output=NEG, probas={NEG: 0.970, NEU: 0.025, POS: 0.005})
AnalyzerOutput(output=[], probas={hateful: 0.073, targeted: 0.007, aggressive: 0.060})
AnalyzerOutput(output=anger, probas={anger: 0.741, disgust: 0.248, sadness: 0.006, others: 0.002, fear: 0.002, joy: 0.001, surprise: 0.001})


In [None]:
'''
# Pseudocode
1. check the structure of json file to determine course of action
1.1 Each dictionary contains the comments to a video
2. open the json file in read mode
2.1 use pandas to see the info
3. iterate over it to get the information. 
4. create a new dictionary per each video with video_id, user_id, other relevant info decided in step 1.1, and results of the sentiment, emotion and hate analysis
5. Manually iterate over the comment content to see if they respond to the misinfo, and how other users reacted to said response.
'''

In [7]:
filepath = "data/willax_pbo_youtube_vids/wil_pbo_sinoph_comments.json"

In [8]:
with open(filepath, 'r') as file:
    video_comments = json.load(file)

In [9]:
def analyze_comment(comment_text):
    from pysentimiento import create_analyzer
    from pysentimiento.preprocessing import preprocess_tweet
    
    # Assuming the necessary imports are done and analyzers are created outside the function to avoid reinitialization for every call.
    processed_text = preprocess_tweet(comment_text)
    sentiment_result = sentiment_analyzer.predict(processed_text)
    hate_speech_result = hate_speech_analyzer.predict(processed_text)
    emotion_result = emotion_analyzer.predict(processed_text)

    # Return a dictionary containing the results
    return {
        'sentiment': sentiment_result,
        'hate_speech': hate_speech_result,
        'emotion': emotion_result
    }

In [None]:
#video_comments.keys()

In [10]:
# Dictionary to store analyzed comments by video ID
anlzd_vcoms_dict = {}

# Iterate through each video ID and its corresponding comments list
for video_id, comments in video_comments.items():
    # List to hold the analyzed comments for the current video
    analyzed_comments = []
    
    # Iterate through each comment dictionary in the comments list
    for comment in comments:
        # Apply your sentiment and emotion analysis on the comment text
        analysis_result = analyze_comment(comment['text'])
        # Add the analysis result to the list of analyzed comments for this video
        analyzed_comments.append(analysis_result)

    # Store the analyzed comments in the dictionary with the video_id as key
    anlzd_vcoms_dict[video_id] = analyzed_comments

# Now, 'anlzd_vcoms_dict' contains analyzed data ready for further use.


In [21]:
type(anlzd_vcoms_dict)
len(anlzd_vcoms_dict)
anlzd_vcoms_dict.keys()
anlzd_vcoms_dict.values()


dict_values([[{'sentiment': AnalyzerOutput(output=NEU, probas={NEU: 0.855, POS: 0.088, NEG: 0.058}), 'hate_speech': AnalyzerOutput(output=[], probas={hateful: 0.072, targeted: 0.016, aggressive: 0.028}), 'emotion': AnalyzerOutput(output=others, probas={others: 0.988, joy: 0.006, surprise: 0.002, anger: 0.001, sadness: 0.001, disgust: 0.001, fear: 0.001})}, {'sentiment': AnalyzerOutput(output=NEG, probas={NEG: 0.959, NEU: 0.031, POS: 0.009}), 'hate_speech': AnalyzerOutput(output=[], probas={hateful: 0.106, targeted: 0.020, aggressive: 0.086}), 'emotion': AnalyzerOutput(output=anger, probas={anger: 0.749, disgust: 0.243, sadness: 0.002, others: 0.002, fear: 0.002, surprise: 0.001, joy: 0.001})}, {'sentiment': AnalyzerOutput(output=POS, probas={POS: 0.958, NEU: 0.038, NEG: 0.004}), 'hate_speech': AnalyzerOutput(output=[], probas={hateful: 0.011, targeted: 0.011, aggressive: 0.009}), 'emotion': AnalyzerOutput(output=joy, probas={joy: 0.517, others: 0.433, surprise: 0.044, disgust: 0.003, a

In [12]:
anlzd_vcoms_file = "data/willax_pbo_youtube_vids/wil_pbo_sinoph_anlzd_vcoms.json"

with open(anlzd_vcoms_file, 'w') as file:
    json.dump(anlzd_vcoms_dict, file, cls=DateTimeEncoder, indent=4)

print(f"Data saved to {anlzd_vcoms_file}")

TypeError: Object of type AnalyzerOutput is not JSON serializable

In [None]:
df_prsd_comments = pd.DataFrame(video_comments['9K9Vpk2N38M'])
df_prsd_comments

In [None]:
from collections import defaultdict

# Dictionary to hold counts of each type found in 'values'
type_counts = defaultdict(int)
total_values_count = 0  # Counter for all values

for keys, values in video_comments.items():
    # Increment total count by the number of items in values
    total_values_count += len(values)
    for v in values:
        # Increment count for the type of v
        type_counts[type(v).__name__] += 1

# Print out the counts of each type
for type_name, count in type_counts.items():
    print(f"Found {count} instances of type {type_name}")

# Print total number of values processed
print(f"Total number of items in values: {total_values_count}")

# Optionally, print the number of non-dict items if needed
if 'dict' in type_counts:
    non_dict_count = total_values_count - type_counts['dict']
    print(f"Number of non-dict items: {non_dict_count}")


In [None]:
df_prsd_comments.tail(15)