# Importar datos MFRC

In [1]:
#!pip install huggingface_hub
#!pip install vaderSentiment
#!pip install pandas
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Cargar los datos de Reddit
df = pd.read_csv("hf://datasets/USC-MOLA-Lab/MFRC/final_mfrc_data.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Utilizar explode para desglosar las anotaciones separadas por comas en filas individuales
df = df.assign(annotation=df['annotation'].str.split(',')).explode('annotation')
# Reiniciar el índice
df = df.reset_index(drop=True)
# Frecuencia
df['annotation'].value_counts()

annotation
Non-Moral          30774
Thin Morality      11123
Care                7761
Equality            5105
Authority           4807
Proportionality     4593
Loyalty             2794
Purity              2171
Name: count, dtype: int64

In [3]:
category_mapping = {
    'Non-Moral': 'non-moral',
    'Thin Morality': None,  # Eliminar Thin Morality
    'Care': 'care',
    'Equality': 'fairness',
    'Authority': 'authority',
    'Proportionality': 'fairness',
    'Loyalty': 'loyalty',
    'Purity': 'purity'
}

# Aplicar el mapeo de categorías
df['annotation'] = df['annotation'].map(category_mapping)

# Eliminar las filas con anotaciones None (que corresponde a Thin Morality)
df = df[df['annotation'].notna()]

# Inicializar VADER
analyzer = SentimentIntensityAnalyzer()

# Función para asignar puntajes de sentimiento y determinar la moral foundation con polaridad
def map_categories_with_polarity(text, category):
    scores = analyzer.polarity_scores(text)
    if category in ['care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal', 'authority', 'subversion', 'purity', 'degradation']:
        if scores['compound'] >= 0:
            return category
        else:
            # Añadir "vice" o "virtue" dependiendo de la polaridad
            if category == 'care':
                return 'harm'
            elif category == 'fairness':
                return 'cheating'
            elif category == 'loyalty':
                return 'betrayal'
            elif category == 'authority':
                return 'subversion'
            elif category == 'purity':
                return 'degradation'
            else:
                return category
    else:
        return 'non-moral'

# Aplicar la función al dataframe de Reddit
df['mapped_category'] = df.apply(lambda row: map_categories_with_polarity(row['text'], row['annotation']), axis=1)

# Filtrar solo las categorías especificadas
desired_categories = ['non-moral', 'care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal', 'authority', 'subversion', 'purity', 'degradation']
df_filtered = df[df['mapped_category'].isin(desired_categories)]

# Frecuencia de anotaciones mapeadas
annotation_counts = df_filtered['mapped_category'].value_counts()
print("Frecuencia de anotaciones mapeadas:")
print(annotation_counts)

# Frecuencia de sentimientos
sentiment_counts = df_filtered['mapped_category'].value_counts()
print("\nFrecuencia de sentimientos:")
print(sentiment_counts)


Frecuencia de anotaciones mapeadas:
mapped_category
non-moral      30774
cheating        5131
fairness        4567
harm            4468
care            3293
subversion      2465
authority       2342
loyalty         1612
degradation     1406
betrayal        1182
purity           765
Name: count, dtype: int64

Frecuencia de sentimientos:
mapped_category
non-moral      30774
cheating        5131
fairness        4567
harm            4468
care            3293
subversion      2465
authority       2342
loyalty         1612
degradation     1406
betrayal        1182
purity           765
Name: count, dtype: int64


In [4]:
MFR = df_filtered
MFR.head()

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence,mapped_category
0,That particular part of the debate is especial...,europe,French politics,annotator03,non-moral,Confident,non-moral
1,That particular part of the debate is especial...,europe,French politics,annotator01,purity,Confident,degradation
3,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator03,non-moral,Confident,non-moral
4,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator00,non-moral,Somewhat Confident,non-moral
5,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator02,non-moral,Confident,non-moral


In [5]:
# Crear la tabla de contingencia
contingency_table = pd.crosstab(MFR['mapped_category'], MFR['annotation'])

# Mostrar la tabla de contingencia
print("\nTabla de contingencia:")
print(contingency_table)

# Frecuencia de anotaciones
annotation_counts = MFR['mapped_category'].value_counts()
print(annotation_counts)


Tabla de contingencia:
annotation       authority  care  fairness  loyalty  non-moral  purity
mapped_category                                                       
authority             2342     0         0        0          0       0
betrayal                 0     0         0     1182          0       0
care                     0  3293         0        0          0       0
cheating                 0     0      5131        0          0       0
degradation              0     0         0        0          0    1406
fairness                 0     0      4567        0          0       0
harm                     0  4468         0        0          0       0
loyalty                  0     0         0     1612          0       0
non-moral                0     0         0        0      30774       0
purity                   0     0         0        0          0     765
subversion            2465     0         0        0          0       0
mapped_category
non-moral      30774
cheating        

In [6]:
# Asumiendo que MFR es tu DataFrame original¿
MFR.drop(MFR.columns[4], axis=1, inplace=True)
MFR.rename(columns={'mapped_category': 'annotation'}, inplace=True)


In [7]:
MFR 

Unnamed: 0,text,subreddit,bucket,annotator,confidence,annotation
0,That particular part of the debate is especial...,europe,French politics,annotator03,Confident,non-moral
1,That particular part of the debate is especial...,europe,French politics,annotator01,Confident,degradation
3,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator03,Confident,non-moral
4,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator00,Somewhat Confident,non-moral
5,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator02,Confident,non-moral
...,...,...,...,...,...,...
69120,Youre allowed to seek attention. Maybe someth...,confession,Everyday Morality,annotator05,Somewhat Confident,fairness
69122,"Yeeeaah, if you take him back he's still gonna...",relationship_advice,Everyday Morality,annotator05,Confident,loyalty
69123,Well I can discern from your vehemence toward ...,AmItheAsshole,Everyday Morality,annotator05,Confident,cheating
69126,Yes. Disordered eating is insidious. And Rita ...,AmItheAsshole,Everyday Morality,annotator05,Somewhat Confident,non-moral


In [8]:
filtered_df = MFR[MFR["confidence"] == "Confident"]

In [10]:
filtered_df["annotation"].value_counts()

annotation
non-moral      24190
cheating        4018
fairness        3448
harm            3414
care            2469
subversion      1493
authority       1415
loyalty         1128
degradation     1113
betrayal         815
purity           566
Name: count, dtype: int64

In [15]:
filtered_df['annotation_binary'] = filtered_df['annotation'].apply(lambda x: 1 if x == 'non-moral' else 0)

# Display the updated dataframe with the new column
updated_df = filtered_df[["text", "annotation_binary"]]
updated_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['annotation_binary'] = filtered_df['annotation'].apply(lambda x: 1 if x == 'non-moral' else 0)


Unnamed: 0,text,annotation_binary
0,That particular part of the debate is especial...,1
1,That particular part of the debate is especial...,0
3,"/r/france is pretty lively, with it's own ling...",1
5,"/r/france is pretty lively, with it's own ling...",1
9,it really is a very unusual situation isn't it...,1
...,...,...
69115,Of course. Not wanting it doesn’t make it any ...,1
69116,"At least you feel guilty, let’s you know you’r...",0
69119,Why are you dating this guy? Dude seems like a...,0
69122,"Yeeeaah, if you take him back he's still gonna...",0


In [17]:
deduplicated_df = updated_df.drop_duplicates()

In [18]:
duplicate_texts = deduplicated_df[deduplicated_df.duplicated(subset='text', keep=False)]
duplicate_texts

Unnamed: 0,text,annotation_binary
0,That particular part of the debate is especial...,1
1,That particular part of the debate is especial...,0
32,Wow did not know all that! Maybe got some sour...,1
34,Wow did not know all that! Maybe got some sour...,0
49,Melenchon's party said they'll hold an interna...,1
...,...,...
69072,"Yeah, this, I don't get why OP turned into the...",0
69076,She is still showing that she can bully you. I...,0
69103,You’re very lucky no one snitched on you. I ha...,0
69105,It did there is proof that it did if you go ba...,0


In [22]:
# Group by the 'text' column and count the occurrences of each text
duplicate_text_counts = deduplicated_df['text'].value_counts().reset_index(name='count')

# Sort by count in descending order
sorted_duplicate_text_counts = duplicate_text_counts.sort_values(by='count', ascending=False)

sorted_duplicate_text_counts[sorted_duplicate_text_counts["count"] == 2]

Unnamed: 0,text,count
0,That particular part of the debate is especial...,2
2608,Le Pen has distanced herself from daddy becaus...,2
2580,"Man, who would've thought that a foreign intel...",2
2581,As much as the idea of a refuged goth nobility...,2
2582,"Sadly, I think most of the Second Amendment cr...",2
...,...,...
1297,No. The word voter does not even appear. It do...,2
1282,The left has nothing but contempt for those th...,2
1295,I didn't hear anybody calling Le Pen anything ...,2
1294,"As an American, thank goodness Fillon is likel...",2


In [24]:
unique_texts_df = deduplicated_df[deduplicated_df['text'].map(deduplicated_df['text'].value_counts()) == 1]


text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    annotation_binary
\n\nIf you prefer not to click on Daily Mail sources, then [here is a screenshot of the original article.](https://vgy.me/BnGnQM)\n\n\nI've also autogenerated some potential alternative sources to this stor

In [25]:
binary_counts = unique_texts_df['annotation_binary'].value_counts()
binary_counts

annotation_binary
1    8674
0    4406
Name: count, dtype: int64

In [26]:
unique_texts_df[unique_texts_df['text'].map(unique_texts_df['text'].value_counts()) == 1]


Unnamed: 0,text,annotation_binary
3,"/r/france is pretty lively, with it's own ling...",1
9,it really is a very unusual situation isn't it...,1
14,The Le Pen brand of conservatism and classical...,1
15,"Macrons face just screams\n""I do not know her,...",1
18,"Clinton lead polls by 4%, well within a reason...",1
...,...,...
68971,"You’re just as bad, if he didn’t cheat you’re ...",0
69014,Sexual harassment isn’t a “type of humor”. It’...,0
69025,You absolutely did the right thing! You’re not...,0
69028,"&gt;""Britain, offensive slang, derogatory"" ac...",0


In [36]:
from transformers import MarianMTModel, MarianTokenizer

# Load the MarianMT model for English to Spanish translation
model_name = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [28]:
def translate_in_batches(text_list, batch_size=10):
    translations = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        translated_batch = model.generate(**tokenizer(batch, return_tensors="pt", padding=True))
        translations.extend([tokenizer.decode(t, skip_special_tokens=True) for t in translated_batch])
    return translations

# Translate the texts in smaller batches
translated_texts = translate_in_batches(unique_texts_df['text'].tolist(), batch_size=5)


In [29]:
translated_texts

['/r/france es bastante animado, con su propia jerga que suele ser deliberadamente malas traducciones - shitpost es "cacapoteau" o "compost", crosspost es "croixpoteau", etc Hay una mezcla de memes, política, noticias internacionales, problemas personales, etc. El tiempo de las elecciones fue bastante entretenido, con la gente de /r/the_doofus viene sobre avd chelín para Le Pen, por lo general muy mal.',
 'Es una situación muy inusual, pero Fillon también influye en los votantes, ambos candidatos más débiles de lo habitual.',
 'La marca Le Pen del conservadurismo y el conservadurismo clásico de derecha son casi completamente diferentes. La derecha tradicionalmente defiende la fe y los valores familiares, la libertad económica y el gobierno pequeño, que son los valores que Fillon parece promover. Le Pen es más autoritaria y estatista en general, mientras que también es pro aborto y matrimonio gay, todo lo contrario en muchos temas. Por mucho que la gente se estremezca ante la mención de

In [30]:
!pip install pandas openpyxl



In [32]:
unique_texts_df['translated_text'] = translated_texts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_texts_df['translated_text'] = translated_texts


In [33]:
# Create the final DataFrame with 'annotation_binary', 'text', and 'translated_text'
final_df = unique_texts_df[['annotation_binary', 'text', 'translated_text']]

In [34]:
final_df

Unnamed: 0,annotation_binary,text,translated_text
3,1,"/r/france is pretty lively, with it's own ling...","/r/france es bastante animado, con su propia j..."
9,1,it really is a very unusual situation isn't it...,"Es una situación muy inusual, pero Fillon tamb..."
14,1,The Le Pen brand of conservatism and classical...,La marca Le Pen del conservadurismo y el conse...
15,1,"Macrons face just screams\n""I do not know her,...","La cara de Macrons simplemente grita ""¡No la c..."
18,1,"Clinton lead polls by 4%, well within a reason...","Clinton lidera las encuestas en un 4%, bien de..."
...,...,...,...
68971,0,"You’re just as bad, if he didn’t cheat you’re ...","Eres igual de malo, si él no te engañó eres aú..."
69014,0,Sexual harassment isn’t a “type of humor”. It’...,El acoso sexual no es un “tipo de humor”. Es a...
69025,0,You absolutely did the right thing! You’re not...,¡Hiciste absolutamente lo correcto! No estás l...
69028,0,"&gt;""Britain, offensive slang, derogatory"" ac...","&gt; ""Gran Bretaña, argot ofensivo, despectivo..."


In [35]:
output_file = "translated_texts_with_binary_fixed.xlsx"
final_df.to_excel(output_file, index=False)