In [1]:
!pip install scipy



In [2]:
!pip install nlpaug



In [3]:
!pip install emoji



In [4]:
# Importamos las librerías necesarias
import emoji
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from ipywidgets import  interact
from collections import Counter
import nlpaug.augmenter.word as naw

#NLTK y RE
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

#GENSIM
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import Doc2Vec

#SKLEARN
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MeanShift, SpectralClustering
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import (accuracy_score, mean_absolute_error, mean_squared_error,
                             classification_report, confusion_matrix, ConfusionMatrixDisplay,
                             homogeneity_score, completeness_score, v_measure_score,
                             adjusted_rand_score, rand_score, silhouette_score,
                             precision_recall_fscore_support)
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation

# Importar las bibliotecas necesarias
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import jensenshannon
from scipy.special import kl_div

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def remove_non_alphabetic(text):
    return ''.join([char for char in text if char.isalpha() or char.isspace() or emoji.is_emoji(char)])

# Algoritmo de preprocesamiento
def preprocess_text(df):

    stop_words = set(stopwords.words('english'))

    # Convertir los textos a minúsculas
    df["text"] = df["text"].str.lower()

    # Eliminar caracteres no alfabéticos, manteniendo emojis
    df["text"] = df["text"].apply(remove_non_alphabetic)

    # Eliminar stop-words
    df["text"] = df["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    return df

# Aplicar el preprocesamiento al DataFrame
#df = preprocess_text(df)

# Inicializar Stemmer y Lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Función para hacer stemming y lematización
def stem_and_lemmatize(text):
    tokens = nltk.word_tokenize(text)
    stemmed = [stemmer.stem(token) for token in tokens]
    lemmatized = [lemmatizer.lemmatize(token) for token in stemmed]
    return ' '.join(lemmatized)

# Aplicar stemming y lematización al DataFrame
#df["text"] = df["text"].apply(stem_and_lemmatize)

In [6]:
# normalizar los textos

# TfidfVectorizer
vectorizer_TFIDF = TfidfVectorizer(max_df=0.3,
                             min_df=8,
                             stop_words="english",
                             lowercase=True,
                             use_idf=True,
                             norm="l2",
                             smooth_idf=True
                                   )

#CountVectorizer
vectorizer_CVec = CountVectorizer(strip_accents = "ascii",
                                  lowercase = True,
                                  stop_words = "english",
                                  max_df = 0.3)

#HashingVectorizer
vectorizer_Hashing = HashingVectorizer(lowercase = True,
                                       stop_words = "english",
                                       norm = "l2"
                                       )

#Comparación Vectorizacion Corpus NO Balanceado vs Corpus Balanceado

En esta seccion se examinará el comportamiento de los corpus de texto aumentados. Para ello, se emplearán dos métodos basados en la distancia: la distancia de Jaccard y el coseno de similitud. Estos métodos permitirán evaluar la similitud entre los conjuntos de palabras y los vectores que representan los textos originales y sus versiones aumentadas.

-**Coseno de Similitud:**

 El coseno de similitud mide la similitud entre dos vectores que representan los textos. Cada vector se compone de la frecuencia de aparición de las palabras en el texto correspondiente. El coseno de similitud varía entre 0 y 1, donde 0 indica que los vectores son ortogonales (no tienen palabras en común) y 1 indica que son idénticos. Un valor de coseno de similitud cercano a 1 indica una alta similitud entre los textos, mientras que un valor cercano a 0 indica una baja similitud.

### Para SEXISMO

In [7]:
#NOT BALANCED
df_NOTbalanced = pd.read_csv("/content/drive/MyDrive/NLP/Corpus/df_sexista.csv")
df_NOTbalanced = df_NOTbalanced.rename(columns = {"sexist":"label"})

In [8]:
### BALANCED
df = pd.read_csv("/content/drive/MyDrive/NLP/Corpus/sexism_data.csv")
df = df.drop(columns=["toxicity","id", "dataset", "of_id"]) #no son nenecsarias
df["sexist"].replace({True:1, False:0},inplace=True)#reemplazamos 1 = True, 0=false
df = df.rename(columns = {"sexist":"label"})

#Data Augmentation
# Calculamos el número de augmentaciones necesarias
num_majority = df['label'].value_counts()[0]
num_minority = df['label'].value_counts()[1]
num_augmentations = num_majority - num_minority

#Data Augmentation
# Usamos nlpaug para hacer data augmentation en la columna 'text'
aug = naw.SynonymAug(aug_src='wordnet')

def augment_text(text, augmenter, n_augmentations=1):
    augmented_texts = [augmenter.augment(text) for _ in range(n_augmentations)]
    return augmented_texts

# Aplicamos augmentación a los textos etiquetados como 'Hate'
df_label_balanced = df[df['label'] == 1]
augmented_texts = []

for text in df_label_balanced['text']:
    augmented_texts.extend(augment_text(text, aug, n_augmentations=int(num_augmentations/num_minority)))

# Creamos un DataFrame con los textos aumentados
df_augmented = pd.DataFrame(augmented_texts, columns=['text'])
df_augmented['label'] = 1

# Concatenar los textos aumentados con el dataset original
df_balanced = pd.concat([df, df_augmented])

#limpiamos texto para cubrirnos en salud y aplicamos lematizado y stemming
df_balanced = preprocess_text(df_balanced)
# Función para hacer stemming y lematización
df_balanced['Lematizado_Stemming'] = df_balanced['text'].apply(stem_and_lemmatize)
df_balanced.head()

Unnamed: 0,text,label,Lematizado_Stemming
0,mention didnt even know random option,0,mention didnt even know random option
1,bottom two shouldve gone mkr,0,bottom two shouldv gone mkr
2,mention mention ladyboner deserves much credit...,0,mention mention ladybon deserv much credit dud...
3,shall known sourpuss mkr katandre failedfoodies,0,shall known sourpuss mkr katandr failedfoodi
4,tarah w threw bunch women bus could get wadhwa...,0,tarah w threw bunch woman bu could get wadhwa ...


In [10]:
def calculate_metrics(matrix1, matrix2):
   flat1 = matrix1.flatten()
   flat2 = matrix2.flatten()
   js_distance = jensenshannon(flat1, flat2)
   kl_divergence = np.sum(kl_div(flat1, flat2))
   return js_distance, kl_divergence

In [10]:
#No Balanceado
NObalanced_VecTFIDF = vectorizer_TFIDF.fit_transform(df_NOTbalanced["Lematizado_Stemming"])
NObalanced_CountVec = vectorizer_CVec.fit_transform(df_NOTbalanced["Lematizado_Stemming"])
NObalanced_HashingVec = vectorizer_Hashing.fit_transform(df_NOTbalanced["Lematizado_Stemming"])

#Balanceado
balanced_VecTFIDF = vectorizer_TFIDF.fit_transform(df_balanced["Lematizado_Stemming"])
balanced_CountVec = vectorizer_CVec.fit_transform(df_balanced["Lematizado_Stemming"])
balanced_HashingVec = vectorizer_Hashing.fit_transform(df_balanced["Lematizado_Stemming"])

In [11]:
#VecTFIDF
cosine_matrix_NOTbalanced = cosine_similarity(NObalanced_VecTFIDF)
cosine_matrix_balanced = cosine_similarity(balanced_VecTFIDF)
#js_cosine, kl_cosine = calculate_metrics(cosine_matrix_NOTbalanced, cosine_matrix_balanced)
#print(f"Jensen-Shannon Distance (Cosine): {js_cosine}")
#print(f"Kullback-Leibler Divergence (Cosine): {kl_cosine}")
#print(f"Similaridad del coseno entre posts: {cosine_matrix_balanced[4][3]}")
#print(f"Similaridad del coseno entre posts: {cosine_matrix_NOTbalanced[4][3]}")

In [13]:
cosine_matrix_NOTbalanced.shape

(13631, 13631)

In [15]:
cosine_matrix_balanced.shape

(22676, 22676)

In [None]:
print("Forma cosine_matrix_NOTbalanced: ", cosine_matrix_NOTbalanced.shape)

In [13]:
# Vectorización usando TF-IDF
tfidf_balanceado = balanced_VecTFIDF
tfidf_no_balanceado = NObalanced_VecTFIDF

# Paso 3: Cálculo de matrices de similitud coseno
sim_matrix_balanceado = cosine_similarity(tfidf_balanceado)
sim_matrix_no_balanceado = cosine_similarity(tfidf_no_balanceado)

# Paso 4: Comparación de las matrices
# Para comparar, utilizaremos la métrica de diferencia promedio
def comparar_matrices(matrix1, matrix2):
    # Asegúrate de que ambas matrices sean del mismo tamaño
    if matrix1.shape != matrix2.shape:
        raise ValueError("Las matrices deben tener el mismo tamaño para ser comparadas")

    diferencia = np.abs(matrix1 - matrix2)
    return np.mean(diferencia)

diferencia_promedio = comparar_matrices(sim_matrix_balanceado, sim_matrix_no_balanceado)
print(f"Diferencia promedio entre las matrices de similitud coseno: {diferencia_promedio}")

# Visualización de las matrices de similitud
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
sns.heatmap(sim_matrix_balanceado, ax=axes[0], cmap='viridis', annot=True, fmt='.2f')
axes[0].set_title('Similitud Coseno - Corpus Balanceado')
sns.heatmap(sim_matrix_no_balanceado, ax=axes[1], cmap='viridis', annot=True, fmt='.2f')
axes[1].set_title('Similitud Coseno - Corpus No Balanceado')
plt.show()

ValueError: Las matrices deben tener el mismo tamaño para ser comparadas