## Análisis exploratorio

### Preprocesado común

In [2]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd # read the csv
import re # regex to detect username, url, html entity
import nltk # to use word tokenize (split the sentence into words)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # to remove the stopwords
from sklearn.model_selection import train_test_split
import kagglehub
import os
import numpy as np
import gc
import json
from sklearn.utils.class_weight import compute_class_weight
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import nltk

# Fetch the NLTK resources this notebook relies on (same order as before).
for _resource in ('wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# English stopwords, stored as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance.
lemmatizer = WordNetLemmatizer()

def remove_entity(raw_text):
    """Strip HTML entities (such as ``&amp;``) from ``raw_text``.

    An entity is matched as ``&`` followed by one or more characters that are
    neither whitespace nor ``;``, closed by a ``;``.
    """
    html_entity = re.compile(r"&[^\s;]+;")
    return html_entity.sub("", raw_text)

def change_user(raw_text):
    """Replace every @mention in ``raw_text`` with the placeholder ``user``.

    A mention is ``@`` followed by one or more non-whitespace characters.
    Stopping at any whitespace (not just a literal space) keeps a mention
    that is followed by a tab or newline from swallowing the text after it.
    """
    return re.sub(r"@\S+", "user", raw_text)

def remove_url(raw_text):
    """Remove URLs from ``raw_text``.

    After the URL pattern is applied, any remaining standalone words
    'http', 'www' or 'href' (lower-case only) are removed as well.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    leftover_pattern = r'\b(http|www|href)\b'

    without_urls = re.sub(url_pattern, '', raw_text)
    return re.sub(leftover_pattern, '', without_urls)

def remove_noise_symbols(raw_text):
    """Remove punctuation and other symbols from ``raw_text``.

    Quotes, exclamation marks, backticks and double dots are dropped first;
    then every character that is neither a word character nor whitespace is
    removed. Underscores survive because ``\\w`` includes them.
    """
    cleaned = raw_text
    for symbol in ('"', "'", "!", "`", ".."):
        cleaned = cleaned.replace(symbol, '')
    return re.sub(r'[^\w\s]', '', cleaned)

def remove_rt(raw_text):
    """Drop the standalone token "RT" (in any casing) from ``raw_text``."""
    rt_token = re.compile(r'\bRT\b', flags=re.IGNORECASE)
    return rt_token.sub('', raw_text)

def remove_user(raw_text):
    """Drop the standalone word "user" (in any casing) from ``raw_text``."""
    user_token = re.compile(r'\buser\b', flags=re.IGNORECASE)
    return user_token.sub('', raw_text)

def preprocess(text):
    """Clean a single text and return it lower-cased.

    The steps run in a fixed order: mentions are turned into 'user', HTML
    entities and URLs are removed, symbols are stripped, and finally the
    standalone tokens 'RT' and 'user' are dropped.
    """
    cleaning_steps = (
        change_user,
        remove_entity,
        remove_url,
        remove_noise_symbols,
        remove_rt,
        remove_user,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned.lower()

def tokenize_text(text):
    """Preprocess ``text`` and split it into tokens (no lemmatization).

    Returns an empty list when ``text`` is not a string.
    """
    if isinstance(text, str):
        return word_tokenize(preprocess(text))
    return []

def lemmatize_tokens(tokens, lemmatizer):
    """Return a new list with ``lemmatizer.lemmatize`` applied to each token."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def process_text_column(df, text_column, lemmatizer=None):
    """Add a 'tokens' column to ``df`` built from ``df[text_column]``.

    Each cell is cleaned and tokenized with ``tokenize_text``; when a
    ``lemmatizer`` is supplied, the tokens are lemmatized afterwards.
    ``df`` is modified in place and also returned.
    """
    df['tokens'] = df[text_column].apply(tokenize_text)
    if lemmatizer is None:
        return df

    def lemmatize_row(row_tokens):
        return lemmatize_tokens(row_tokens, lemmatizer)

    df['tokens'] = df['tokens'].apply(lemmatize_row)
    return df

def eliminar_stopwords(df, column_name):
    """Remove stopwords from every token list in ``df[column_name]``.

    The column is overwritten in place with the filtered lists and ``df`` is
    returned. Membership is checked against the module-level ``stop_words``.
    """
    def keep_meaningful(tokens):
        return [token for token in tokens if token not in stop_words]

    df[column_name] = df[column_name].apply(keep_meaningful)
    return df

def stopword_percentage(tokens):
    """Return the percentage (0-100) of ``tokens`` that are stopwords.

    ``tokens`` is iterated element by element and each element is looked up
    in the module-level ``stop_words`` set, so it should be a list of words.
    An empty input yields 0.0 instead of raising ZeroDivisionError;
    ``tokenize_text`` returns an empty list for non-string cells.
    """
    total = len(tokens)
    if total == 0:
        return 0.0
    stopword_count = sum(1 for word in tokens if word in stop_words)
    return (stopword_count / total) * 100

def process_text_column2(df, text_column):
    """Add a 'stopword_percentage' column computed from ``df[text_column]``.

    ``stopword_percentage`` is applied to each cell. That function iterates
    its argument element by element, so the column should hold token lists;
    a raw string would be scanned character by character. ``df`` is modified
    in place and returned.
    """
    percentages = df[text_column].apply(stopword_percentage)
    df['stopword_percentage'] = percentages
    return df

def graphics_stopwords(df, text_column):
    """Plot the mean share of stopwords vs. meaningful words per group.

    Despite its name, ``text_column`` is the column used to group rows; the
    'stopword_percentage' column must already exist in ``df``.
    """
    stopword_share = df.groupby(text_column)['stopword_percentage'].mean()
    meaningful_share = 100 - stopword_share

    _, ax = plt.subplots(figsize=(8, 6))
    ax.bar(stopword_share.index, stopword_share, color='#2C77A8', label='Stopwords')
    # Stack the complementary share on top so every bar reaches 100.
    ax.bar(
        meaningful_share.index,
        meaningful_share,
        bottom=stopword_share,
        color='#87CEEB',
        label='Palabras con significado',
    )

    ax.set_xlabel(text_column)
    ax.set_ylabel('Porcentaje')
    ax.set_title('Porcentaje de stopwords vs. con significado')
    ax.set_xticks(stopword_share.index)
    ax.legend()
    plt.show()



def graficar_longitud_textos(df, text_column, rating_column):
    """Plot length histograms per rating plus one global histogram.

    Adds an 'NP' column to ``df`` holding ``len()`` of each cell of
    ``text_column`` (a word count only if the cells are token lists). One
    panel is drawn per distinct value of ``rating_column``, followed by a
    panel for the whole dataset; leftover grid cells are removed.
    """
    def draw_histogram(ax, lengths, title, label_x_fraction):
        # Histogram of `lengths` with the mean annotated in red.
        mean_length = lengths.mean()
        longest = lengths.max()

        ax.hist(lengths, bins=30, edgecolor='black', alpha=0.7, color='#2C77A8')
        ax.set_xlim(0, longest + 5)  # small right margin for readability
        ax.set_xlabel('Número de palabras')
        ax.set_ylabel('Frecuencia')
        ax.set_title(title)
        ax.text(longest * label_x_fraction, ax.get_ylim()[1] * 0.8,
                f'Media: {mean_length:.1f}', fontsize=12, color='red')

    df['NP'] = df[text_column].apply(len)

    ratings = sorted(df[rating_column].unique())
    panel_count = len(ratings) + 1  # one per rating + the global panel
    num_cols = min(3, panel_count)
    num_rows = (panel_count + num_cols - 1) // num_cols

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(4 * num_cols, 4 * num_rows))
    axes = np.ravel(axes)

    for ax, rating in zip(axes, ratings):
        rating_lengths = df.loc[df[rating_column] == rating, 'NP']
        draw_histogram(ax, rating_lengths, f'Rating {rating}', 0.5)

    # Global distribution goes right after the per-rating panels.
    draw_histogram(axes[len(ratings)], df['NP'], 'Distribución Global', 0.3)

    # Drop grid cells that were not used.
    for spare_ax in axes[panel_count:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()




def wordclouds(df, column_partition, top_n):
    """Draw one word cloud per distinct value of ``column_partition``.

    The ``top_n`` most frequent words across the whole 'tokens' column are
    removed first so that the clouds highlight class-specific vocabulary.
    Adds a 'filtered_tokens' column to ``df`` as a side effect.

    Parameters:
        df: DataFrame with a 'tokens' column of token lists.
        column_partition: name of the column whose values define the classes.
        top_n: number of globally most common words to exclude.
    """
    # Corpus-wide word frequencies.
    word_freq = Counter(word for tokens in df['tokens'] for word in tokens)
    top_words = {word for word, _ in word_freq.most_common(top_n)}

    df['filtered_tokens'] = df['tokens'].apply(
        lambda tokens: [word for word in tokens if word not in top_words]
    )

    ratings = sorted(df[column_partition].unique())
    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when there is a single class (plain subplots(1, 1) returns a bare Axes).
    fig, axes = plt.subplots(1, len(ratings), figsize=(20, 5), squeeze=False)
    axes = axes.ravel()

    for ax, rating in zip(axes, ratings):
        class_tokens = df.loc[df[column_partition] == rating, 'filtered_tokens']
        text = ' '.join(' '.join(tokens) for tokens in class_tokens)

        # A class can end up with no words after filtering; leave its panel
        # empty rather than asking WordCloud to render nothing.
        if text.strip():
            wordcloud = WordCloud(width=400, height=400, background_color='white').generate(text)
            ax.imshow(wordcloud, interpolation='bilinear')

        ax.axis('off')
        ax.set_title(f'{rating}')

    plt.show()


def tfidf(df, text_column, column_partition, top_n=5):
    """Plot the highest TF-IDF terms for each value of ``column_partition``.

    All token lists belonging to a class are joined into one document, the
    documents are vectorized with ``TfidfVectorizer``, and the ``top_n``
    highest-scoring terms of each class are shown as horizontal bars.

    Parameters:
        df: DataFrame holding the data.
        text_column: column containing token lists.
        column_partition: column whose values define the classes.
        top_n: number of terms to show per class (default 5).
    """
    # One document per class.
    grouped_reviews = df.groupby(column_partition)[text_column].apply(
        lambda token_lists: " ".join(" ".join(tokens) for tokens in token_lists)
    )

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(grouped_reviews)
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        index=grouped_reviews.index,
        columns=vectorizer.get_feature_names_out(),
    )

    top_terms_per_rating = {
        rating: tfidf_df.loc[rating].nlargest(top_n) for rating in tfidf_df.index
    }

    # Size the grid from the number of classes instead of a fixed 2x3 layout,
    # which raised an error as soon as there were more than six classes.
    ncols = 3
    nrows = max(1, (len(top_terms_per_rating) + ncols - 1) // ncols)
    fig, axes = plt.subplots(nrows, ncols, figsize=(10, 3 * nrows), squeeze=False)
    axes = axes.ravel()

    for ax, (rating, terms) in zip(axes, top_terms_per_rating.items()):
        terms.plot(kind='barh', color='royalblue', ax=ax)
        ax.set_title(f'Top TF-IDF - {rating}')
        ax.set_xlabel('TF-IDF')
        ax.set_ylabel('Palabras')
        ax.invert_yaxis()  # highest score at the top

    # Remove grid cells that have no class to show.
    for spare_ax in axes[len(top_terms_per_rating):]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()


def get_ngrams(tokens, n):
    """Return the consecutive ``n``-grams of ``tokens`` as space-joined strings.

    Yields an empty list when ``n`` is not positive or exceeds the number of
    tokens.
    """
    if n <= 0:
        return []
    window_starts = range(len(tokens) - n + 1)
    return [' '.join(tokens[start:start + n]) for start in window_starts]

def _plot_top_ngrams_by_class(df, text_column, class_column, n, ngram_label, color, top_k=10, ncols=3):
    """Draw one bar chart per class with its ``top_k`` most frequent n-grams."""
    class_values = df[class_column].unique()
    nrows = max(1, (len(class_values) + ncols - 1) // ncols)

    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 5 * nrows), squeeze=False)
    axes = axes.ravel()

    for ax, class_value in zip(axes, class_values):
        subset = df[df[class_column] == class_value]

        # Count n-grams document by document so that no n-gram is built from
        # the last words of one text and the first words of the next one.
        ngram_freq = Counter(
            ngram
            for tokens in subset[text_column]
            for ngram in get_ngrams(tokens, n)
        )

        ax.set_title(f'{ngram_label} más comunes en {class_value}')
        ax.set_xlabel('Frecuencia')
        ax.set_ylabel(ngram_label)

        most_common = ngram_freq.most_common(top_k)
        if not most_common:
            # Nothing to plot for this class (e.g. every text is shorter than n).
            continue

        ngrams, counts = zip(*most_common)
        ax.barh(ngrams, counts, color=color, alpha=0.7)

    # Remove grid cells that have no class to show.
    for spare_ax in axes[len(class_values):]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()


def plot_most_common_ngrams_by_class(df, text_column, class_column):
    """Plot the 10 most common bigrams and trigrams for each class.

    Two figures are shown: one with bigrams and one with trigrams, each with
    one panel per distinct value of ``class_column``.

    Parameters:
        df: DataFrame holding the data.
        text_column: column containing token lists.
        class_column: column whose values define the classes.
    """
    _plot_top_ngrams_by_class(df, text_column, class_column, 2, 'Bigramas', '#a9dfd0')
    _plot_top_ngrams_by_class(df, text_column, class_column, 3, 'Trigramas', '#2C77A8')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# NOTE(review): pd.read_csv() is called without a file path, so this cell
# cannot run as written. The dataset location is not shown in this notebook;
# TODO: pass the CSV path here.
df = pd.read_csv()

In [11]:
# Tokenize first: stopword_percentage walks its input element by element, so
# feeding it the raw 'text' strings would compare single characters against
# the stopword set instead of words.
df = process_text_column(df, 'text')
df = process_text_column2(df, 'tokens')
graphics_stopwords(df, 'label')

NameError: name 'df' is not defined