In [None]:
import pandas as pd
import string

In [None]:
# Read the comments workbook into `df` and preview its first rows.
RAW_COMMENTS_PATH = "/content/Comentarios_consolidado_final_revisadoMECV.xlsx"
df = pd.read_excel(RAW_COMMENTS_PATH)
df.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the loaded frame.
df.info()

In [None]:
# Discard every row that has a missing value in any column, then re-check the summary.
df = df.dropna(axis=0, how="any")
df.info()

In [None]:
# Keep only the comments that are at least four characters long.
comment_lengths = df['COMENTARIOS'].str.len()
df = df[comment_lengths >= 4]
df.info()

In [None]:
# List the distinct category labels to spot spelling/whitespace variants.
df["Categorización"].unique()

In [None]:
# Normalize the two labels that carry a trailing space.
# `assign` builds a new frame instead of writing a column into the frame produced by
# the earlier boolean filter, which avoids pandas' SettingWithCopyWarning.
# The column name is passed through a dict so it is used exactly as written.
label_fixes = {'MIXTO ': 'MIXTO', 'BUENO ': 'BUENO'}
df = df.assign(**{'Categorización': df['Categorización'].replace(label_fixes)})
df["Categorización"].unique()

In [None]:
# Exclude the rows labelled "NO APLICA", then confirm which labels remain.
is_applicable = df['Categorización'] != "NO APLICA"
df = df[is_applicable]
df.info()
df["Categorización"].unique()

In [None]:
# Count rows per category label.
df["Categorización"].value_counts()

In [None]:
# Dropping rows randomly
# A fixed number of 'MIXTO' and 'MALO' rows is removed. The seed makes the random
# selection identical on every "Restart & Run All", so downstream files are reproducible.
DOWNSAMPLE_SEED = 42
value_to_drop_mixto = 'MIXTO'
number_of_rows_to_drop_mixto = 41
value_to_drop_malo = 'MALO'
number_of_rows_to_drop_malo = 35

# Selecting random indices of rows with the specific value
# (`sample` raises ValueError if a category has fewer rows than requested).
indices_to_drop_mixto = (
    df[df['Categorización'] == value_to_drop_mixto]
    .sample(number_of_rows_to_drop_mixto, random_state=DOWNSAMPLE_SEED)
    .index
)
indices_to_drop_malo = (
    df[df['Categorización'] == value_to_drop_malo]
    .sample(number_of_rows_to_drop_malo, random_state=DOWNSAMPLE_SEED)
    .index
)

# Dropping the selected rows
df = df.drop(indices_to_drop_mixto)
df = df.drop(indices_to_drop_malo)

df["Categorización"].value_counts()

## Lowercase

In [None]:
# Store a lower-cased version of every comment in a new column.
lowered_comments = df['COMENTARIOS'].str.lower()
df['COMENTARIOS_Lower'] = lowered_comments
df.head(10)

## Punctuation marks

In [None]:
# Characters treated as punctuation: the ASCII marks handled so far plus the Spanish
# opening marks '¿' and '¡'. '@' is deliberately left out, as it was before.
_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~¿¡'
# Built once at definition time: every punctuation character maps to a single space.
_PUNCTUATION_TABLE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))


def remove_punctuation(text):
  """Return `text` with each punctuation character replaced by one space.

  Replacing (rather than deleting) keeps words separated when they were joined
  only by punctuation, e.g. "bueno,malo" becomes "bueno malo".

  Args:
    text: the string to clean.

  Returns:
    A string of the same length as `text`.
  """
  return text.translate(_PUNCTUATION_TABLE)

# Run the punctuation cleanup over the lower-cased comments.
df['Comentarios_signos'] = df['COMENTARIOS_Lower'].apply(remove_punctuation)
df.head(10)

## Stopwords

In [None]:
pip install stop_words

In [None]:
from stop_words import get_stop_words

# Stop-word sets already fetched, keyed by language code.
_STOPWORD_SETS = {}


def remove_stopwords(text, language='es'):
  """Return `text` without the stop words of `language`.

  The text is split on whitespace and the surviving words are re-joined with
  single spaces. The stop-word list is fetched once per language and kept as
  a frozenset, so it is not re-fetched per call and each lookup is a hash check.

  Args:
    text: the string to filter.
    language: language code passed to `get_stop_words`; defaults to Spanish.

  Returns:
    The filtered text; an empty string if every word was a stop word.
  """
  stopwords = _STOPWORD_SETS.get(language)
  if stopwords is None:
    stopwords = frozenset(get_stop_words(language))
    _STOPWORD_SETS[language] = stopwords
  return " ".join(word for word in text.split() if word not in stopwords)

# Filter stop words out of the punctuation-free comments.
df['Comentarios_stop'] = df['Comentarios_signos'].apply(remove_stopwords)
df.head()

## Accents

In [None]:
# Lower-case accented characters and the plain letter each one is replaced with.
_ACCENT_MAP = {
  'á': 'a', 'à': 'a',
  'é': 'e', 'è': 'e', 'ë': 'e',
  'í': 'i', 'ï': 'i',
  'ó': 'o', 'ò': 'o', 'ö': 'o',
  'ú': 'u', 'ü': 'u',
  'ñ': 'n',
}
_ACCENT_TABLE = str.maketrans(_ACCENT_MAP)


def replaceAccents(word):
  """Return `word` with the accented characters of `_ACCENT_MAP` replaced.

  Only the lower-case characters listed in the map are affected; every other
  character (including upper-case accented letters) is returned unchanged.
  """
  return word.translate(_ACCENT_TABLE)

# Strip accents from the stop-word-free comments.
df['Comentarios_tildes'] = df['Comentarios_stop'].apply(replaceAccents)
df.head()

## Abbreviations

In [None]:
# Chat-style abbreviations and the words they stand for.
_ABBREVIATIONS = {'x': 'por', 'q': 'que', 'k': 'que'}


def processDetails(word):
    """Expand the stand-alone abbreviations 'x', 'q' and 'k' in a text.

    A token is expanded when it is delimited by spaces or by the start/end of
    the text. This also covers abbreviations at the very beginning or end and
    consecutive ones ("q q"), which a plain ' q ' -> ' que ' replacement misses.
    Splitting on a literal space keeps the original spacing intact.

    Args:
        word: the text to process (despite the name, usually a whole comment).

    Returns:
        The text with every stand-alone abbreviation expanded.
    """
    tokens = word.split(' ')
    return ' '.join(_ABBREVIATIONS.get(token, token) for token in tokens)

# Expand abbreviations in the accent-free comments.
df['Comentarios_abrev'] = df['Comentarios_tildes'].apply(processDetails)
df.head()

## Remove letters repeated more than 3 times

In [None]:
import re

In [None]:
def reduce_lengthening(text):
    """Shorten every run of four or more identical characters to exactly three.

    Runs of up to three characters are left as they are. The pattern's `.`
    does not match a newline, so runs of newlines are not shortened.
    """
    def _keep_three(match):
        # group(1) is the repeated character itself.
        return match.group(1) * 3

    return re.sub(r"(.)\1{3,}", _keep_three, text)

# Shorten exaggerated character runs in the abbreviation-expanded comments.
df['Comentarios_red'] = df['Comentarios_abrev'].apply(reduce_lengthening)
df.head()

## Drop numbers and collapse repeated spaces

In [None]:
# Matches one or more consecutive ASCII digits.
numbers = re.compile(r'[0-9]+')


def clean(text):
    """Remove digit runs, collapse repeated spaces, and trim both ends.

    Returns the cleaned string; it is empty when `text` held only digits
    and whitespace.
    """
    without_digits = numbers.sub('', text)
    single_spaced = re.sub(' +', ' ', without_digits)
    return single_spaced.strip()

# Produce the final text column with `clean` (digits removed, spaces collapsed).
# This step previously re-applied `processDetails`, so `clean` was never used.
df['Comentarios_limpios'] = df['Comentarios_red'].apply(clean)
df.head()

In [None]:
# Keep only the columns that are exported with the splits.
kept_columns = ["COMENTARIOS", "Comentarios_signos", "Categorización", "Comentarios_limpios"]
df = df[kept_columns]
df.head()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Stratified split of `df` by the 'Categorización' column: every category is shuffled
# and cut on its own, so train/validation/test keep the same class proportions.
# The seed makes the shuffle, and therefore the exported files, reproducible.
SPLIT_SEED = 42

# Step 1: Identify the categories
unique_categories = df['Categorización'].unique()

# Step 2: Split Data by Category
category_data = {}
for category in unique_categories:
    category_data[category] = df[df['Categorización'] == category]

# Step 3: Shuffle the Data
for category in unique_categories:
    category_data[category] = (
        category_data[category]
        .sample(frac=1, random_state=SPLIT_SEED)
        .reset_index(drop=True)
    )

# Step 4: Determine Split Ratios
train_ratio = 0.6
validation_ratio = 0.2
test_ratio = 0.2  # informational: the test set is whatever remains after the two cuts

# Step 5: Split Data Equally
train_data = []
validation_data = []
test_data = []
for category in unique_categories:
    shuffled = category_data[category]
    n = len(shuffled)
    train_end = int(train_ratio * n)
    validation_end = int((train_ratio + validation_ratio) * n)
    # Positional slices at the same cut points np.split used; np.split on a DataFrame
    # goes through DataFrame.swapaxes, which pandas has deprecated.
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validation_end]
    test = shuffled.iloc[validation_end:]
    train_data.append(train)
    validation_data.append(validate)
    test_data.append(test)

# Merge the split data from different categories
train_data = pd.concat(train_data)
validation_data = pd.concat(validation_data)
test_data = pd.concat(test_data)


In [None]:
# Print the column summary of each split under its own heading.
split_frames = (
    ("Train", train_data),
    ("Validation", validation_data),
    ("Test", test_data),
)
for split_name, split_frame in split_frames:
    print(f"{split_name} data info:")
    split_frame.info()

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Write the training split to an Excel file, omitting the index column.
train_data.to_excel("Train_set.xlsx", index=False)

In [None]:
# Preview the first rows of the validation split.
validation_data.head()

In [None]:
# Write the validation split to an Excel file, omitting the index column.
validation_data.to_excel("Validation_set.xlsx", index=False)

In [None]:
# Preview the first rows of the test split.
test_data.head()

In [None]:
# Write the test split to an Excel file, omitting the index column.
test_data.to_excel("Test_set.xlsx", index=False)