In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
import nltk

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jesus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jesus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load datasets
data_train = pd.read_csv("C:\\Users\\jesus\\project-3-nlp\\TRAINING_DATA.txt", 
                         encoding='latin-1', header=None, names=['text', 'label'], on_bad_lines='skip')
data_val = pd.read_csv("C:\\Users\\jesus\\project-3-nlp\\REAL_DATA.txt", 
                       encoding='latin-1', header=None, names=['text', 'label'], on_bad_lines='skip')

# Reduce the training set for faster testing
data_train = data_train.head(1000)

# Combine datasets
data = pd.concat([data_val, data_train], ignore_index=True)

# Preview combined dataset
print("\nCombined Dataset Preview:")
display(data.head())



Combined Dataset Preview:


Unnamed: 0,text,label
0,2\tYo no creo que a nadie le haya encantado un...,
1,2\tNo va a resolver sus problemas de crÃ©dito ...,
2,2\tTe encantarÃ¡ este !,
3,2\tMi padre llegÃ³ con la primera ola de fuerz...,6 de junio 1944 .
4,2\tY podemos todos estar de acuerdo que los en...,


In [5]:
# Function to clean HTML content
def clean_html(text):
    text = re.sub(r'<script.*?>.*?</script>', '', text, flags=re.DOTALL)
    text = re.sub(r'<style.*?>.*?</style>', '', text, flags=re.DOTALL)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    return BeautifulSoup(text, "html.parser").get_text()

# Function to preprocess text
def preprocess_text(text):
    text = clean_html(text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'^\s*[a-zA-Z]\s+', '', text)  # Remove single characters at the start
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    words = word_tokenize(text)
    stop_words = set(stopwords.words("spanish"))
    words = [word for word in words if word not in stop_words]
    stemmer = SnowballStemmer('spanish')
    words = [stemmer.stem(word) for word in words]
    return " ".join(words)



In [7]:
# Handle missing values in the 'text' column
data['text'] = data['text'].fillna('').astype(str)

# Apply preprocessing
data['preprocessed_text'] = data['text'].apply(preprocess_text)

# Preview preprocessed data
print("\nPreprocessed Dataset Preview:")
display(data.head())



Preprocessed Dataset Preview:


Unnamed: 0,text,label,preprocessed_text
0,2\tYo no creo que a nadie le haya encantado un...,,cre nadi encant pen flcid
1,2\tNo va a resolver sus problemas de crÃ©dito ...,,va resolv problem crdit mejor relacin padr
2,2\tTe encantarÃ¡ este !,,encant
3,2\tMi padre llegÃ³ con la primera ola de fuerz...,6 de junio 1944 .,padr lleg primer ola fuerz ali da
4,2\tY podemos todos estar de acuerdo que los en...,,pod acuerd envas mient


In [9]:
# Split the dataset
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

# Drop rows with missing labels
data_train = data_train.dropna(subset=['label'])
data_val = data_val.dropna(subset=['label'])

# Ensure labels are strings
data_train['label'] = data_train['label'].astype(str)
data_val['label'] = data_val['label'].astype(str)

# Check the shape of the datasets
print(f"Training data shape: {data_train.shape}")
print(f"Validation data shape: {data_val.shape}")


Training data shape: (605, 3)
Validation data shape: (137, 3)


In [11]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit-transform the training set and transform the validation set
X_train_tfidf = vectorizer.fit_transform(data_train['preprocessed_text'])
X_val_tfidf = vectorizer.transform(data_val['preprocessed_text'])

# Check the shape of the feature matrices
print(f"Training features shape: {X_train_tfidf.shape}")
print(f"Validation features shape: {X_val_tfidf.shape}")


Training features shape: (605, 1663)
Validation features shape: (137, 1663)


In [13]:
# Initialize and train the Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, data_train['label'])

# Predict on the validation set
y_pred = clf.predict(X_val_tfidf)

# Calculate accuracy
accuracy = accuracy_score(data_val['label'], y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 0.00
