In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
file_path = 'TRAINING_DATA.txt'

# Initialize lists to store labels and sentences
labels = []
sentences = []

# Read the text file line by line
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split each line into label and sentence using tab as the delimiter
        label, sentence = line.strip().split('\t')
        labels.append(int(label))
        sentences.append(sentence)

# Create a DataFrame
data = pd.DataFrame({'label': labels, 'text': sentences})

# Display the first few rows of the DataFrame
print(data.head())


   label                                               text
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...


In [3]:
data.head()

Unnamed: 0,label,text
0,1,"Cuando conocí a Janice en 2013 , una familia n..."
1,0,Hwang habló en Sur de este año por Southwest M...
2,1,Usted podría pensar Katy Perry y Robert Pattin...
3,1,Cualquiera que haya volado los cielos del crea...
4,1,"Bueno , este cantante tendrá un LARGO tiempo p..."


TEXT PREPROCESSING

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw')

[nltk_data] Downloading package stopwords to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

GRADIENT BOOST WITH TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

# stopwords
nltk.download('stopwords')

# preprocessing function
def preprocess_text(text):
    stop_words = set(stopwords.words('spanish'))
    stemmer = SnowballStemmer('spanish')
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the dataset
data['text'] = data['text'].apply(preprocess_text)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# pipeline with TF-IDF and Gradient Boosting classifier
pipeline_gb = Pipeline([
    ('vect', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('clf', GradientBoostingClassifier(n_estimators=200, learning_rate=0.01))
])

# Train the pipeline
pipeline_gb.fit(X_train, y_train)

# Accuracy score on test set
accuracy_gb_test = pipeline_gb.score(X_test, y_test)
print("Accuracy with TF-IDF on Test Set (Gradient Boosting):", accuracy_gb_test)



[nltk_data] Downloading package stopwords to C:\Users\Francesco
[nltk_data]     Corda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy with TF-IDF on Test Set (Gradient Boosting): 0.5209731543624161
