# NLP Classification Challenge Lab

## **1. Import Libraries and Initial Setup**
# Import the required libraries.
# Examples: pandas, numpy, sklearn, matplotlib, etc.

In [None]:
##pip install pandas matplotlib scikit-learn numpy seaborn nltk spacy textblob joblib tqdm keras##

In [22]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## **2. Load and Explore Data**
# Load the training and real data from provided text files.
# Examples: Use pandas to load the datasets.

In [11]:
# Read the file line by line
# Use regex to split the label (first digit) and the text
# Append the cleaned data into a list
data_pre = []
with open('TRAINING_DATA.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Match a single digit at the beginning of the line followed by spaces and text
        match = re.match(r'^(\d)\s+(.*)$', line)
        if match:
            label, text = match.groups()
            # Append the label and text as a tuple
            data_pre.append((int(label), text.strip()))

# Convert the list into a DataFrame
data = pd.DataFrame(data_pre, columns=['label', 'text'])

# Display the DataFrame and its structure
display(data.head(), data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17877 entries, 0 to 17876
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   17877 non-null  int64 
 1   text    17877 non-null  object
dtypes: int64(1), object(1)
memory usage: 279.5+ KB


Unnamed: 0,label,text
0,1,"Cuando conocí a Janice en 2013 , una familia n..."
1,0,Hwang habló en Sur de este año por Southwest M...
2,1,Usted podría pensar Katy Perry y Robert Pattin...
3,1,Cualquiera que haya volado los cielos del crea...
4,1,"Bueno , este cantante tendrá un LARGO tiempo p..."


None

## **3. Data Pre-Preprocessing**
# Steps for data split into train and test:

In [12]:
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (14301,) (14301,)
Test set: (3576,) (3576,)


## **4. Data Preprocessing**
# Steps for cleaning and preparing the text data:
# 4.1. Lowercasing


In [13]:
def Lowercase_cleaning(text):
    text = text.lower()
    
    return text

# Apply the cleaning to the training and test sets
X_train_cleaned = X_train.apply(Lowercase_cleaning)
X_test_cleaned = X_test.apply(Lowercase_cleaning)

# Display the first 5 rows of the cleaned sets
print("Cleaned Training Data:")
print(X_train_cleaned.head())

print("\nCleaned Test Data:")
print(X_test_cleaned.head())

Cleaned Training Data:
764      a pesar de que el molusco contagioso es solo u...
10994    ella quería que leyera un cuento en voz alta p...
16064    a continuación , empezar de nuevo y hacer todo...
17611    esta corriente de distracciones tamaño-bite es...
5999     aplicaciones son realmente necesarios para con...
Name: text, dtype: object

Cleaned Test Data:
10393    por qué no probar su vagina como éste de 19 añ...
16172    tuve que poner por escrito el libro , levantar...
7298     " [ esto ] hubiera sido mejor para el público ...
17586    en muchos sentidos , son el bar de la industri...
563      no ha dejado cada vez mayor , ni tiene su tole...
Name: text, dtype: object


# 4.2. Removing special characters, punctuation, numbers, and stopwords


    - Accents, special charcaters, extra spaces, and numbers. (Note: Maintaining the "ñ" character)

In [14]:
import re

def remove_accents_but_keep_ñ(text):
    # Replace accented vowels with their unaccented counterparts
    replacements = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', 'ü': 'u',
        'Á': 'a', 'É': 'e', 'Í': 'i', 'Ó': 'o', 'Ú': 'u', 'Ü': 'u'
    }
    
    # Create a regex pattern for accented characters
    pattern = re.compile('|'.join(re.escape(char) for char in replacements.keys()))
    
    # Replace accented characters using the dictionary
    text = pattern.sub(lambda match: replacements[match.group(0)], text)
    
    return text

def normalize_text(text):
    # Step 1: Remove accents but keep "ñ"
    text = remove_accents_but_keep_ñ(text)
    
    # Step 2: Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Step 3: Remove special characters but keep letters, spaces, and "ñ"
    text = re.sub(r'[^a-zñ ]', '', text)
    
    # Step 4: Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply normalization and cleaning to training and test datasets
X_train_cleaned = X_train_cleaned.apply(normalize_text)
X_test_cleaned = X_test_cleaned.apply(normalize_text)

# Display the first rows of the cleaned data
print("Cleaned Training Data:")
print(X_train_cleaned.head())
print("\nCleaned Test Data:")
print(X_test_cleaned.head())

Cleaned Training Data:
764      a pesar de que el molusco contagioso es solo u...
10994    ella queria que leyera un cuento en voz alta p...
16064    a continuacion empezar de nuevo y hacer todo d...
17611    esta corriente de distracciones tamañobite es ...
5999     aplicaciones son realmente necesarios para con...
Name: text, dtype: object

Cleaned Test Data:
10393    por que no probar su vagina como este de años ...
16172    tuve que poner por escrito el libro levantarse...
7298     esto hubiera sido mejor para el publico de man...
17586    en muchos sentidos son el bar de la industria ...
563      no ha dejado cada vez mayor ni tiene su tolera...
Name: text, dtype: object


    - Spanish Stopwords

In [15]:
###Stopwords Removal Process###
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("spanish")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('spanish')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros']


In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\larry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\larry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Define a function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('spanish'))  # Set of English stopwords
    words = word_tokenize(text)  # Tokenize the text into words
    filtered_words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(filtered_words)  # Reconstruct the text without stopwords

# Apply the function to the training and test sets
X_train_cleaned = X_train_cleaned.apply(remove_stopwords)
X_test_cleaned = X_test_cleaned.apply(remove_stopwords)

# Display the first 5 rows of the cleaned training set
print("Training Data after Stopwords Removal:")
print(X_train_cleaned.head())

print("\nTest Data after Stopwords Removal:")
print(X_test_cleaned.head())

Training Data after Stopwords Removal:
764      pesar molusco contagioso solo infeccion viral ...
10994    queria leyera cuento voz alta pudieran oir dos...
16064               continuacion empezar nuevo hacer nuevo
17611    corriente distracciones tamañobite justo tipo ...
5999     aplicaciones realmente necesarios construir ut...
Name: text, dtype: object

Test Data after Stopwords Removal:
10393              probar vagina años chica tennessee hizo
16172    poner escrito libro levantarse caminar alreded...
7298     sido mejor publico manera luego pueden procesa...
17586    bar industria noticias celebridades bien estan...
563      dejado cada vez mayor tolerancia perdurable se...
Name: text, dtype: object


# 4.3. Vectorization (e.g., CountVectorizer, TfidfVectorizer)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform the training set
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_cleaned)

# Transform the test set using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test_cleaned)

# Print the shapes of the vectorized datasets
print("Shape of training data (TF-IDF):", X_train_tfidf.shape)
print("Shape of test data (TF-IDF):", X_test_tfidf.shape)

Shape of training data (TF-IDF): (14301, 22783)
Shape of test data (TF-IDF): (3576, 22783)


## **5. Model Selection and Training**
# Train the following models:
# - SVC Model and Evaluation


In [19]:
from sklearn.svm import SVC
svc = SVC() # svc = SVC(kernel = 'linear', random_state = 42) = 0.75, 0.40
svc.fit(X_train_tfidf, y_train)

In [20]:
#Predicitons with training and test data
y_svc_train_pred = svc.predict(X_train_tfidf)
y_svc_test_pred = svc.predict(X_test_tfidf)

In [23]:
# Accuracy Score
svc_train_accuracy = accuracy_score(y_train, y_svc_train_pred)
svc_test_accuracy = accuracy_score(y_test, y_svc_test_pred)

print("Train accuracy_score:", svc_train_accuracy)
print("Test accuracy_score:", svc_test_accuracy)

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_svc_test_pred))

Train accuracy_score: 0.8788196629606321
Test accuracy_score: 0.2829977628635347

Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.27      0.27      1751
           1       0.30      0.29      0.29      1825

    accuracy                           0.28      3576
   macro avg       0.28      0.28      0.28      3576
weighted avg       0.28      0.28      0.28      3576



# - Multinomial Naive Bayes and Evaluation

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
MNB = MultinomialNB()

# Train the classifier on the training data
MNB.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_MNB_pred = MNB.predict(X_test_tfidf)

# Print the predictions (optional)
print("Predictions on the test set:")
print(y_MNB_pred)

Predictions on the test set:
[0 0 0 ... 1 0 0]


In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_MNB_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_MNB_pred))

Accuracy: 0.33

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.42      0.38      1751
           1       0.31      0.25      0.28      1825

    accuracy                           0.33      3576
   macro avg       0.33      0.33      0.33      3576
weighted avg       0.33      0.33      0.33      3576



# - Vectorizer & Multinomial Naive Bayes Pipeline plus Evaluation

In [28]:
print(type(X_train_cleaned))
print(X_train_cleaned.head())
print(type(y_train))
print(y_train.head())

<class 'pandas.core.series.Series'>
764      pesar molusco contagioso solo infeccion viral ...
10994    queria leyera cuento voz alta pudieran oir dos...
16064               continuacion empezar nuevo hacer nuevo
17611    corriente distracciones tamañobite justo tipo ...
5999     aplicaciones realmente necesarios construir ut...
Name: text, dtype: object
<class 'pandas.core.series.Series'>
764      1
10994    1
16064    0
17611    1
5999     0
Name: label, dtype: int64


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Define function to train and evaluate the classifier
def train_and_evaluate(vectorizer):
    # Create pipeline
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', MultinomialNB())
    ])
    
    # Train the classifier
    pipeline.fit(X_train_cleaned, y_train)
    
    # Predict on test set
    y_pred = pipeline.predict(X_test_cleaned)
    
    # Evaluate the model
    combo_accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {combo_accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return combo_accuracy

# Try different vectorizers
print("Using TfidfVectorizer:")
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, ngram_range=(1, 2))
train_and_evaluate(tfidf_vectorizer)

print("\nUsing CountVectorizer:")
count_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
train_and_evaluate(count_vectorizer)

Using TfidfVectorizer:
Accuracy: 0.33

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.40      0.37      1751
           1       0.31      0.26      0.29      1825

    accuracy                           0.33      3576
   macro avg       0.33      0.33      0.33      3576
weighted avg       0.33      0.33      0.33      3576


Using CountVectorizer:
Accuracy: 0.22

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.25      0.24      1751
           1       0.21      0.20      0.20      1825

    accuracy                           0.22      3576
   macro avg       0.22      0.22      0.22      3576
weighted avg       0.22      0.22      0.22      3576



0.22231543624161074

# - Basic XGBoost Classifier and Evaluation

In [None]:
# Install XGBoost if not already installed
#!pip install xgboost

In [30]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Train the model
xgb_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Model Accuracy: {accuracy_xgb:.2f}")

# Print classification report
print(classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



XGBoost Model Accuracy: 0.45
              precision    recall  f1-score   support

           0       0.42      0.31      0.36      1751
           1       0.47      0.58      0.52      1825

    accuracy                           0.45      3576
   macro avg       0.45      0.45      0.44      3576
weighted avg       0.45      0.45      0.44      3576



# - XGBoost Classifier using class weights and Evaluation

In [31]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Initialize XGBoost with class weights
xgb_cw_model = XGBClassifier(scale_pos_weight=class_weights_dict[1], use_label_encoder=False, eval_metric='mlogloss')
xgb_cw_model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred_xgb_cw = xgb_cw_model.predict(X_test_tfidf)
accuracy_xgb_cw = accuracy_score(y_test, y_pred_xgb_cw)
print(f"Balanced XGBoost Accuracy: {accuracy_xgb_cw:.2f}")

Parameters: { "use_label_encoder" } are not used.



Balanced XGBoost Accuracy: 0.45


# - XGBoost Classifier (Parameters Adjusted) and Evaluation

In [32]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Manual Adjustment of Parammeters
xgb_param_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

# Train Model
xgb_param_model.fit(X_train_tfidf, y_train)

# Predict
y_pred_xgb_param = xgb_param_model.predict(X_test_tfidf)

# Evaluate
accuracy_xgb_param = accuracy_score(y_test, y_pred_xgb_param)
print(f"XGBoost Manual Adjustment Accuracy: {accuracy_xgb_param:.2f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Manual Adjustment Accuracy: 0.45


In [33]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Manual Adjustment of Parammeters
xgb_param2_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=25,
    subsample=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

# Train Model
xgb_param2_model.fit(X_train_tfidf, y_train)

# Predict
y_pred_xgb_param2 = xgb_param2_model.predict(X_test_tfidf)

# Evaluate
accuracy_xgb_param2 = accuracy_score(y_test, y_pred_xgb_param2)
print(f"XGBoost Manual Adjustment Accuracy: {accuracy_xgb_param2:.2f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Manual Adjustment Accuracy: 0.40


# - BERT Pretrained Model (including Tokenizer process) and Evaluation

In [None]:
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [34]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # True if GPU is available, otherwise False

2.5.1+cpu
False


In [35]:
import torch
import transformers
print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.5.1+cpu
Transformers version: 4.47.1


In [36]:
from transformers import BertTokenizer, BertForSequenceClassification

# Define the model name
MODEL_NAME = "bert-base-multilingual-cased"  # You can change this to another model if necessary

# Load the tokenizer and the pre-trained model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)  # Adjust num_labels for your task

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset for tokenized inputs and labels"""
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create datasets
train_dataset = CustomDataset(X_train_cleaned, y_train, tokenizer)
test_dataset = CustomDataset(X_test_cleaned, y_test, tokenizer)

In [38]:
from transformers import TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    num_train_epochs=3,              # Number of epochs
    per_device_train_batch_size=16,  # Training batch size
    per_device_eval_batch_size=64,   # Evaluation batch size
    warmup_steps=500,                # Warmup steps for learning rate
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir="./logs",            # Directory for logging
    eval_strategy="epoch",     # Evaluate at the end of each epoch
    save_strategy="epoch",           # Save model checkpoints at each epoch
    logging_steps=10,                # Log every 10 steps
    load_best_model_at_end=True      # Load the best model at the end
)

In [40]:
print(X_train_cleaned.index.equals(y_train.index))
print(X_test_cleaned.index.equals(y_test.index))

True
True


In [42]:
from torch.utils.data import TensorDataset

def tokenize_data(texts, labels):
    # Convert labels to a list if needed
    labels = labels.tolist() if isinstance(labels, pd.Series) else labels

    # Tokenize the text data
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Create a TensorDataset
    dataset2 = TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))
    return dataset2

In [43]:
train_dataset2 = tokenize_data(X_train_cleaned, y_train)
test_dataset2 = tokenize_data(X_test_cleaned, y_test)

In [44]:
print(type(train_dataset2))  # Debe ser un tipo de dataset compatible con Trainer
print(len(train_dataset2))  # Tamaño del dataset
print(train_dataset2[0])  # Una muestra del dataset (input_ids, attention_mask, label)

<class 'torch.utils.data.dataset.TensorDataset'>
14301
(tensor([  101, 18066, 39520, 10251, 10812, 23020, 64867, 10133, 11395, 10106,
        14601, 77730, 82356, 21610, 88734, 11135, 10112, 20151, 13771, 11135,
        14652, 10115, 10131, 10107, 10485, 16354, 11783, 36064, 13236, 23005,
        10305, 34026,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
        

In [45]:
from transformers import Trainer

# Define the Trainer
trainer = Trainer(
    model=model,                      # Pre-trained BERT model
    args=training_args,               # Training arguments
    train_dataset=train_dataset2,      # Training dataset
    eval_dataset=test_dataset2         # Test dataset for evaluation
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

  0%|          | 0/2682 [10:36<?, ?it/s]
  0%|          | 0/2682 [00:00<?, ?it/s]

TypeError: vars() argument must have __dict__ attribute

## **6. Real Data Reading and Processing**

In [46]:
# Read the file line by line
# Use regex to split the label (first digit) and the text
# Append the cleaned data into a list
real_data_pre = []
with open('REAL_DATA.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Match a single digit at the beginning of the line followed by spaces and text
        match = re.match(r'^(\d)\s+(.*)$', line)
        if match:
            label, text = match.groups()
            # Append the label and text as a tuple
            real_data_pre.append((int(label), text.strip()))

# Convert the list into a DataFrame
real_data = pd.DataFrame(real_data_pre, columns=['label', 'text'])

# Display the DataFrame and its structure
display(real_data.head(), real_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2201 entries, 0 to 2200
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2201 non-null   int64 
 1   text    2201 non-null   object
dtypes: int64(1), object(1)
memory usage: 34.5+ KB


Unnamed: 0,label,text
0,2,Yo no creo que a nadie le haya encantado un pe...
1,2,No va a resolver sus problemas de crédito o me...
2,2,Te encantará este !
3,2,Yo estaba a volar a un aeropuerto varias horas...
4,2,"( Maid En Manhattan , The Wedding Planner , Je..."


None

In [47]:
# Transform the real data
real_X_tfidf_vectorizer = tfidf_vectorizer.transform(real_data['text'])

## **7. Predictions on Real Data**
# Write code to load new data (unlabeled) and make predictions using the best model.
# Save the results into a TXT file.