## Preprocessing

In [1]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/karinacampos/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karinacampos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
!pip3 install emoji


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from unicodedata import normalize
import pandas as pd

In [4]:
test_dataset = pd.read_csv('test.csv')
test_dataset.columns = ['Polarity', 'Title', 'Text']
test_dataset.head()

Unnamed: 0,Polarity,Title,Text
0,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
1,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
2,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
3,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
4,1,DVD Player crapped out after one year,I also began having the incorrect disc problem...


In [5]:
train_dataset = pd.read_csv('train.csv')
train_dataset.columns = ['Polarity', 'Title', 'Text']
train_dataset.head()

Unnamed: 0,Polarity,Title,Text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [6]:
def preprocess_and_clean_dataset(df):

    def preprocess_text(text):
        text = re.sub(r'http\S+|www\S+|@\w+|#', '', text)
        text = normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        text = text.lower()
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'[^a-z\s]', '', text)

        tokens = word_tokenize(text)

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

        return " ".join(tokens)

    df['Review'] = df['Title'].fillna('') + " " + df['Text'].fillna('')

    df['Review'] = df['Review'].apply(preprocess_text)


    df = df.drop_duplicates(subset='Review')

    df = df[['Polarity', 'Review']]
    return df

In [7]:
test_dataset = preprocess_and_clean_dataset(test_dataset)

In [8]:
train_dataset = preprocess_and_clean_dataset(train_dataset)

## classificator using TFIDF vectorization and Logistic Regression

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, KFold
import pickle

In [10]:
X_train = train_dataset["Review"]
y_train = train_dataset["Polarity"]
X_test = test_dataset["Review"]
y_test = test_dataset["Polarity"]

In [11]:
class TfidfLogisticRegression:
    def __init__(self, tfidf_params=None, log_reg_params=None):
        if tfidf_params is None:
            tfidf_params = {
                "min_df": 0.01,
                "ngram_range": (1, 2),
                "max_features": 10000,
            }
        
        if log_reg_params is None:
            log_reg_params = {
                "C": 1,
                "random_state": 42,
                "solver": "saga",
                "max_iter": 1000,
                "class_weight": "balanced",
            }

        self.pipeline = Pipeline([
            ("vectorizer", TfidfVectorizer(**tfidf_params)),
            ("classifier", LogisticRegression(**log_reg_params)),
        ])

    def train(self, X_train, y_train):
        """Entrena el modelo en el conjunto de entrenamiento."""
        self.pipeline.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Evalúa el modelo en el conjunto de prueba y muestra un reporte de clasificación."""
        y_pred = self.pipeline.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        return report

    def predict(self, X):
        """Realiza predicciones en nuevos datos."""
        return self.pipeline.predict(X)

    def cross_validate(self, X, y, cv=5):
        """Realiza validación cruzada usando K-Folds."""
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)
        scores = cross_val_score(self.pipeline, X, y, cv=kf, scoring="f1_macro")
        print(f"F1 Macro-averaged scores across folds: {scores}")
        print(f"Mean F1 Macro-averaged score: {scores.mean()}")
        return scores

    def save_model(self, file_name):
        """Guarda el modelo entrenado en un archivo."""
        with open(file_name, "wb") as file:
            pickle.dump(self.pipeline, file)
        print(f"Model successfully saved in {file_name}.")


In [None]:
model = TfidfLogisticRegression()

model.train(X_train, y_train)


In [None]:
model.evaluate(X_test, y_test)

Classification Report:
              precision    recall  f1-score   support

           1       0.85      0.85      0.85    199984
           2       0.85      0.85      0.85    199996

    accuracy                           0.85    399980
   macro avg       0.85      0.85      0.85    399980
weighted avg       0.85      0.85      0.85    399980



{'1': {'precision': 0.8513296053425594,
  'recall': 0.8465277222177774,
  'f1-score': 0.8489218734329556,
  'support': 199984.0},
 '2': {'precision': 0.847397625345558,
  'recall': 0.8521770435408709,
  'f1-score': 0.849780614280016,
  'support': 199996.0},
 'accuracy': 0.8493524676233811,
 'macro avg': {'precision': 0.8493636153440587,
  'recall': 0.8493523828793241,
  'f1-score': 0.8493512438564857,
  'support': 399980.0},
 'weighted avg': {'precision': 0.8493635563614096,
  'recall': 0.8493524676233811,
  'f1-score': 0.8493512567382426,
  'support': 399980.0}}

In [22]:
model.cross_validate(X_train, y_train)

F1 Macro-averaged scores across folds: [0.84922116 0.84948903 0.84942077 0.84933732 0.84933358]
Mean F1 Macro-averaged score: 0.8493603725321665


array([0.84922116, 0.84948903, 0.84942077, 0.84933732, 0.84933358])

## Create a Neural Network model class to generate the keras model needed for this task.

In [15]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [19]:
class NN:
    def __init__(self, max_words=20000, max_len=100, embedding_dim=128):
        self.max_words = max_words  # Vocabulary size
        self.max_len = max_len  # Maximum sequence length
        self.embedding_dim = embedding_dim  # Embedding dimension
        self.tokenizer = Tokenizer(num_words=self.max_words)  # Tokenizer for text preprocessing
        self.model = None  # Placeholder for the Keras model

    def preprocess_data(self, X, y):
        # Tokenize and convert text to sequences
        self.tokenizer.fit_on_texts(X)
        sequences = self.tokenizer.texts_to_sequences(X)
        X_padded = pad_sequences(sequences, maxlen=self.max_len, padding='post')
        y_encoded = np.array(y) - 1  # Convert classes (1, 2) to (0, 1) for training
        return X_padded, y_encoded

    def build_model(self):
        # Define the model architecture
        model = Sequential([
            Embedding(self.max_words, self.embedding_dim, input_length=self.max_len),
            GlobalAveragePooling1D(),
            Dense(128, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid')  # Sigmoid for binary classification
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.model = model

    def train_with_kfold(self, X, y, k=5, epochs=5, batch_size=32):
        # Perform K-Fold Cross-Validation
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        fold_results = []

        for fold, (train_index, val_index) in enumerate(kf.split(X)):
            print(f"Training on fold {fold + 1}/{k}")
            # Extract train and validation sets
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Build and compile the model for each fold
            self.build_model()
            
            # Ensure labels have the correct shape for binary_crossentropy
            y_train = np.array(y_train).reshape(-1, 1)
            y_val = np.array(y_val).reshape(-1, 1)

            # Train the model
            self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

            # Predict on validation data
            y_val_pred = (self.model.predict(X_val) > 0.5).astype("int32")
            acc = accuracy_score(y_val, y_val_pred)
            fold_results.append(acc)
            print(f"Fold {fold + 1} Accuracy: {acc:.4f}")

        print(f"\nAverage Accuracy across {k} folds: {np.mean(fold_results):.4f}")

    def evaluate(self, X_test, y_test):
        # Evaluate the model on the test set
        y_pred = (self.model.predict(X_test) > 0.5).astype("int32")
        report = classification_report(y_test, y_pred, target_names=["Negative", "Positive"])
        print("\nClassification Report:\n", report)


In [20]:
nn_model = NN()
X_train, y_train = nn_model.preprocess_data(train_dataset['Review'], train_dataset['Polarity'])
X_test, y_test = nn_model.preprocess_data(test_dataset['Review'], test_dataset['Polarity'])


In [23]:
nn_model.train_with_kfold(X_train, y_train, k=2, epochs=5, batch_size=32)


Training on fold 1/2
Epoch 1/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m575s[0m 10ms/step - accuracy: 0.8633 - loss: 0.3147
Epoch 2/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m840s[0m 15ms/step - accuracy: 0.8909 - loss: 0.2650
Epoch 3/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m668s[0m 12ms/step - accuracy: 0.8964 - loss: 0.2547
Epoch 4/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1064s[0m 19ms/step - accuracy: 0.8994 - loss: 0.2489
Epoch 5/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m635s[0m 11ms/step - accuracy: 0.9017 - loss: 0.2438
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 403us/step
Fold 1 Accuracy: 0.8963
Training on fold 2/2




Epoch 1/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 13ms/step - accuracy: 0.8640 - loss: 0.3129
Epoch 2/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1295s[0m 23ms/step - accuracy: 0.8911 - loss: 0.2645
Epoch 3/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m452s[0m 8ms/step - accuracy: 0.8962 - loss: 0.2552
Epoch 4/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 8ms/step - accuracy: 0.8994 - loss: 0.2486
Epoch 5/5
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 8ms/step - accuracy: 0.9021 - loss: 0.2430
[1m56225/56225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 422us/step
Fold 2 Accuracy: 0.8988

Average Accuracy across 2 folds: 0.8975


In [25]:
nn_model.evaluate(X_test, y_test)


[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 431us/step

Classification Report:
               precision    recall  f1-score   support

    Negative       0.78      0.77      0.77    199984
    Positive       0.77      0.78      0.77    199996

    accuracy                           0.77    399980
   macro avg       0.77      0.77      0.77    399980
weighted avg       0.77      0.77      0.77    399980

