## Part 1 - Data Loading

In [None]:
import pandas as pd
# Load the IMDB reviews CSV from the working directory; later cells read its
# "review" (text) and "sentiment" (label) columns.
data = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

## Part 2 - Text Cleaning

In [None]:
import re

def simple_preprocessor(text):
    """Normalize a raw review string for bag-of-words vectorization.

    Steps, in order:
      1. Replace each HTML tag (e.g. ``<br />``) with a space, so the words on
         either side of a tag are not fused into one token.
      2. Collapse every run of non-word characters (punctuation, whitespace)
         into a single space.
      3. Lowercase the result.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned, lowercased text. Leading/trailing spaces are not stripped.
    """
    # Raw-string patterns: '\W' inside a normal string literal is an invalid
    # escape sequence and emits a SyntaxWarning on recent Python versions.
    text = re.sub(r'<.*?>', ' ', text)   # tag -> space keeps neighbouring words apart
    text = re.sub(r'\W+', ' ', text)     # also collapses the spaces introduced above
    return text.lower()

# Build a cleaned copy of the dataset: `assign` returns a new DataFrame whose
# "review" column holds the preprocessed text, leaving `data` itself untouched.
data_clean = data.assign(review=data["review"].apply(simple_preprocessor))

## Part 3 - Building and Training the Pipeline

In [None]:
# Positional hold-out split: the first 35,000 cleaned rows are used for
# training and every remaining row is kept for evaluation.
data_train = data_clean.iloc[:35000]
data_test = data_clean.iloc[35000:]

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
import nltk

# Make sure the NLTK stop-word corpus is available locally, then load the
# English stop-word list that the vectorizer below filters out.
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = stopwords.words("english")

import contractions
from sklearn.preprocessing import FunctionTransformer

def no_contractions(df):
    """Apply ``contractions.fix`` to every element of ``df``.

    Despite the parameter name, the pipeline below passes a single column of
    review strings (``data_train["review"]``), so ``df`` is used as a pandas
    Series here and ``.apply`` runs element-wise.

    Parameters
    ----------
    df : pandas.Series
        Text values to process; must expose ``.apply``.

    Returns
    -------
    pandas.Series
        The result of calling ``contractions.fix`` on each element.
    """
    return df.apply(contractions.fix)

# Text-classification pipeline, one step per line:
#   1. expand contractions in each review,
#   2. count unigrams and bigrams (English stop words removed, vocabulary
#      capped at 35,000 features),
#   3. re-weight the counts with TF-IDF using sublinear term frequency,
#   4. fit a logistic-regression classifier on the weighted features.
# NOTE(review): the reviews fed in here already went through
# `simple_preprocessor`, which replaces apostrophes with spaces, so step 1
# presumably finds few contractions left to expand — TODO confirm and consider
# expanding contractions before the text is cleaned.
pipeline_7 = make_pipeline(
    FunctionTransformer(no_contractions),
    CountVectorizer(stop_words=stop_words, ngram_range=(1, 2), max_features=35000),
    TfidfTransformer(sublinear_tf=True),
    LogisticRegression(),
)

pipeline_7.fit(data_train["review"], data_train["sentiment"])

[nltk_data] Downloading package stopwords to /home/jess/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Part 4 - Saving the Model

In [None]:
import joblib
# Persist the fitted pipeline to disk so it can be reused without retraining.
joblib.dump(value=pipeline_7, filename="pipeline_7.pkl")

['pipeline_7.pkl']

## Part 5 - Loading the Model

In [None]:
# Restore the already-fitted pipeline from disk. It is deliberately NOT
# re-fitted: calling `.fit` again would retrain from scratch and discard the
# persisted parameters, so the evaluation below would no longer check that the
# saved model itself works.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files you
# created yourself or otherwise trust.
# NOTE(review): the pipeline wraps `no_contractions`; presumably that function
# must be defined in the session before loading — confirm when loading from a
# fresh kernel.
pipeline_7_reloaded = joblib.load("pipeline_7.pkl")

In [None]:
# Predict a sentiment label for every held-out review with the reloaded pipeline.
data_test_predict_7 = pipeline_7_reloaded.predict(X=data_test["review"])

In [None]:
from sklearn.metrics import accuracy_score
# Share of held-out reviews whose predicted label equals the true label.
accuracy = accuracy_score(
    y_true=data_test["sentiment"],
    y_pred=data_test_predict_7,
)
print("Accuracy on the Test Set:", accuracy)

Accuracy on the Test Set: 0.9026
