In [11]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/leochc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/leochc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
data = pd.read_csv("comentarios_pelicula.csv")
len(data)

50000

In [35]:
data.info

<bound method DataFrame.info of                                                 review sentiment  label
0    one reviewers mentioned watching 1 oz episode ...  positive      1
1    wonderful little production . filming techniqu...  positive      1
2    thought wonderful way spend time hot summer we...  positive      1
3    basically 's family little boy ( jake ) thinks...  negative      0
4    petter mattei 's `` love time money '' visuall...  positive      1
..                                                 ...       ...    ...
995  nothing sacred . ask ernie fosselius . days , ...  positive      1
996  hated . hate self-aware pretentious inanity ma...  negative      0
997  usually try professional constructive criticiz...  negative      0
998  like going see film history class something li...  negative      0
999  like zoology textbook , given depiction animal...  negative      0

[1000 rows x 3 columns]>

In [16]:
#solo trabajamos con los 1000 priemeros datos
data = data[:1000]
len(data)

1000

In [17]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [18]:
#se discretiza la variable sentiment para obtener valores numericos
data['label'] = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)


In [19]:
data

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
995,Nothing is sacred. Just ask Ernie Fosselius. T...,positive,1
996,I hated it. I hate self-aware pretentious inan...,negative,0
997,I usually try to be professional and construct...,negative,0
998,If you like me is going to see this in a film ...,negative,0


In [36]:
#definimos la funcion para preporcesar los comentarios
def preprocess_text(text):
    # Eliminar caracteres especiales
    text = text.replace('<br /><br />', '')
    # Convertir a minúsculas
    text = text.lower()
    # Tokenización
    tokens = nltk.word_tokenize(text)
    # Eliminar stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Limitar a las primeras 150 palabras
    tokens = tokens[:150]
    # Unir tokens nuevamente en una cadena
    processed_text = ' '.join(tokens)
    return processed_text

In [21]:
#aplicamos el preporcesamiento a los datos
data['review'] = data['review'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['review'] = data['review'].apply(preprocess_text)


In [22]:
data

Unnamed: 0,review,sentiment,label
0,one reviewers mentioned watching 1 oz episode ...,positive,1
1,wonderful little production . filming techniqu...,positive,1
2,thought wonderful way spend time hot summer we...,positive,1
3,basically 's family little boy ( jake ) thinks...,negative,0
4,petter mattei 's `` love time money '' visuall...,positive,1
...,...,...,...
995,"nothing sacred . ask ernie fosselius . days , ...",positive,1
996,hated . hate self-aware pretentious inanity ma...,negative,0
997,usually try professional constructive criticiz...,negative,0
998,like going see film history class something li...,negative,0


In [23]:
# Crear embeddings utilizando word2vec
sentences = [nltk.word_tokenize(review) for review in data['review']]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [24]:
# Crear matriz de características
X = np.zeros((len(sentences), 100))
for i, sentence in enumerate(sentences):
    vec = np.zeros((100,))
    for word in sentence:
        if word in model.wv:
            vec += model.wv[word]
    X[i] = vec

In [25]:
# Dividir los datos en entrenamiento y prueba
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Entrenar un modelo de regresión logística
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Predecir etiquetas para los datos de prueba
y_pred = classifier.predict(X_test)

In [28]:
# Calcular la precisión del modelo
accuracy = accuracy_score(y_test, y_pred)
print("Precisión:", accuracy)

Precisión: 0.67
