In [1]:
import pandas as pd

In [2]:
cols_names = ['Tweet_ID','Company','Sentiment','Tweet']
df = pd.read_csv("twitter_training.csv", names=cols_names)
print(df.head())

   Tweet_ID      Company Sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                               Tweet  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [3]:
print(df.describe())
print(df.info())
print(df.shape)

           Tweet_ID
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet_ID   74682 non-null  int64 
 1   Company    74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   Tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None
(74682, 4)


In [4]:
# Conteo de sentimientos
print(df["Sentiment"].value_counts())

Sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [5]:
df.drop('Tweet_ID', axis=1, inplace=True)
df.head()

Unnamed: 0,Company,Sentiment,Tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
df.dropna(inplace=True)
df.isna().sum()

Unnamed: 0,0
Company,0
Sentiment,0
Tweet,0


In [7]:
# Check for duplicates
duplicates = df.duplicated()
print(f"Number of duplicates for {df} dataset is {duplicates.sum()} rows")

Number of duplicates for            Company Sentiment  \
0      Borderlands  Positive   
1      Borderlands  Positive   
2      Borderlands  Positive   
3      Borderlands  Positive   
4      Borderlands  Positive   
...            ...       ...   
74677       Nvidia  Positive   
74678       Nvidia  Positive   
74679       Nvidia  Positive   
74680       Nvidia  Positive   
74681       Nvidia  Positive   

                                                   Tweet  
0      im getting on borderlands and i will murder yo...  
1      I am coming to the borders and I will kill you...  
2      im getting on borderlands and i will kill you ...  
3      im coming on borderlands and i will murder you...  
4      im getting on borderlands 2 and i will murder ...  
...                                                  ...  
74677  Just realized that the Windows partition of my...  
74678  Just realized that my Mac window partition is ...  
74679  Just realized the windows partition of my Mac ...  


In [8]:
review = []
sentences = list(df["Tweet"])
for sen in sentences:
    review.append(sen)

In [9]:
import numpy as np

labels = df["Sentiment"]
labels = np.array(list(map(lambda x: 1 if x=="Positive" else 0, labels)))
labels = np.array(list(map(lambda x: 2 if x=="Neutral" else 0, labels)))
labels = np.array(list(map(lambda x: 3 if x=="Negative" else 0, labels)))

In [21]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(review, labels, test_size=0.2)

In [22]:
# Parametros
vocab_size = 5000
embedding_dim = 32
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Inicio de la clase del tokenizador
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

In [24]:
# Generación de la palabra índice del diccionario
# para el entrenamiento de fantasmas

tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Generación y relleno de la secuencia de entrenamiento
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences,maxlen = max_length, truncating = trunc_type)

# Generación y relleno de la secuencia de prueba
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences,maxlen = max_length, truncating = trunc_type)

In [32]:
# Construcción del modelo
import keras

model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    keras.layers.LSTM(64, return_sequences=True),
    keras.layers.Dropout(0.25),
    keras.layers.LSTM(32),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1, activation='softmax')
])




In [33]:
# Configuración de los parámetros de entrenamiento
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

In [34]:
print(f"Size of padded: {len(padded)}")
print(f"Size of train_labels: {len(train_labels)}")

Size of padded: 59196
Size of train_labels: 59196


In [35]:
print(padded[0])
print(train_labels[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 1475  644  503 2329   41  515 3107  924   56  185
    5   91  174  176 3176  594   12    1]
0


In [None]:
# Entrenamiento del modelo
import matplotlib.pyplot as plt

# Check if the lengths match and adjust if necessary
'''if len(padded) != len(train_labels):
    min_length = min(len(padded), len(train_labels))
    padded = padded[:min_length]
    train_labels = train_labels[:min_length]
    print("Warning: Lengths of 'padded' and 'train_labels' were mismatched. Truncated to the shorter length.")
'''
history = model.fit(padded,
                    train_labels,
                    epochs=10,
                    validation_data=(test_padded, test_labels))

# Gráfico del historial de entrenamiento
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1)
plt.show()

Epoch 1/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 120ms/step - accuracy: 0.0000e+00 - loss: 0.0348 - val_accuracy: 0.0000e+00 - val_loss: 1.2432e-08
Epoch 2/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 121ms/step - accuracy: 0.0000e+00 - loss: 2.0431e-04 - val_accuracy: 0.0000e+00 - val_loss: 6.1191e-11
Epoch 3/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 121ms/step - accuracy: 0.0000e+00 - loss: 1.5623e-04 - val_accuracy: 0.0000e+00 - val_loss: 4.1620e-13
Epoch 4/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 123ms/step - accuracy: 0.0000e+00 - loss: 2.0668e-05 - val_accuracy: 0.0000e+00 - val_loss: 1.2725e-14
Epoch 5/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 119ms/step - accuracy: 0.0000e+00 - loss: 2.9362e-05 - val_accuracy: 0.0000e+00 - val_loss: 2.2185e-16
Epoch 6/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 119

In [None]:
predictions = model.predict(test_padded)

# Calculate errors (assuming binary classification)
errors = test_labels - predictions.flatten()

# Plot histogram of errors
import matplotlib.pyplot as plt
plt.hist(errors, bins=20)
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.title("Histogram of Prediction Errors")
plt.show()