# ⚜️ 《Sentiment140》
[kaggle - sentiment140](https://www.kaggle.com/datasets/kazanova/sentiment140)  
[tensorflow-dataset-sentiment140 (variation)](https://www.tensorflow.org/datasets/catalog/sentiment140)

### 问题分析

- 任务目标  
  - 输入：一条推文文本（如 "I love this movie! #happy"）
  - 输出：二分类情感标签（0=负面, 1=正面）

- 数据特点  
  - 数据量较大（160万条推文），适合练习大规模文本处理。
  - 推文包含噪音（如表情符号、话题标签、@用户名等），需清洗。

In [1]:
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

EMBEDDING_DIM = 100
MAX_LENGTH = 32
TRAINING_SPLIT = 0.9
BATCH_SIZE = 128

### 数据加载与预处理

- 目标变量0和4 优化为 0和1
- text文本数据清洗

In [2]:
import tensorflow_datasets as tfds

ds = tfds.load('sentiment140', split='train', shuffle_files=True)

# 将 tf.data.Dataset 转换为 Pandas DataFrame
def dataset_to_dataframe(dataset):
    # 将 dataset 转换为 Pandas DataFrame
    df = pd.DataFrame(list(dataset.as_numpy_iterator()))
    return df

# 转换
df = dataset_to_dataframe(ds)

# 查看前几行数据
df.head(2)

2025-05-04 23:54:10.676746: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:387] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2025-05-04 23:55:44.227927: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Unnamed: 0,date,polarity,query,text,user
0,b'Tue Jun 16 19:38:56 PDT 2009',0,b'NO_QUERY',b'ouch! i just burned my finger so bad ',b'MISS_NIKKITA'
1,b'Sun Jun 07 19:10:46 PDT 2009',4,b'NO_QUERY',"b""@MCeeYOSHi that's my movie ! """,b'imBIGtrouble'


In [3]:
df.polarity.value_counts()

polarity
0    800000
4    800000
Name: count, dtype: int64

In [4]:
# df['polarity'] = df.polarity.apply(lambda x: 0 if x == 0 else 1).to_numpy()
df["polarity"] = df["polarity"].replace(4, 1) 
df.polarity.value_counts()

polarity
0    800000
1    800000
Name: count, dtype: int64

In [5]:
import re
texts = df["text"].values

# 文本清洗函数
def clean_text(text):
    # 确保文本是字符串类型
    text = text.decode('utf-8') if isinstance(text, bytes) else text
    # 移除@用户名
    text = re.sub(r"@\w+", "", text)          
    # 移除URL
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  
    # 只保留字母和空格
    text = re.sub(r"[^a-zA-Z\s]", "", text)   
    # 转为小写
    text = text.lower().strip()                
    return text

# 应用清洗
cleaned_texts = [clean_text(text) for text in texts]

In [6]:
sentences = np.array(cleaned_texts)
labels = df['polarity'].to_numpy()
dataset = tf.data.Dataset.from_tensor_slices((sentences, labels))

for i, (sentence, label) in enumerate(dataset.take(2)):
    print(f"Sample {i+1}:")
    print(f"Sentence: {sentence.numpy().decode('utf-8')}")
    print(f"Label: {label.numpy()}")
    print("-" * 50)

Sample 1:
Sentence: ouch i just burned my finger so bad
Label: 0
--------------------------------------------------
Sample 2:
Sentence: thats my movie
Label: 1
--------------------------------------------------


2025-05-04 23:55:51.318501: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
# 设置训练集比例
# 计算训练集大小
total_size = len(sentences)
train_size = int(total_size * TRAINING_SPLIT)
val_size = total_size - train_size

# 打乱数据集（使用 buffer_size 作为参数，推荐设置为数据集大小）
dataset = dataset.shuffle(buffer_size=total_size, reshuffle_each_iteration=True)

# 将数据集拆分为训练集和验证集
train_dataset = dataset.take(train_size)
validation_dataset = dataset.skip(train_size)
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE
train_dataset = (train_dataset
                   .shuffle(10000)
                   .cache()
                   .prefetch(buffer_size=PREFETCH_BUFFER_SIZE)
                   .batch(BATCH_SIZE)
                   )
validation_dataset = (validation_dataset
                  .cache()
                  .prefetch(buffer_size=PREFETCH_BUFFER_SIZE)
                  .batch(BATCH_SIZE)
                  )

print(f"There are {len(train_dataset)} batches for a total of {BATCH_SIZE*len(train_dataset)} elements for training.\n")
print(f"There are {len(validation_dataset)} batches for a total of {BATCH_SIZE*len(validation_dataset)} elements for validation.\n")

There are 11250 batches for a total of 1440000 elements for training.

There are 1250 batches for a total of 160000 elements for validation.



### NLP


In [11]:
def fit_vectorizer(dataset):
    vectorizer = tf.keras.layers.TextVectorization( 
        # max_tokens=10000, # 生成的矢量化词库的最大词数
        output_sequence_length=MAX_LENGTH,
        standardize='lower_and_strip_punctuation'
    ) 
    full_tokens = dataset.map(lambda x, y: x)
    vectorizer.adapt(full_tokens)
    return vectorizer

In [12]:
# Adapt the vectorizer to the training sentences
vectorizer = fit_vectorizer(train_dataset)
# Check size of vocabulary
vocab_size = vectorizer.vocabulary_size()
vocab_size

2025-05-05 00:06:19.146912: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 459694 of 1600000
2025-05-05 00:06:39.146241: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 483535 of 1600000
2025-05-05 00:06:59.147111: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 509269 of 1600000
2025-05-05 00:07:19.146350: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 539087 of 1600000
2025-05-05 00:07:39.147164: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:24: Filling up shuffle buffer (this may take a while): 557258 of 1600000
2025-05-05 00:07:59.146013: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:24: Filling up sh

KeyboardInterrupt: 

In [None]:
train_dataset_vectorized = train_dataset.map(lambda x,y: (vectorizer(x), y))
validation_dataset_vectorized = validation_dataset.map(lambda x,y: (vectorizer(x), y))

In [None]:
glove_file = '../data/glove.6B.100d.txt'

# Initialize an empty embeddings index dictionary
glove_embeddings = {}

# Read file and fill glove_embeddings with its contents
with open(glove_file) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = coefs

In [None]:
word_index = {x:i for i,x in enumerate(vectorizer.get_vocabulary())}

In [None]:
embeddings_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
# Iterate all of the words in the vocabulary and if the vector representation for 
# each word exists within GloVe's representations, save it in the embeddings_matrix array
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

### 建模

In [None]:
from keras import regularizers

def create_model(vocab_size, pretrained_embeddings):
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(None,)),
        tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, weights=[pretrained_embeddings]),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
            16,  # 减少单元数
            return_sequences=True,
            dropout=0.3,
            recurrent_dropout=0.3,
            kernel_regularizer=regularizers.l2(0.01)
        )),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
            8,  # 减少单元数
            dropout=0.3,
            recurrent_dropout=0.3)
        ),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
model = create_model(vocab_size, embeddings_matrix)
history = model.fit(
	train_dataset_vectorized, 
	epochs=20, 
	validation_data=validation_dataset_vectorized
)

### 评估

In [None]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Get number of epochs
epochs = range(len(acc))

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Training and validation performance')

for i, (data, label) in enumerate(zip([(acc, val_acc), (loss, val_loss)], ["Accuracy", "Loss"])):
    ax[i].plot(epochs, data[0], 'r', label="Training " + label)
    ax[i].plot(epochs, data[1], 'b', label="Validation " + label)
    ax[i].legend()
    ax[i].set_xlabel('epochs')