In [1]:
import os
import random
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.tag import pos_tag
import re

## Data Processing

In [2]:
def loadDataset(data_dir):
    data = {}
    for partion in ["train","test"]:
        data[partion] = []
        for sentiment in ["neg","pos"]:
            label = 1 if sentiment == 'pos' else 0
            path = os.path.join(data_dir,partion,sentiment)
            files = os.listdir(path)
            for f_name in files:
                with open(os.path.join(path,f_name),'r',encoding='gbk',errors='ignore') as f:
                    review = f.read()
                    data[partion].append([review,label])
                    
    # for comparision
    random.seed(11) 
    random.shuffle(data['train'])
    random.shuffle(data['test'])

    data['train'] = pd.DataFrame(data['train'],columns=['text','sentiment'])
    data['test'] = pd.DataFrame(data['test'],columns=['text','sentiment'])
    return data['train'],data['test']

In [3]:
data_dir ='IMDB_data/aclImdb/'
train_data,test_data = loadDataset(data_dir)

print(train_data)

                                                    text  sentiment
0      Yesterday my Spanish / Catalan wife and myself...          1
1      The 60s (1999) D: Mark Piznarski. Josh Hamilto...          0
2      HUSBANDS BEWARE is a remake of the Shemp class...          1
3      No plot, crappy acting, and pointless gore.......          0
4      Recap: Doctor Markov has developed a new theor...          0
...                                                  ...        ...
24995  Grand epic as it is, Kenneth Branagh's monumen...          1
24996  Nina Foch insists that "My Name is Julia Ross"...          1
24997  Although I was in this movie playing the part ...          1
24998  It's hard to imagine a director capable of suc...          1
24999  `Castle of Blood' (aka `Castle of Terror') is ...          1

[25000 rows x 2 columns]


## Data Cleaning

In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# 下载停用词列表
import nltk

def clean_text(text):
    # 去除HTML标签
    clean_html = re.sub(r'<.*?>', '', text)
    
    # 去除非字母字符和数字，并转换为小写
    clean_text = re.sub(r'[^a-zA-Z0-9]', ' ', clean_html).lower()
    
    # 去除标点符号
    clean_text = clean_text.translate(str.maketrans('', '', string.punctuation))
    
    # 分词
    tokens = word_tokenize(clean_text)
    
    # 去除包含数字的标记
    tokens = [token for token in tokens if not any(c.isdigit() for c in token)]
    
    # 去除停用词
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # 将分词后的结果用空格连接
    cleaned_text = ' '.join(filtered_tokens)
    
    return cleaned_text

train_data['text'] = train_data['text'].apply(clean_text)

train_data

Unnamed: 0,text,sentiment
0,yesterday spanish catalan wife saw emotional l...,1
1,mark piznarski josh hamilton julia stiles jerr...,0
2,husbands beware remake shemp classic brideless...,1
3,plot crappy acting pointless gore supposed hor...,0
4,recap doctor markov developed new theory produ...,0
...,...,...
24995,grand epic kenneth branagh monumental renderin...,1
24996,nina foch insists name julia ross film noir al...,1
24997,although movie playing part sheriff hodges sti...,1
24998,hard imagine director capable godawful crap no...,1


## Tokenize

In [5]:
# 创建一个新的列，用来存储标记
train_data['tokens'] = train_data['text'].apply(lambda x: x.split())  # 假设文本已经被分割成标记

print(train_data[['text', 'tokens']].head())


                                                text  \
0  yesterday spanish catalan wife saw emotional l...   
1  mark piznarski josh hamilton julia stiles jerr...   
2  husbands beware remake shemp classic brideless...   
3  plot crappy acting pointless gore supposed hor...   
4  recap doctor markov developed new theory produ...   

                                              tokens  
0  [yesterday, spanish, catalan, wife, saw, emoti...  
1  [mark, piznarski, josh, hamilton, julia, stile...  
2  [husbands, beware, remake, shemp, classic, bri...  
3  [plot, crappy, acting, pointless, gore, suppos...  
4  [recap, doctor, markov, developed, new, theory...  


## Word Embedding

In [6]:
from gensim.models import KeyedVectors

# 加载FastText预训练的词嵌入模型
word2vec_model = KeyedVectors.load_word2vec_format('/Users/lemon/Desktop/IMDB_sentiment_analysis/word2Vec/wiki-news-300d-1M.vec', binary=False)


# 假设 train_data 中有 'tokens' 列，存储了标记化的文本
# 并且已经进行了分词操作，得到了标记列表

# 将标记列表中的每个标记转换为词嵌入向量
word_embeddings = []
for tokens in train_data['tokens']:
    embeddings = [word2vec_model[token] if token in word2vec_model else None for token in tokens]
    word_embeddings.append(embeddings)

# 创建一个新的列来存储词嵌入向量
train_data['word_embeddings'] = word_embeddings

# 打印展示部分数据，包括原始文本、标记和词嵌入向量
print(train_data[['text', 'tokens', 'word_embeddings']].head())


                                                text  \
0  yesterday spanish catalan wife saw emotional l...   
1  mark piznarski josh hamilton julia stiles jerr...   
2  husbands beware remake shemp classic brideless...   
3  plot crappy acting pointless gore supposed hor...   
4  recap doctor markov developed new theory produ...   

                                              tokens  \
0  [yesterday, spanish, catalan, wife, saw, emoti...   
1  [mark, piznarski, josh, hamilton, julia, stile...   
2  [husbands, beware, remake, shemp, classic, bri...   
3  [plot, crappy, acting, pointless, gore, suppos...   
4  [recap, doctor, markov, developed, new, theory...   

                                     word_embeddings  
0  [[0.0062, 0.0206, 0.0599, 0.0042, -0.0157, -0....  
1  [[-0.1152, -0.0489, 0.149, -0.1368, -0.0333, 0...  
2  [[0.1553, -0.2057, -0.136, 0.0157, 0.0691, 0.0...  
3  [[0.0613, -0.0496, -0.106, -0.0673, 0.0807, -0...  
4  [[-0.1088, 0.0718, 0.0141, 0.0826, 0.1647, -0...

## Simple LSTM model

In [7]:
train_data

Unnamed: 0,text,sentiment,tokens,word_embeddings
0,yesterday spanish catalan wife saw emotional l...,1,"[yesterday, spanish, catalan, wife, saw, emoti...","[[0.0062, 0.0206, 0.0599, 0.0042, -0.0157, -0...."
1,mark piznarski josh hamilton julia stiles jerr...,0,"[mark, piznarski, josh, hamilton, julia, stile...","[[-0.1152, -0.0489, 0.149, -0.1368, -0.0333, 0..."
2,husbands beware remake shemp classic brideless...,1,"[husbands, beware, remake, shemp, classic, bri...","[[0.1553, -0.2057, -0.136, 0.0157, 0.0691, 0.0..."
3,plot crappy acting pointless gore supposed hor...,0,"[plot, crappy, acting, pointless, gore, suppos...","[[0.0613, -0.0496, -0.106, -0.0673, 0.0807, -0..."
4,recap doctor markov developed new theory produ...,0,"[recap, doctor, markov, developed, new, theory...","[[-0.1088, 0.0718, 0.0141, 0.0826, 0.1647, -0...."
...,...,...,...,...
24995,grand epic kenneth branagh monumental renderin...,1,"[grand, epic, kenneth, branagh, monumental, re...","[[-0.0236, -0.0583, -0.0637, 0.0067, 0.1177, -..."
24996,nina foch insists name julia ross film noir al...,1,"[nina, foch, insists, name, julia, ross, film,...","[[-0.0362, 0.0999, 0.0323, -0.126, -0.033, 0.0..."
24997,although movie playing part sheriff hodges sti...,1,"[although, movie, playing, part, sheriff, hodg...","[[0.0916, -0.0309, -0.0174, 0.1009, -0.1478, -..."
24998,hard imagine director capable godawful crap no...,1,"[hard, imagine, director, capable, godawful, c...","[[-0.1645, -0.1101, -0.3306, 0.0497, -0.0742, ..."


In [52]:
# 准备训练数据
X = train_data['word_embeddings'].tolist()  # 将词嵌入列表转换为二维列表
y = train_data['sentiment'].values

# 数据集划分
train_size = int(0.8 * len(X))
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]

# 定义LSTM模型参数
input_dim = X_train[0][0].shape[0]  # 词嵌入维度
hidden_dim = 128  # LSTM隐藏层维度
output_dim = 1  # 输出维度

# 初始化权重和偏置
weights = {
    'input_to_hidden': np.random.randn(input_dim, hidden_dim),
    'hidden_to_output': np.random.randn(hidden_dim, output_dim),
}
bias = {
    'hidden': np.zeros(hidden_dim),
    'output': np.zeros(output_dim),
}

# 激活函数
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# 训练LSTM模型

# LSTM模型参数
learning_rate = 0.001
epochs = 10

for epoch in range(epochs):
    total_loss = 0
    
    for i in range(len(X_train)):
        seq_len = len(X_train[i])
        hidden_states = np.zeros((seq_len, hidden_dim))
        hidden_activations = np.zeros((seq_len, hidden_dim))
        
        # 填充 None 的词嵌入向量为零向量
        filled_embeddings = [embedding if embedding is not None else np.zeros(input_dim) for embedding in X_train[i]]
        
        # 前向传播
        for t in range(seq_len):
            hidden_states[t] = np.dot(filled_embeddings[t], weights['input_to_hidden']) + bias['hidden']
            hidden_activations[t] = sigmoid(hidden_states[t])
            
        output = np.dot(hidden_activations[-1], weights['hidden_to_output']) + bias['output']
        output_activation = sigmoid(output)
        
        # 计算损失
        loss = (output_activation - y_train[i]) ** 2
        total_loss += loss
        
        # 反向传播
        output_error = output_activation - y_train[i]
        output_delta = output_error * output_activation * (1 - output_activation)
        
        hidden_error = np.dot(output_delta, weights['hidden_to_output'].T)
        hidden_delta = hidden_error * hidden_activations[-1] * (1 - hidden_activations[-1])
        
        # 更新权重和偏置
        weights['hidden_to_output'] -= learning_rate * np.outer(hidden_activations[-1], output_delta)
        
        for t in reversed(range(seq_len)):
            if t > 0:
                hidden_grad = hidden_delta * hidden_activations[t] * (1 - hidden_activations[t])
                weights['input_to_hidden'] -= learning_rate * np.outer(filled_embeddings[t], hidden_grad)
                hidden_delta = np.dot(weights['input_to_hidden'],hidden_grad) * hidden_delta * (1 - hidden_activations[t-1]**2)
            else:
                weights['input_to_hidden'] -= learning_rate * np.outer(filled_embeddings[t], hidden_delta)
        
        bias['output'] -= learning_rate * output_delta
        bias['hidden'] -= learning_rate * hidden_delta
    
    avg_loss = total_loss / len(X_train)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

print("Training finished.")


# 预测与评估
val_loss = 0
correct_predictions = 0

for i in range(len(X_val)):
    seq_len = len(X_val[i])
    hidden_states_val = np.zeros((seq_len, hidden_dim))
    hidden_activations_val = np.zeros((seq_len, hidden_dim))
    
    # 填充 None 的词嵌入向量为零向量
    filled_embeddings_val = [embedding if embedding is not None else np.zeros(input_dim) for embedding in X_val[i]]
    
    for t in range(seq_len):
        hidden_states_val[t] = np.dot(filled_embeddings_val[t], weights['input_to_hidden']) + bias['hidden']
        hidden_activations_val[t] = sigmoid(hidden_states_val[t])
        
    output_val = np.dot(hidden_activations_val[-1], weights['hidden_to_output']) + bias['output']
    output_activation_val = sigmoid(output_val)

    val_loss += (output_activation_val - y_val[i]) ** 2
    prediction = 1 if output_activation_val > 0.5 else 0
    if prediction == y_val[i]:
        correct_predictions += 1

avg_val_loss = val_loss / len(X_val)
accuracy = correct_predictions / len(X_val)

print(f"Validation Loss: {avg_val_loss:.4f}")
print(f"Accuracy: {accuracy:.2%}")

  if sys.path[0] == '':


ValueError: setting an array element with a sequence.