# IMDB电影评论数据集

In [1]:
import re
from collections import Counter

### 数据集介绍  
| id       | sentiment | review              |
| -------- | --------- | ------------------- |
| "5814_8" | 1         | "With all ..."      |
| ...      | ...       | ...                 |
| "6244_8" | 1         | "Every country ..." |  

1.这是一个文本格式文件，tsv，表示tab分隔的文本文件，类似csv（逗号分隔符）   
2.文件第一行是数据的标签，id表示评论人的id号，sentiment表示这个人的情感（只有0和1两种取值）。  
3.review表示这个人对电影的评论，是一个字符串  
4.整个文件一共25001行，除了第一行，下面的数据每一行都是一个样本   
5.一条样本的三个数据id，sentiment，review是以tab为分隔的，前面已经说过  

### 第一步：读取数据集  
处理后的数据格式：列表  
[(句子1, 情感1), (句子2, 情感2), ..., (句子25000, 情感25000)]  

In [2]:
def IMDB_loader(path):
    """Read a tab-separated IMDB review file into ``(review, label)`` pairs.

    The first line of the file is treated as a header and discarded. Every
    following non-blank line is split on its first two tab characters into
    ``id``, ``sentiment`` and ``review``, so tabs that occur inside the
    review text are preserved.

    Args:
        path: Path of the UTF-8 encoded ``.tsv`` file.

    Returns:
        A list of ``(review, sentiment)`` tuples in file order. Both elements
        are strings, and the review does not include the line terminator.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    data = []
    with open(path, "r", encoding="utf-8") as f:
        f.readline()  # the first line only holds the column names
        for line in f:
            line = line.rstrip("\n")
            # A blank line (typically at the end of the file) is not a
            # sample; skipping it avoids an IndexError below.
            if not line.strip():
                continue
            # maxsplit=2 keeps the review intact even if it contains tabs.
            fields = line.split("\t", maxsplit=2)
            data.append((fields[2], fields[1]))
    return data

In [3]:
## test
# Load the dataset and display the sample at index 12 (i.e. the 13th sample).
IMDB=IMDB_loader("dataset/imdb.tsv")
IMDB[12]

('"\\"Mr. Harvey Lights a Candle\\" is anchored by a brilliant performance by Timothy Spall.<br /><br />While we can predict that his titular morose, up tight teacher will have some sort of break down or catharsis based on some deep down secret from his past, how his emotions are unveiled is surprising. Spall\'s range of feelings conveyed is quite moving and more than he usually gets to portray as part of the Mike Leigh repertory.<br /><br />While an expected boring school bus trip has only been used for comic purposes, such as on \\"The Simpsons,\\" this central situation of a visit to Salisbury Cathedral in Rhidian Brook\'s script is well-contained and structured for dramatic purposes, and is almost formally divided into acts.<br /><br />We\'re introduced to the urban British range of racially and religiously diverse kids (with their uniforms I couldn\'t tell if this is a \\"private\\" or \\"public\\" school), as they gather \x96 the rapping black kids, the serious South Asians and M

### 第二步：预处理  
1.去字符（用python字符串的replace一个一个换）  
2.切词(按空格来切)  
由于上面的步骤过于繁琐，这里使用正则表达式一步到位

In [4]:
def IMDB_preprocessing(data):
    """Lower-case every review and split it into alphabetic tokens.

    A token is a maximal run of ASCII letters in the lower-cased review, so
    every other character (digits, punctuation, whitespace, markup) acts as
    a separator. For example ``"It's<br />"`` becomes
    ``["it", "s", "br"]``.

    Args:
        data: Iterable of samples whose element 0 is the review string and
            whose element 1 is the label (anything ``int()`` accepts).

    Returns:
        A list of ``(tokens, label)`` tuples, where ``tokens`` is a list of
        strings and ``label`` is an ``int``.

    Raises:
        ValueError: If a label cannot be converted with ``int()``.
    """
    # The pattern is loop-invariant, so it is compiled once up front rather
    # than once per sample.
    pattern = re.compile(r"[a-zA-Z]+")
    return [
        (pattern.findall(sample[0].lower()), int(sample[1]))
        for sample in data
    ]

In [5]:
## test
# Replace the raw samples with their tokenized form, then display the sample at index 12.
IMDB=IMDB_preprocessing(IMDB)
IMDB[12]

(['mr',
  'harvey',
  'lights',
  'a',
  'candle',
  'is',
  'anchored',
  'by',
  'a',
  'brilliant',
  'performance',
  'by',
  'timothy',
  'spall',
  'br',
  'br',
  'while',
  'we',
  'can',
  'predict',
  'that',
  'his',
  'titular',
  'morose',
  'up',
  'tight',
  'teacher',
  'will',
  'have',
  'some',
  'sort',
  'of',
  'break',
  'down',
  'or',
  'catharsis',
  'based',
  'on',
  'some',
  'deep',
  'down',
  'secret',
  'from',
  'his',
  'past',
  'how',
  'his',
  'emotions',
  'are',
  'unveiled',
  'is',
  'surprising',
  'spall',
  's',
  'range',
  'of',
  'feelings',
  'conveyed',
  'is',
  'quite',
  'moving',
  'and',
  'more',
  'than',
  'he',
  'usually',
  'gets',
  'to',
  'portray',
  'as',
  'part',
  'of',
  'the',
  'mike',
  'leigh',
  'repertory',
  'br',
  'br',
  'while',
  'an',
  'expected',
  'boring',
  'school',
  'bus',
  'trip',
  'has',
  'only',
  'been',
  'used',
  'for',
  'comic',
  'purposes',
  'such',
  'as',
  'on',
  'the',
  'sim

### 第三步：制作词典  
在经过切词后，需要对语料进行统计，为每个词构造ID。  
一般来说，可以根据每个词在语料中出现的频次构造ID，频次越高，ID越小，便于对词典进行管理


In [6]:
def IMDB_word2id_dict(data):
    """Build a word -> id vocabulary ordered by descending corpus frequency.

    Ids 0 and 1 are reserved for the special tokens ``"PAD"`` and ``"UNK"``.
    All other words receive consecutive ids starting at 2, the most frequent
    word first; words with equal frequency keep the order in which they were
    first seen.

    Args:
        data: Iterable of samples whose element 0 is a sequence of words.

    Returns:
        A dict mapping each word (plus ``"PAD"`` and ``"UNK"``) to a unique
        integer id.
    """
    word2id_dict = {"PAD": 0, "UNK": 1}

    word_counts = Counter(word for sample in data for word in sample[0])
    # most_common() sorts by count, highest first, and is stable for ties.
    for word, _count in word_counts.most_common():
        # A corpus token that is literally "PAD" or "UNK" must not overwrite
        # the reserved entry: doing so would move the special id and make the
        # next word reuse the same id.
        if word in word2id_dict:
            continue
        word2id_dict[word] = len(word2id_dict)
    return word2id_dict

In [7]:
# test
# Build the vocabulary from the tokenized samples and print every (word, id) pair.
IMDB_dict = IMDB_word2id_dict(IMDB)
for item in IMDB_dict.items():
    print(item)

('PAD', 0)
('UNK', 1)
('the', 2)
('and', 3)
('a', 4)
('of', 5)
('to', 6)
('is', 7)
('br', 8)
('it', 9)
('in', 10)
('i', 11)
('this', 12)
('that', 13)
('s', 14)
('was', 15)
('as', 16)
('for', 17)
('with', 18)
('movie', 19)
('but', 20)
('film', 21)
('t', 22)
('you', 23)
('on', 24)
('not', 25)
('he', 26)
('are', 27)
('his', 28)
('have', 29)
('be', 30)
('one', 31)
('all', 32)
('at', 33)
('they', 34)
('by', 35)
('an', 36)
('who', 37)
('so', 38)
('from', 39)
('like', 40)
('there', 41)
('her', 42)
('or', 43)
('just', 44)
('about', 45)
('out', 46)
('if', 47)
('has', 48)
('what', 49)
('some', 50)
('good', 51)
('can', 52)
('more', 53)
('she', 54)
('when', 55)
('very', 56)
('up', 57)
('time', 58)
('no', 59)
('even', 60)
('my', 61)
('would', 62)
('which', 63)
('story', 64)
('only', 65)
('really', 66)
('see', 67)
('their', 68)
('had', 69)
('we', 70)
('were', 71)
('me', 72)
('well', 73)
('than', 74)
('much', 75)
('get', 76)
('bad', 77)
('been', 78)
('people', 79)
('will', 80)
('do', 81)
('other', 82

### 第四步：构造数据集  
把数据集里面的句子，["单词","单词","单词"]变成[数字，数字，数字] 

In [8]:
def IMDBDateset(data, word2id_dict):
    """Translate tokenized samples into id sequences plus a label list.

    Args:
        data: Iterable of samples; element 0 is the word sequence and
            element 1 is the label.
        word2id_dict: Mapping from word to id. It must contain the key
            ``"UNK"``, whose id stands in for any word missing from the
            mapping.

    Returns:
        A ``(seqs, labels)`` tuple: ``seqs`` is a list of id lists and
        ``labels`` is the list of labels, both in the order of ``data``.
    """

    def to_id(token):
        # Out-of-vocabulary tokens share the id stored under "UNK".
        return word2id_dict.get(token, word2id_dict["UNK"])

    id_seqs, targets = [], []
    for example in data:
        tokens, target = example[0], example[1]
        id_seqs.append([to_id(token) for token in tokens])
        targets.append(target)
    return id_seqs, targets

In [9]:
## test
# Convert the tokenized samples into id sequences and display the sequence at index 12.
dataset_x,dataset_y=IMDBDateset(IMDB,IMDB_dict)
dataset_x[12]

[438,
 4019,
 2672,
 4,
 7374,
 7,
 19885,
 35,
 4,
 520,
 237,
 35,
 3710,
 12773,
 8,
 8,
 137,
 70,
 52,
 5620,
 13,
 28,
 7690,
 14050,
 57,
 2658,
 1693,
 80,
 29,
 50,
 429,
 5,
 970,
 181,
 43,
 14562,
 445,
 24,
 50,
 910,
 181,
 974,
 39,
 28,
 493,
 88,
 28,
 1418,
 27,
 22390,
 7,
 1744,
 12773,
 14,
 2173,
 5,
 1400,
 6368,
 7,
 180,
 715,
 3,
 53,
 74,
 26,
 622,
 213,
 6,
 1962,
 16,
 173,
 5,
 2,
 1847,
 5247,
 28785,
 8,
 8,
 137,
 36,
 856,
 354,
 383,
 2603,
 1150,
 48,
 65,
 78,
 340,
 17,
 682,
 4867,
 141,
 16,
 24,
 2,
 6978,
 12,
 1342,
 886,
 5,
 4,
 1966,
 6,
 32301,
 17148,
 10,
 37418,
 15106,
 14,
 227,
 7,
 73,
 3847,
 3,
 8174,
 17,
 888,
 4867,
 3,
 7,
 219,
 32302,
 7375,
 84,
 1404,
 8,
 8,
 70,
 151,
 1697,
 6,
 2,
 2569,
 688,
 2173,
 5,
 15695,
 3,
 16377,
 6700,
 347,
 18,
 68,
 7072,
 11,
 422,
 22,
 370,
 47,
 12,
 7,
 4,
 1924,
 43,
 1030,
 383,
 16,
 34,
 5151,
 2,
 14563,
 324,
 347,
 2,
 611,
 1197,
 11735,
 3,
 6979,
 2,
 425,
 8655,
 3,
 379

### 第五步：序列做等长处理 
规定一个max_seq_len，比它长的全部截断，比它短的全部补pad

In [10]:
def IMDB_cutandpad(data, max_seq_len=256, pad_id=0):
    """Force every sequence to exactly ``max_seq_len`` items.

    Sequences longer than ``max_seq_len`` are truncated on the right;
    shorter ones are right-padded with ``pad_id``. The input sequences are
    not modified.

    Args:
        data: A ``(seqs, labels)`` pair (anything indexable with 0 and 1).
        max_seq_len: Target length of every returned sequence; must be
            non-negative.
        pad_id: Value appended to short sequences. Defaults to 0.

    Returns:
        A ``(seqs, labels)`` tuple where ``seqs`` is a new list of lists,
        each of length ``max_seq_len``, and ``labels`` is the labels object
        from ``data``, returned unchanged.

    Raises:
        ValueError: If ``max_seq_len`` is negative. A negative length would
            otherwise slice from the right and yield sequences of differing
            lengths.
    """
    if max_seq_len < 0:
        raise ValueError(f"max_seq_len must be non-negative, got {max_seq_len}")

    seqs, labels = data[0], data[1]
    fixed = []
    for seq in seqs:
        # Slicing copies, so the caller's sequence is never mutated.
        clipped = list(seq[:max_seq_len])
        clipped.extend([pad_id] * (max_seq_len - len(clipped)))
        fixed.append(clipped)
    return fixed, labels

In [11]:
## test
# Truncate/pad with the default max_seq_len and display the first five sequences.
new_seqs,new_labels=IMDB_cutandpad((dataset_x,dataset_y))
new_seqs[:5]

[[18,
  32,
  12,
  530,
  170,
  181,
  33,
  2,
  547,
  18,
  8965,
  11,
  139,
  636,
  2601,
  6,
  28,
  224,
  149,
  2,
  1010,
  645,
  128,
  3,
  41,
  294,
  2,
  18843,
  3,
  294,
  11399,
  174,
  279,
  11,
  44,
  182,
  6,
  76,
  4,
  795,
  2602,
  84,
  12,
  226,
  37,
  11,
  196,
  15,
  66,
  628,
  10,
  2,
  4221,
  44,
  6,
  279,
  96,
  57,
  61,
  328,
  712,
  26,
  7,
  2477,
  43,
  1332,
  11399,
  7,
  173,
  4973,
  173,
  769,
  21,
  63,
  11,
  372,
  170,
  6,
  67,
  33,
  2,
  424,
  55,
  9,
  15,
  1804,
  616,
  50,
  5,
  9,
  48,
  1280,
  3410,
  45,
  8965,
  14,
  538,
  930,
  2,
  3487,
  3,
  83,
  2,
  568,
  731,
  5,
  1641,
  27,
  77,
  142,
  4570,
  8,
  8,
  1994,
  1135,
  20,
  5,
  260,
  12,
  7,
  32,
  45,
  467,
  1581,
  38,
  879,
  23,
  2566,
  40,
  8965,
  10,
  545,
  94,
  23,
  27,
  170,
  6,
  771,
  12,
  3,
  169,
  9,
  354,
  50,
  200,
  670,
  8965,
  36,
  26221,
  17,
  26222,
  6,
  2,
  229,
  5,

### 第六步：将上述打包成一个类

In [12]:
class IMDB_DataLoader:
    """End-to-end loader turning an IMDB ``.tsv`` file into padded id sequences.

    The pipeline is: read the file, tokenize each review, build a
    frequency-ordered vocabulary, map words to ids, then truncate/pad every
    sequence to a fixed length. Each stage is also available as a static
    method.
    """

    def __init__(self):
        # (seqs, labels) produced by the last call to loader(); None before.
        self.data = None
        # word -> id vocabulary built by the last call to loader(); None before.
        self.dict = None

    def loader(self, path, max_seq_len=256):
        """Run the whole pipeline on ``path`` and cache the results.

        Args:
            path: Path of the UTF-8 encoded ``.tsv`` file.
            max_seq_len: Length every id sequence is truncated/padded to.

        Returns:
            A ``(seqs, labels)`` tuple, also stored in ``self.data``. The
            vocabulary is stored in ``self.dict``.
        """
        samples = self.IMDB_loader(path)
        samples = self.IMDB_preprocessing(samples)
        vocab = self.IMDB_word2id_dict(samples)
        encoded = self.IMDBDateset(samples, vocab)
        self.data = self.IMDB_cutandpad(encoded, max_seq_len)
        self.dict = vocab
        return self.data

    @staticmethod
    def IMDB_loader(path):
        """Read a tab-separated review file into ``(review, label)`` string pairs.

        The header line is discarded and blank lines are skipped. Each
        remaining line is split on its first two tabs into id, sentiment and
        review; the review is returned without its line terminator.

        Raises:
            IndexError: If a non-blank line has fewer than three fields.
        """
        data = []
        with open(path, "r", encoding="utf-8") as f:
            f.readline()  # the first line only holds the column names
            for line in f:
                line = line.rstrip("\n")
                # A blank (e.g. trailing) line is not a sample.
                if not line.strip():
                    continue
                # maxsplit=2 keeps tabs inside the review text intact.
                fields = line.split("\t", maxsplit=2)
                data.append((fields[2], fields[1]))
        return data

    @staticmethod
    def IMDB_preprocessing(data):
        """Lower-case each review and split it into runs of ASCII letters.

        Returns:
            A list of ``(tokens, label)`` tuples with ``label`` converted to
            ``int``.
        """
        # Compiled once; the pattern does not depend on the sample.
        pattern = re.compile(r"[a-zA-Z]+")
        return [
            (pattern.findall(sample[0].lower()), int(sample[1]))
            for sample in data
        ]

    @staticmethod
    def IMDB_word2id_dict(data):
        """Build a word -> id dict ordered by descending corpus frequency.

        Ids 0 and 1 are reserved for ``"PAD"`` and ``"UNK"``; other words get
        consecutive ids from 2, most frequent first, ties in first-seen order.
        """
        word2id_dict = {"PAD": 0, "UNK": 1}
        word_counts = Counter(word for sample in data for word in sample[0])
        for word, _count in word_counts.most_common():
            # Never let a corpus token overwrite a reserved entry; that would
            # move the special id and produce duplicate ids.
            if word in word2id_dict:
                continue
            word2id_dict[word] = len(word2id_dict)
        return word2id_dict

    @staticmethod
    def IMDBDateset(data, word2id_dict):
        """Map each sample's words to ids, using the ``"UNK"`` id for unknown words.

        Returns:
            A ``(seqs, labels)`` tuple of parallel lists.
        """
        seqs = []
        labels = []
        for sample in data:
            words, label = sample[0], sample[1]
            seqs.append(
                [word2id_dict.get(word, word2id_dict["UNK"]) for word in words]
            )
            labels.append(label)
        return seqs, labels

    @staticmethod
    def IMDB_cutandpad(data, max_seq_len=256, pad_id=0):
        """Truncate or right-pad every sequence to exactly ``max_seq_len`` items.

        Args:
            data: A ``(seqs, labels)`` pair.
            max_seq_len: Target length; must be non-negative.
            pad_id: Value appended to short sequences. Defaults to 0.

        Returns:
            ``(seqs, labels)`` with new fixed-length lists and the labels
            object unchanged.

        Raises:
            ValueError: If ``max_seq_len`` is negative.
        """
        if max_seq_len < 0:
            raise ValueError(
                f"max_seq_len must be non-negative, got {max_seq_len}"
            )
        seqs, labels = data[0], data[1]
        fixed = []
        for seq in seqs:
            # Slicing copies, so the input sequences are left untouched.
            clipped = list(seq[:max_seq_len])
            clipped.extend([pad_id] * (max_seq_len - len(clipped)))
            fixed.append(clipped)
        return fixed, labels

        
    

In [13]:
## test
# Run the whole pipeline through the class. data[0] holds the padded sequences,
# so this displays the length of the sequence at index 12.
loader = IMDB_DataLoader()
data = loader.loader("dataset/imdb.tsv")
len(data[0][12])

256