In [5]:
import pandas as pd
import json
import re
import os
import sys
import numpy as np
import random
# Show floats with thousands separators in rendered DataFrames.
pd.options.display.float_format = '{:,}'.format
# Seed both RNGs: DataFrame.sample() without random_state draws from NumPy's
# global generator, which random.seed() alone does not touch.
random.seed(2021)
np.random.seed(2021)

### Data Load

In [6]:
# lines=True: the file is parsed as one JSON object per line.
df_raw = pd.read_json(path_or_buf='IMDB_reviews.json', lines=True)

In [7]:
# review filtering: count the non-null entries of every column per movie_id,
# order movies by their user_id count (largest first), then keep only the
# movies whose review_date count is above 100.
per_movie_counts = df_raw.groupby('movie_id').count()
per_movie_counts = per_movie_counts.sort_values(by='user_id', ascending=False)
df_review = per_movie_counts.loc[per_movie_counts['review_date'] > 100]

In [8]:
# movie_id values (the index of df_review) of the movies that passed the filter
df_list = df_review.index.tolist()

In [9]:
# Keep only the reviews whose movie_id is in the retained list.
in_retained_movies = df_raw['movie_id'].isin(df_list)
df_raw = df_raw.loc[in_retained_movies]

In [10]:
# to_csv does not create missing folders, so make sure the output directory
# exists before the first write (no-op when it is already there).
os.makedirs('IMDB/dataset', exist_ok=True)
# Persist the filtered reviews with all columns; the row index is written too.
df_raw.to_csv('IMDB/dataset/review_raw.csv')

In [11]:
# Encode the spoiler flag as 1/0 and rename the two modelling columns.
# assign()/rename() return new frames, so nothing is written into the filtered
# selection held by df_raw (the original in-place assignment on that selection
# triggers SettingWithCopyWarning).
# astype(int) replaces replace([True, False], [1, 0]), which depends on pandas'
# deprecated implicit downcasting to end up with an integer column.
# Assumes is_spoiler holds only True/False with no missing values -- the class
# counts printed in the balancing cell add up to len(df), which supports this.
df_raw = (
    df_raw
    .assign(is_spoiler=lambda frame: frame['is_spoiler'].astype(int))
    .rename(columns={'is_spoiler': 'class', 'review_text': 'text'})
)
# Label/text pair used for everything downstream.
df = df_raw[['class', 'text']]

In [12]:
# number of reviews (rows) in df
df.shape[0]

562076

In [13]:
# Save the class/text frame without the row index.
df.to_csv(path_or_buf='IMDB/dataset/review.csv', index=False)

### Balancing

In [14]:
# Separate the reviews by label: class 0 is reported as non-spoiler and
# class 1 as spoiler.
df_non = df[df['class'] == 0]
df_spoiler = df[df['class'] == 1]
print("nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))

# Down-weight the majority class by keeping a random half of the non-spoiler
# reviews. random_state pins the draw: without it sample() uses NumPy's global
# RNG and the selected rows change from run to run.
df_non = df_non.sample(frac=.5, random_state=2021)
print("weight-decreased nonspoiler:", len(df_non))
print("balanced total:", len(df_non)+len(df_spoiler))

nonspoiler: 413853
spoiler: 148223
weight-decreased nonspoiler: 206926
balanced total: 355149


In [15]:
def _split_train_dev_test(frame, train_frac=.70, dev_frac=.333, random_state=2021):
    """Split ``frame`` into three disjoint train/dev/test frames.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to split. Its index labels are used to remove already-selected
        rows, so they are expected to be unique.
    train_frac : float
        Fraction of ``frame`` sampled for the training set.
    dev_frac : float
        Fraction of the rows left after the training draw that is sampled for
        the dev (validation) set.
    random_state : int
        Seed passed to ``DataFrame.sample`` so the split is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, dev, test)``; ``test`` is everything not drawn for train or dev.
    """
    train = frame.sample(frac=train_frac, random_state=random_state)
    remainder = frame.drop(train.index)
    dev = remainder.sample(frac=dev_frac, random_state=random_state)
    test = remainder.drop(dev.index)
    return train, dev, test


# 70/10/20 train/val/test, done separately per class:
#   - 70% of each class goes to train
#   - 33.3% of the remaining rows go to validation (about 10% of the class)
#   - everything left is the test set (about 20% of the class)
# df_non and df_spoiler are left untouched, so re-running this cell yields the
# same split instead of splitting the leftovers of a previous run.
df_train_non, df_dev_non, df_test_non = _split_train_dev_test(df_non)
df_train_spoiler, df_dev_spoiler, df_test_spoiler = _split_train_dev_test(df_spoiler)

In [16]:
# merge and shuffle
# For each split: stack the non-spoiler and spoiler parts, shuffle the rows
# (sample(frac=1) returns every row in random order) and renumber the index.
# random_state fixes the shuffle order so the written files are reproducible.

df_train = (
    pd.concat([df_train_non, df_train_spoiler])
    .sample(frac=1, random_state=2021)
    .reset_index(drop=True)
)

df_dev = (
    pd.concat([df_dev_non, df_dev_spoiler])
    .sample(frac=1, random_state=2021)
    .reset_index(drop=True)
)

df_test = (
    pd.concat([df_test_non, df_test_spoiler])
    .sample(frac=1, random_state=2021)
    .reset_index(drop=True)
)

# Report the size of each split and their total.
print("train:",len(df_train))
print("dev:",len(df_dev))
print("test:",len(df_test))
print("all:",len(df_train)+len(df_dev)+len(df_test))

train: 248604
dev: 35480
test: 71065
all: 355149


In [17]:
# Write the three full splits to IMDB/dataset, omitting the row index.
for split_frame, csv_path in (
    (df_train, 'IMDB/dataset/train.csv'),
    (df_dev, 'IMDB/dataset/dev.csv'),
    (df_test, 'IMDB/dataset/test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

### Sample

In [18]:
# Draw fixed-size random subsets of each split (35,000 / 5,000 / 10,000 rows).
# random_state makes the subsets reproducible; without it sample() uses
# NumPy's global RNG and returns different rows on every run.
df_train_sample = df_train.sample(35000, random_state=2021)
df_dev_sample = df_dev.sample(5000, random_state=2021)
df_test_sample = df_test.sample(10000, random_state=2021)

In [19]:
# to_csv does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there).
os.makedirs('IMDB/sample', exist_ok=True)
# Write the sampled subsets to IMDB/sample, omitting the row index.
df_train_sample.to_csv('IMDB/sample/train.csv', index=False)
df_dev_sample.to_csv('IMDB/sample/dev.csv', index=False)
df_test_sample.to_csv('IMDB/sample/test.csv', index=False)