In [1]:
import pandas as pd
import json
import re
import os
import sys
import numpy as np
from nltk.tokenize import sent_tokenize
import random
pd.options.display.float_format = '{:,}'.format
random.seed(42)

### Data Load

In [2]:
df_raw = pd.read_json('IMDB_reviews.json', lines=True)
df_raw.shape

(573913, 7)

### Data Preprocessing

In [3]:
# review filtering
df_review = df_raw.groupby('movie_id').count().sort_values(by='user_id', ascending = False)
df_review = df_review[df_review['review_date'] > 100]

df_list = df_review.index.values.tolist()

df_raw = df_raw[df_raw['movie_id'].isin(df_list)]
df_raw.shape

(562076, 7)

In [4]:
# column rename
df_raw['is_spoiler']= df_raw['is_spoiler'].replace([True,False],[1,0])
df_raw.rename(columns={'is_spoiler':'class', 'review_text':'text'}, inplace = True)
df = df_raw[['class','text']]

In [5]:
# null replace
df['text'].replace('', np.nan, inplace=True)
df.dropna(subset=['text'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [6]:
df['text'] = df['text'].apply(str)

df['text_count'] = df['text'].apply(sent_tokenize).tolist()
df['text_count'] = df['text_count'].apply(len)

df = df[df['text_count'] < 10]
df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


(272977, 3)

In [7]:
df

Unnamed: 0,class,text,text_count
1,1,The Shawshank Redemption is without a doubt on...,9
6,1,I have been a fan of this movie for a long tim...,7
7,1,I made my account on IMDb Just to Rate this mo...,8
8,1,"A friend of mine listed ""The Shawshank Redempt...",6
11,1,"To tell the truth, I am speechless. I am a you...",9
...,...,...,...
573907,0,This type of movie is one that I would not ord...,5
573908,0,"Go is wise, fast and pure entertainment. Assem...",4
573909,0,"Well, what shall I say. this one´s fun at any ...",7
573911,0,Call this 1999 teenage version of Pulp Fiction...,7


### Balancing

In [8]:
df_non = df[df['class'] == 0]
df_spoiler = df[df['class'] == 1]
print("nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))

nonspoiler: 221697
spoiler: 51280


In [9]:
df_non = df_non.sample(frac =.5)
print("weight-decreased nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))
print("balanced total:", len(df_non)+len(df_spoiler))

weight-decreased nonspoiler: 110848
spoiler: 51280
balanced total: 162128


### Holdout

In [10]:
# 70/10/20 train/val/test
df_train_non = df_non.sample(frac = .70)
df_non = df_non.drop(df_train_non.index)
df_train_spoiler = df_spoiler.sample(frac = .70)
df_spoiler = df_spoiler.drop(df_train_spoiler.index)

# use 33% of the remaining for validation (which is 10% of whole dataset)
df_dev_non = df_non.sample(frac = .333)
df_non = df_non.drop(df_dev_non.index)
df_dev_spoiler = df_spoiler.sample(frac = .333)
df_spoiler = df_spoiler.drop(df_dev_spoiler.index)

# use all of the remaining for test (which is 20% of whole dataset)
df_test_non = df_non
df_test_spoiler = df_spoiler

In [11]:
# merge and shuffle

df_train = pd.concat([df_train_non, df_train_spoiler])
df_train = df_train.sample(frac=1).reset_index(drop=True)

df_dev = pd.concat([df_dev_non, df_dev_spoiler])
df_dev = df_dev.sample(frac=1).reset_index(drop=True)

df_test = pd.concat([df_test_non, df_test_spoiler])
df_test = df_test.sample(frac=1).reset_index(drop=True)

print("train:",len(df_train))
print("dev:",len(df_dev))
print("test:",len(df_test))
print("all:",len(df_train)+len(df_dev)+len(df_test))

train: 113490
dev: 16197
test: 32441
all: 162128


In [12]:
df_train.to_csv('IMDB/final/train.csv', index=False)
df_dev.to_csv('IMDB/final/dev.csv', index=False)
df_test.to_csv('IMDB/final/test.csv', index=False)

### Sample

In [13]:
df_train_sample = df_train.sample(35000)
df_dev_sample = df_dev.sample(5000)
df_test_sample = df_test.sample(10000)

In [14]:
df_train_sample.to_csv('IMDB/final_sample/train.csv', index=False)
df_dev_sample.to_csv('IMDB/final_sample/dev.csv', index=False)
df_test_sample.to_csv('IMDB/final_sample/test.csv', index=False)