# Import functions from splocked/embedding.py

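The objective of this notebook is to test whether the preprocessing functions in `splocked/embedding.py` work as expected on a small, balanced sample of the IMDB reviews dataset.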

In [1]:
from splocked.embedding import embed, boolean_to_binary_array, convert_sentences, word_to_id
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Location of the raw reviews file and the number of rows to keep per class
RAW_REVIEWS_PATH = './../../raw_data/IMDB_reviews.json'
SAMPLES_PER_CLASS = 500

# Load the raw dataset (JSON Lines: one record per line)
df = pd.read_json(RAW_REVIEWS_PATH, lines=True)

# Take the first SAMPLES_PER_CLASS spoiler and non-spoiler reviews (50/50 balance)
t_df = df[df['is_spoiler'] == True].iloc[:SAMPLES_PER_CLASS]
f_df = df[df['is_spoiler'] == False].iloc[:SAMPLES_PER_CLASS]

# .iloc silently returns fewer rows when a class is short, which would break
# the intended balance without any visible error
assert len(t_df) == len(f_df) == SAMPLES_PER_CLASS, 'not enough rows per class for a balanced sample'

small_df = pd.concat([t_df, f_df], ignore_index=True)

# Join the review summary (title) and the review text into a single 'review'
# column, then keep only the label and the combined text.
# .assign builds new frames instead of writing into a column-selected frame,
# which avoids pandas' SettingWithCopyWarning.
small_df = small_df.assign(
    review=small_df['review_summary'] + ' ' + small_df['review_text']
)[['is_spoiler', 'review']]

# Convert the boolean 'is_spoiler' column to binary labels
small_df = small_df.assign(is_spoiler=boolean_to_binary_array(small_df['is_spoiler']))
small_df.head()

Unnamed: 0,is_spoiler,review
0,1,A classic piece of unforgettable film-making. ...
1,1,Simply amazing. The best film of the 90's. The...
2,1,The best story ever told on film I believe tha...
3,1,"Busy dying or busy living? **Yes, there are SP..."
4,1,"Great story, wondrously told and acted At the ..."


## Split into X_train and X_test

In [3]:
# Split the reviews 70/30 and pass the labels as an np.array.
# random_state pins the shuffle so that a fresh Restart & Run All yields the
# same split, and therefore the same inputs to the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(
    small_df['review'],
    np.array(small_df['is_spoiler']),
    test_size=0.3,
    random_state=42,
)

## Preprocess using package functions

In [4]:
# Build the word_to_id dictionary from the training reviews only (X_test is not
# passed in). NOTE(review): presumably maps each word to an integer id —
# confirm against word_to_id in splocked/embedding.py
word_dict = word_to_id(X_train)

In [5]:
# Embed both splits with the same word_dict so train and test share one lookup
X_train_em, X_test_em = (
    embed(X_train, word_dict),
    embed(X_test, word_dict),
)

In [6]:
# Inspect the dimensions of the embedded training data; the row count should
# match the number of training reviews (70% of the sample).
# NOTE(review): the second axis is presumably the padded sequence length —
# confirm against embed() in splocked/embedding.py
X_train_em.shape

(700, 981)