In [22]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
def shuffle_df(df):
    return df.sample(frac=1).reset_index(drop=True)


def get_split(df, train_size=0.5, test_size=0.4):
    train, rest = train_test_split(df, train_size=train_size)
    test, val = train_test_split(rest, train_size=test_size/train_size)
    return train, test, val


def train_test_val(df, label_name, labels, train_size=0.5, test_size=0.4):
    df_train, df_test, df_val = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for lab in labels:
        df_lab = df[df[label_name]==lab]
        tr, te, val = get_split(df_lab, train_size, test_size)
        df_train = df_train.append(tr)
        df_test = df_test.append(te)
        df_val = df_val.append(val)
        
    df_train['split'] = 'train'
    df_test['split'] = 'test'
    df_val['split'] = 'val'
    df = pd.DataFrame()
    df = df.append(shuffle_df(df_train))
    df = df.append(shuffle_df(df_test))
    df = df.append(shuffle_df(df_val))
    print(df)
    return df

In [24]:
cwd = os.getcwd()
root = os.path.join(cwd, '..')
imdb_file_path = os.path.join(root, 'datasets/IMDB.csv')

df = pd.read_csv(imdb_file_path)
df = train_test_val(df, 'sentiment', ['positive', 'negative'])

                                                 review sentiment  split
0     This movie looked good - good cast, evergreen ...  negative  train
1     I caught this movie late night on TV, and was ...  positive  train
2     This early film from director Bob Clark ("Pork...  negative  train
3     DIRTY WAR <br /><br />Aspect ratio: 1.78:1<br ...  negative  train
4     Only the Antichrist could have been behind suc...  negative  train
...                                                 ...       ...    ...
4995  A simple comment...<br /><br />What can I say....  positive    val
4996  Every Sunday is an eleven minute short subject...  positive    val
4997  Night of the Comet starts as the world prepare...  negative    val
4998  Saying this movie is extremely hard to follow ...  negative    val
4999  The female cast of this movie is terrific: you...  negative    val

[50000 rows x 3 columns]


In [25]:
from preprocessing import (denoise_text, remove_special_characters, remove_stopwords)

def imdb_preprocess(text, size=200):
    text = text.lower()
    text = denoise_text(text)
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    tokens = text.split(' ')
    end = min(len(tokens), size)
    text = ' '.join(tokens[0:end])
    return text

df['review'] = df['review'].apply(lambda x: imdb_preprocess(x))

In [29]:
save_path = os.path.join(root, 'datasets/imdb_clean_split.csv')
df.to_csv(save_path)

In [30]:
df = pd.read_csv(save_path)
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,split
0,0,movie looked good good cast evergreen topic ex...,negative,train
1,1,caught movie late night tv expecting lowbudget...,positive,train
2,2,early film director bob clark porkys black chr...,negative,train
3,3,dirty war aspect ratio 1781sound format stereo...,negative,train
4,4,antichrist could behind disaster one hopes iro...,negative,train
