# Text Processing

 1. Clean Data
 2. Transform Data 
 3. Create small labeled training datasets for experiments

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Set up some config parameters

# Folder (relative to the working directory) that every CSV in this notebook is read from or written to.
DATA_PATH=Path('Data/')
# Create the folder if it is missing; the train.csv / test.csv files read below must already be inside it.
DATA_PATH.mkdir(exist_ok=True)


In [3]:
# Read the headerless training CSV, then give columns 0 and 1 readable names.
train = (
    pd.read_csv(DATA_PATH/'train.csv', header = None)
      .rename(columns = {0: 'rating', 1: 'review'})
)

In [4]:
# Read the headerless test CSV, then give columns 0 and 1 readable names.
test = (
    pd.read_csv(DATA_PATH/'test.csv', header = None)
      .rename(columns = {0: 'rating', 1: 'review'})
)

In [5]:
# Row counts of both splits, plus how many training rows have rating == 5.
print('Number of ratings in train set:', len(train))
print('Number of ratings in test set:', len(test))
print('Number of 5.0 ratings in train set:', len(train[train.rating == 5]))

Number of ratings in train set: 650000
Number of ratings in test set: 50000
Number of 5.0 ratings in train set: 130000


### Clean text

In [6]:
import re
import html
#import unicodedata
import unidecode



def clean_html(text):
    """Replace every ``<...>`` markup span in ``text`` with a single space.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. The match is non-greedy and stays on one line, because ``.``
    does not match a newline here.
    """
    as_string = str(text)
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', as_string)

def remove_repeats(string, n, join=True):
    """Cap every run of equal consecutive items in ``string`` at ``n`` items.

    ``string`` may be any iterable (characters of a str, or a list of tokens).
    Returns the kept items joined into one str when ``join`` is true,
    otherwise the list of kept items.
    """
    kept = []
    previous = ''
    run_length = 0
    for item in string:
        # run_length counts how many times `item` has already been seen in the current run
        run_length = run_length + 1 if item == previous else 0
        previous = item
        if run_length < n:
            kept.append(item)
    return "".join(kept) if join else kept


def preprocess_text(text, front_pad='\n ', end_pad=' ', clean_markup=True,
                clean_unicode=True, encode='utf-8', limit_repeats=3):
    """
    Processes utf-8 encoded text according to the criterion specified in section 4 of
    https://arxiv.org/pdf/1704.01444.pdf (Radford et al).
    Follows the Kant et al. code with some fixes.

    Parameters
    ----------
    text : the raw text to clean.
    front_pad, end_pad : strings placed before / after the cleaned text.
    clean_markup : when true, pass the text through ``clean_html`` first.
    clean_unicode : when true, transliterate the text with ``unidecode``.
    encode : codec name; when not None the padded result is encoded and each
        resulting byte is turned back into one character.
    limit_repeats : when > 0, the maximum number of identical consecutive
        whitespace-separated tokens that are kept.

    Returns
    -------
    str : the cleaned, padded text.
    """
    if clean_markup:
        text = clean_html(text)

    if clean_unicode:
        text = unidecode.unidecode(text)

    text = html.unescape(text)
    # Turn literal backslash-n / backslash-quote sequences into the real characters,
    # undo the ' @.@ ' / ' @-@ ' tokenisation markers, space out remaining
    # backslashes, and finally flatten real newlines into spaces.
    text = text.replace('\\n',"\n").replace('\\"', '"').replace(' @.@ ','.').replace(
                        ' @-@ ','-').replace('\\', ' \\ ').replace('\n',' ').strip()
    # Split on any whitespace; joining with single spaces below also collapses repeated whitespace.
    text = text.split()

    if limit_repeats > 0:
        # Fix: the filtered token list used to be discarded, which made
        # limit_repeats a no-op. Keep the result.
        text = remove_repeats(text, limit_repeats, join=False)

    text = front_pad+(" ".join(text))+end_pad

    if encode is not None:
        # One output character per encoded byte, so a non-ASCII character
        # becomes several characters in the returned string.
        text = text.encode(encoding=encode)
        text = ''.join(chr(c) for c in text)

    return text

In [7]:
# Clean every raw review with preprocess_text (default options) and store it in a new column.
train['cleaned_text'] = train['review'].apply(preprocess_text)
test['cleaned_text'] = test['review'].apply(preprocess_text)

In [8]:
# Shuffle the reviews: draw all rows in random order (fixed seed) and renumber the index
train2, test2 = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (train, test)
)

In [9]:
# Save preprocessed text to disk

# Keep only the rating and the cleaned text
train_clean = train2.loc[:, ['rating','cleaned_text']]
test_clean = test2.loc[:, ['rating','cleaned_text']]

# Written without a header row and without the index column
train_clean.to_csv(DATA_PATH/'train_cleaned.csv', index=False, header=False)
test_clean.to_csv(DATA_PATH/'test_cleaned.csv', index=False, header=False)

### Transform labels

In [11]:
# Load the cleaned CSVs (they have no header row) and name columns 0 and 1
train_clean = (
    pd.read_csv(DATA_PATH/'train_cleaned.csv', header = None)
      .rename(columns = {0: 'label', 1: 'text'})
)
test_clean = (
    pd.read_csv(DATA_PATH/'test_cleaned.csv', header = None)
      .rename(columns = {0: 'label', 1: 'text'})
)

In [12]:
# Create one-hot encoding columns for rating (named rating_<value>)
label_col = pd.get_dummies(train_clean['label'], prefix = 'rating')

# Replace the single 'label' column with the one-hot columns; train_clean itself is left untouched
train_clean2 = train_clean.drop('label', axis = 1).join(label_col)

In [13]:
# Repeat one-hot encoding for test data
label_col_test = pd.get_dummies(test_clean['label'], prefix = 'rating')

# Replace the single 'label' column with the one-hot columns; test_clean itself is left untouched
test_clean2 = test_clean.drop('label', axis = 1).join(label_col_test)

In [14]:
# Save data to disk
# Unlike the *_cleaned.csv files, these keep the header row (only the index is omitted),
# so the rating_* column names survive the round trip.
train_clean2.to_csv(DATA_PATH/'train_onehot.csv', index = False)
test_clean2.to_csv(DATA_PATH/'test_onehot.csv', index = False)

### Create subsets of data for experiments


In [15]:
# Load data
# Re-read the one-hot training frame written to train_onehot.csv in the previous section.
train_clean2 = pd.read_csv(DATA_PATH/'train_onehot.csv')

In [20]:
def print_group(df):
    """Print one line per rating 1-5 with the total of the ``rating_<k>`` attribute of ``df``."""
    for rating in range(1, 6):
        column = getattr(df, 'rating_%d' % rating)
        print('#rating %d:' % rating, sum(column))
    

In [16]:
#50 examples
# frac=1/13000 of the 650000 training rows (count printed earlier) works out to 50 rows;
# the fixed random_state makes the draw repeatable.
train_50 = train_clean2.sample(frac=1/13000, random_state= 123 ).reset_index(drop=True)
# Show the (rows, columns) shape as the cell output.
train_50.shape

(50, 6)

In [21]:
# Per-rating counts of the 50-example subset.
print_group(train_50)


#rating 1: 9
#rating 2: 10
#rating 3: 12
#rating 4: 14
#rating 5: 5


In [22]:
#100 examples
# frac=1/6500 of the 650000 training rows works out to 100 rows. This is an independent
# draw with its own seed, not an extension of the 50-example subset.
train_100 = train_clean2.sample(frac=1/6500, random_state= 10000).reset_index(drop=True)
# Show the (rows, columns) shape as the cell output.
train_100.shape

(100, 6)

In [23]:
# Per-rating counts of the 100-example subset.
print_group(train_100)

#rating 1: 29
#rating 2: 10
#rating 3: 13
#rating 4: 21
#rating 5: 27


In [24]:
#500 examples
# frac=1/1300 of the 650000 training rows works out to 500 rows (independent draw, own seed).
train_500 = train_clean2.sample(frac=1/1300, random_state= 421).reset_index(drop=True)
# Show the (rows, columns) shape as the cell output.
train_500.shape

(500, 6)

In [25]:
# Per-rating counts of the 500-example subset.
print_group(train_500)

#rating 1: 105
#rating 2: 94
#rating 3: 76
#rating 4: 116
#rating 5: 109


In [31]:
#1000 examples
# frac=1/650 of the 650000 training rows works out to 1000 rows (independent draw, own seed).
train_1000 = train_clean2.sample(frac=1/650, random_state= 4222).reset_index(drop=True)
# Show the (rows, columns) shape as the cell output.
train_1000.shape

(1000, 6)

In [32]:
# Per-rating counts of the 1000-example subset.
print_group(train_1000)

#rating 1: 217
#rating 2: 171
#rating 3: 221
#rating 4: 181
#rating 5: 210


In [35]:
# Peek at the label/text frame (not yet one-hot encoded); the 3000- and 5000-example
# subsets below are sampled from it because they filter rows by the 'label' column.
train_clean.head()

Unnamed: 0,label,text
0,1,"\n First of all i'm not a big fan of buffet, i..."
1,2,\n Thanks Yelp. I was looking for the words to...
2,3,\n Service was so-so. They were receiving a de...
3,3,\n Stamoolis Brothers is one of the Strip Dist...
4,1,\n I want to give a 2 stars because the servic...


In [36]:
# 3000 examples: draw 2/325 of the labelled rows, then discard some of them by label
train_3000 = train_clean.sample(frac=2/325, random_state= 421).reset_index(drop=True)

# Drop 1000 examples: the first 400 rows labelled 3, the first 100 labelled 4
# and the first 500 labelled 1, taken in the order they appear in the sample
idx_drop = [
    row_idx
    for label_value, n_rows in ((3, 400), (4, 100), (1, 500))
    for row_idx in train_3000[train_3000.label == label_value].index[:n_rows]
]
train_3000 = train_3000.drop(idx_drop)

# Transform back to one hot: swap 'label' for the rating_* columns
label_col_3000 = pd.get_dummies(train_3000['label'], prefix = 'rating')
train_3000 = train_3000.drop('label', axis = 1).join(label_col_3000)

train_3000.shape

(3000, 6)

In [37]:
# Per-rating counts of the 3000-example subset (after the label-based drops above).
print_group(train_3000)

#rating 1: 326
#rating 2: 754
#rating 3: 428
#rating 4: 680
#rating 5: 812


In [38]:
# 5000 examples: draw 3/325 of the labelled rows, then discard some of them by label
train_5000 = train_clean.sample(frac=3/325, random_state= 4).reset_index(drop=True)

# Drop 1000 examples: the first 400 rows labelled 3, the first 100 labelled 4
# and the first 500 labelled 1, taken in the order they appear in the sample
idx_drop = [
    row_idx
    for label_value, n_rows in ((3, 400), (4, 100), (1, 500))
    for row_idx in train_5000[train_5000.label == label_value].index[:n_rows]
]
train_5000 = train_5000.drop(idx_drop)

# Transform back to one hot: swap 'label' for the rating_* columns
label_col_5000 = pd.get_dummies(train_5000['label'], prefix = 'rating')
train_5000 = train_5000.drop('label', axis = 1).join(label_col_5000)

train_5000.shape

(5000, 6)

In [40]:
# Per-rating counts of the 5000-example subset (after the label-based drops above).
print_group(train_5000)

#rating 1: 681
#rating 2: 1205
#rating 3: 776
#rating 4: 1129
#rating 5: 1209


In [41]:
# Save all subsets to disk
# to_csv does not create missing folders, and nothing earlier in the notebook creates
# the 'experiment' sub-folder, so create it here to keep a fresh run from failing.
experiment_dir = DATA_PATH/'experiment'
experiment_dir.mkdir(parents=True, exist_ok=True)

# One CSV per subset size, with a header row and without the index column.
train_50.to_csv(experiment_dir/'train_50.csv', index=False )
train_100.to_csv(experiment_dir/'train_100.csv', index=False )
train_500.to_csv(experiment_dir/'train_500.csv', index=False )
train_1000.to_csv(experiment_dir/'train_1000.csv', index=False )
train_3000.to_csv(experiment_dir/'train_3000.csv', index=False )
train_5000.to_csv(experiment_dir/'train_5000.csv', index=False )

## Create train and validation files

In [5]:
from sklearn.model_selection import train_test_split

def split_val_train_sets(filename):
    """Split one CSV under DATA_PATH into a training part and a 20% validation part.

    ``filename`` is a '/'-separated path relative to DATA_PATH. The two parts
    are written to ``experiment/<name>_train.csv`` and
    ``experiment/<name>_val.csv`` under DATA_PATH, where ``<name>`` is the last
    path component of ``filename`` cut at its first '.'. Nothing is returned.
    """
    basename = filename.split('/')[-1]
    name = basename.split('.')[0]

    frame = pd.read_csv(DATA_PATH/filename)
    # Fixed random_state so the same rows land in each part on every run
    trainset, valset = train_test_split(frame, test_size=0.2, random_state=123)

    # Save both parts to disk without the index column
    for suffix, subset in (('_train.csv', trainset), ('_val.csv', valset)):
        subset.to_csv(DATA_PATH/('experiment/'+name+suffix), index = False)
    
    
    

In [7]:
# Produce the train/validation pair for every experiment subset, smallest first
for subset_size in (50, 100, 500, 1000, 3000, 5000):
    split_val_train_sets('experiment/train_%d.csv' % subset_size)