In [1]:
# Importing the necessary modules
import pandas as pd
import tarfile
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load in data: unpack the Yelp polarity archive into the data/ directory.
# The context manager closes the archive even if extraction fails part-way;
# `data_tg` remains bound afterwards (as a closed TarFile object).
# NOTE(review): extractall() writes whatever member paths the archive holds,
# so only run this on an archive from a trusted source (recent Python
# releases accept a `filter='data'` argument to restrict what is written).
with tarfile.open('data/yelp_review_polarity_csv.tgz') as data_tg:
    data_tg.extractall('data')

In [5]:
# Inspect the tar archive handle (already closed by the load cell above);
# this displays the TarFile object's repr, not the archive's contents
data_tg

<tarfile.TarFile at 0x1d80581d488>

In [10]:
# Read the training split. The CSV has no header row, so header=None makes
# pandas label the columns with the integers 0 and 1.
train_df = pd.read_csv(
    'data/yelp_review_polarity_csv/train.csv',
    header=None,
)
print('Train shape:', train_df.shape)
train_df.head(n=5)

Train shape: (560000, 2)


Unnamed: 0,0,1
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [9]:
# Read the test split. The CSV has no header row, so header=None makes
# pandas label the columns with the integers 0 and 1.
test_df = pd.read_csv(
    'data/yelp_review_polarity_csv/test.csv',
    header=None,
)
print('Test shape:', test_df.shape)
test_df.head(n=5)

Test shape: (38000, 2)


Unnamed: 0,0,1
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [11]:
# Change labels into 1 and 0
def _binarize_polarity(labels):
    """Map raw polarity labels (1, 2) to binary labels (0, 1).

    Label 2 becomes 1 and label 1 becomes 0. The conversion is safe to
    re-run: a column that already contains a 0 and nothing outside {0, 1}
    is treated as converted and returned unchanged, instead of being
    compared against 2 again (which would silently turn every label into 0).

    Parameters
    ----------
    labels : pandas.Series
        Label column holding either the raw values or the converted ones.

    Returns
    -------
    pandas.Series
        Integer series of 0/1 labels.

    Raises
    ------
    ValueError
        If the column holds values outside both {1, 2} and {0, 1}.
    """
    seen = set(labels.unique())
    if 0 in seen and seen <= {0, 1}:
        # A 0 can only come from a previous conversion; nothing left to do.
        return labels
    if not seen <= {1, 2}:
        raise ValueError(f'Unexpected label values: {seen}')
    return (labels == 2).astype(int)


train_df[0] = _binarize_polarity(train_df[0])
test_df[0] = _binarize_polarity(test_df[0])

In [12]:
# Show the first five training rows to check the label column after conversion
train_df.head(n=5)

Unnamed: 0,0,1
0,0,"Unfortunately, the frustration of being Dr. Go..."
1,1,Been going to Dr. Goldberg for over 10 years. ...
2,0,I don't know what Dr. Goldberg was like before...
3,0,I'm writing this review to give you a heads up...
4,1,All the food is great here. But the best thing...


In [13]:
# Show the first five test rows to check the label column after conversion
test_df.head(n=5)

Unnamed: 0,0,1
0,1,"Contrary to other reviews, I have zero complai..."
1,0,Last summer I had an appointment to get new ti...
2,1,"Friendly staff, same starbucks fair you get an..."
3,0,The food is good. Unfortunately the service is...
4,1,Even when we didn't have a car Filene's Baseme...


# Prepare data for BERT

In [14]:
# Train data in a four-column layout: id, label, alpha (constant 'a'), text.
# Line breaks inside a review are replaced by a single space rather than
# removed outright, so the words on either side are not fused together.
# NOTE(review): the dataset readme describes line breaks as being stored as a
# literal backslash followed by 'n'; the pattern covers that escape as well
# as real newline characters -- confirm against the extracted readme.txt.
df_bert = pd.DataFrame({
    'id': range(len(train_df)),
    'label': train_df[0],
    'alpha': ['a'] * len(train_df),
    'text': train_df[1].replace(r'(?:\\n|\n)+', ' ', regex=True)
})

In [15]:
# Hold out 1% of the BERT-formatted rows as the *dev* set; the remainder is
# *train*. The fixed random_state keeps the split repeatable between runs.
bert_train, bert_dev = train_test_split(
    df_bert,
    test_size=0.01,
    random_state=42,
)

# Peek at the first rows of the training portion
bert_train.head(n=5)

Unnamed: 0,id,label,alpha,text
42237,42237,0,a,DO NOT I REPEAT DO NOT GO HERE. ..I'm not a hu...
487854,487854,0,a,Cute little place that tries to be a small-tow...
40384,40384,0,a,"Enzo used to do a good job, but then he got co..."
438779,438779,1,a,So happy this place is less then 1.5 miles fro...
286866,286866,0,a,We came here for a Christmas Day brunch. After...


In [16]:
# Test data in a two-column layout: id, text (no label column).
# Line breaks inside a review are replaced by a single space rather than
# removed outright, so the words on either side are not fused together.
# NOTE(review): the dataset readme describes line breaks as being stored as a
# literal backslash followed by 'n'; the pattern covers that escape as well
# as real newline characters -- confirm against the extracted readme.txt.
bert_test = pd.DataFrame({
    'id': range(len(test_df)),
    'text': test_df[1].replace(r'(?:\\n|\n)+', ' ', regex=True)
})

# Inspect
bert_test.head()

Unnamed: 0,id,text
0,0,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,3,The food is good. Unfortunately the service is...
4,4,Even when we didn't have a car Filene's Baseme...


# Saving dataframes to .tsv format

In [17]:
# Write each split as a tab-separated file with neither index nor header row.
for frame, path in (
    (bert_train, 'data/train.tsv'),
    (bert_dev, 'data/dev.tsv'),
    (bert_test, 'data/test.tsv'),
):
    frame.to_csv(path, sep='\t', index=False, header=False)