# Preprocessing

In [1]:
import pandas as pd
from library.sb_utils import save_file
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the ice cream review data; the CSV's first column becomes the row index.
data = pd.read_csv('../data/ice_cream_data.csv', index_col=0)

In [3]:
# Preview the first three rows to see which columns are available.
data.head(3)

Unnamed: 0,brand,key,author,stars,text,name,description,rating,rating_count,stop_text,good_review
0,bj,0_bj,Ilovebennjerry,3,"Not enough brownies!-Super good, don't get me ...",Salted Caramel Core,Find your way to the ultimate ice cream experi...,3.7,208,"not enough brownies!-super good, get wrong. bu...",Bad
1,bj,0_bj,Sweettooth909,5,I’m OBSESSED with this pint!-I decided to try ...,Salted Caramel Core,Find your way to the ultimate ice cream experi...,3.7,208,i’m obsessed pint!-i decided try although i’m ...,Good
2,bj,0_bj,LaTanga71,3,My favorite...More Caramel Please-My caramel c...,Salted Caramel Core,Find your way to the ultimate ice cream experi...,3.7,208,my favorite...more caramel please-my caramel c...,Bad


For modelling purposes, I am only interested in the `stop_text` and `good_review` columns. As a refresher, `stop_text` is the same as the `text` column but with all the stop words and special characters removed.

In [4]:
# Keep only the two modelling columns: the cleaned review text and its label.
preprocessed_data = data.loc[:, ['stop_text', 'good_review']]
preprocessed_data.head(3)

Unnamed: 0,stop_text,good_review
0,"not enough brownies!-super good, get wrong. bu...",Bad
1,i’m obsessed pint!-i decided try although i’m ...,Good
2,my favorite...more caramel please-my caramel c...,Bad


## Randomize the Data

Right now, the data is ordered by brand. This ordering will impact how I do the `train_test_split`, so I will mix up the data.

In [5]:
# Shuffle every row (frac=1) with a fixed seed so that Restart & Run All
# reproduces the same row order and therefore the same saved CSV files.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old index
# directly, instead of adding an 'index' column and dropping it in place.
preprocessed_data = (
    preprocessed_data
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [6]:
# Check the first five rows after shuffling and re-indexing.
preprocessed_data.head(5)

Unnamed: 0,stop_text,good_review
0,my fav-my fav ice cream ever. i could eat time...,Good
1,so good-they real chunk cookie dough chocolate...,Good
2,"nan-i personally like vanilla, addition ice cr...",Good
3,yum yum yum-this ice cream delicious! big yumm...,Good
4,"flavor awesome-i love it, i brought rose' & cr...",Good


## TF-IDF Vectorizer

I'll use a TF-IDF vectorizer to convert the textual data into numerical features. For now, I'll restrict the number of features to 1000.

In [7]:
# Single vectorizer definition (the earlier unconfigured TfidfVectorizer() was
# overwritten before use). max_features=1000 caps the vocabulary at the 1000
# terms with the highest frequency across the corpus.
tfidf = TfidfVectorizer(max_features=1000)

In [9]:
# Fit the vectorizer on the cleaned review text and convert the result to a
# dense array with .toarray(); the labels are the good_review column.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so the vocabulary and IDF weights are learned from rows
# that end up in the test set. Consider fitting on the training rows only —
# TODO confirm whether this leakage matters for the modelling notebook.
X = tfidf.fit_transform(preprocessed_data['stop_text']).toarray()
y = preprocessed_data['good_review']

## Split Into Test and Training Sets

I'll use `train_test_split()` from `sklearn` to split the data into training and testing sets, 70-30.

In [10]:
# Wrap the dense TF-IDF array in a DataFrame whose columns are the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
X = pd.DataFrame(X, columns=tfidf.get_feature_names_out())

In [11]:
# Hold out 30% of the rows for testing; random_state pins the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [12]:
# Training features as a DataFrame labelled with the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
tfidf_df = pd.DataFrame(X_train, columns=tfidf.get_feature_names_out())

## Save Data

Now, I'll just save the training and testing data so I can use it for modelling.

In [13]:
# Save the shuffled text/label frame as a CSV in the data directory.
# `datapath` is reused by the save cells that follow. As the recorded output
# shows, save_file asks for confirmation before overwriting an existing file.
datapath = '../data'
save_file(preprocessed_data, 'preprocessed_data.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data/preprocessed_data.csv"


In [14]:
# Save the held-out TF-IDF feature rows.
save_file(X_test, 'X_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data/X_test.csv"


In [15]:
# Save the training TF-IDF feature rows.
save_file(X_train, 'X_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data/X_train.csv"


In [16]:
# Save the good_review labels that pair with the training features.
save_file(y_train, 'y_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data/y_train.csv"


In [17]:
# Save the good_review labels that pair with the held-out features.
save_file(y_test, 'y_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data/y_test.csv"
