# Amazon Reviews with SVM
Dataset (Kaggle): https://www.kaggle.com/bittlingmayer/amazonreviews

## Preprocessing

In [3]:
import zipfile
# Unpack the downloaded archive into ./input. The context manager closes the
# archive even if extraction raises part-way through, which the previous
# manual close() did not guarantee.
with zipfile.ZipFile('./amazonreviews.zip', 'r') as zip_ref:
    zip_ref.extractall('./input')

In [1]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
# Show the contents of ./input so the reader can confirm which data files are present.
print(os.listdir("./input"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


In [4]:
# Read every line of the bz2-compressed train/test files into memory as raw
# bytes. Each `with` block closes its file handle as soon as the read is done
# (or fails) instead of relying on `del` to release it.
with bz2.BZ2File('./input/train.ft.txt.bz2') as train_file:
    train_file_lines = train_file.readlines()

with bz2.BZ2File('./input/test.ft.txt.bz2') as test_file:
    test_file_lines = test_file.readlines()

# Remove the (already closed) handles from the notebook namespace.
del train_file, test_file

### Parsing Text

In [5]:
# Patterns are compiled once up front. Raw strings are required here: '\d' in
# a plain string literal is an invalid escape sequence.
_DIGIT_RE = re.compile(r'\d')
_URL_TOKEN_RE = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
# Cheap substring checks that decide whether the URL regex is run at all.
_URL_HINTS = ('www.', 'http:', 'https:', '.com')


def _parse_review_lines(raw_lines):
    """Convert raw ``<label> <text>`` byte lines into labels and cleaned text.

    Parameters
    ----------
    raw_lines : iterable of bytes
        UTF-8 encoded lines; the first space separates the label token from
        the review text.

    Returns
    -------
    (labels, sentences) : tuple of two lists with one entry per input line
        ``labels`` holds 0 when the label token is ``__label__1`` and 1 for
        anything else. ``sentences`` holds the text lower-cased, with every
        digit replaced by ``0`` and, for sentences containing one of
        ``_URL_HINTS``, every space-delimited token that ends in a dot plus
        three lowercase letters replaced by ``<url>``.

    Raises
    ------
    IndexError
        If a line contains no space (no text after the label).
    """
    labels = []
    sentences = []
    for raw in raw_lines:
        # Decode one line at a time so no second full-size list of decoded
        # strings has to exist alongside the raw bytes.
        parts = raw.decode('utf-8').split(' ', 1)
        labels.append(0 if parts[0] == '__label__1' else 1)

        # Strip only an actual trailing newline; slicing off the last
        # character unconditionally would eat a real character on a final
        # line that has no newline.
        sentence = parts[1].rstrip('\n').lower()
        sentence = _DIGIT_RE.sub('0', sentence)
        if any(hint in sentence for hint in _URL_HINTS):
            sentence = _URL_TOKEN_RE.sub("<url>", sentence)
        sentences.append(sentence)
    return labels, sentences


# Same pipeline for both splits, so train and test cannot drift apart.
train_labels, train_sentences = _parse_review_lines(train_file_lines)
test_labels, test_sentences = _parse_review_lines(test_file_lines)

# The raw byte lines are no longer needed; free them before tokenizing.
del train_file_lines, test_file_lines
gc.collect()

0

### Tokenizing the Strings

In [6]:
from keras.preprocessing import sequence, text

# max_features is handed to the tokenizer as num_words; maxlen is the length
# every sequence is padded/truncated to by pad_sequences.
max_features = 20000
maxlen = 100

# The tokenizer is fitted on the training sentences only and then reused
# unchanged for the test sentences.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_sentences)

# Text -> integer sequences for both splits.
tokenized_train, tokenized_test = [
    tokenizer.texts_to_sequences(corpus)
    for corpus in (train_sentences, test_sentences)
]

# Integer sequences -> fixed-width matrices.
X_train, X_test = [
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (tokenized_train, tokenized_test)
]

Using TensorFlow backend.


### Save Data

In [13]:
# np.save does not create missing parent directories, so make sure ./data
# exists before writing (a no-op when it is already there).
os.makedirs('./data', exist_ok=True)

# Persist the padded token matrices for reuse without re-tokenizing.
np.save('./data/X_train.npy', X_train)
np.save('./data/X_test.npy', X_test)

In [20]:
# `pickle` is otherwise only imported further down, in the PCA section, so on
# a fresh kernel run top-to-bottom this cell raised NameError. Import it here,
# at its first use.
import pickle

# NOTE: despite the .txt extension, this file holds a binary pickle of the
# label list, not text.
with open("train_labels.txt", "wb") as fp:
    pickle.dump(train_labels, fp)

In [21]:
# Write the test labels as a binary pickle (the .txt suffix is nominal only).
with open("test_labels.txt", mode="wb") as labels_out:
    pickle.dump(test_labels, labels_out)

## PCA

In [7]:
from sklearn.decomposition import PCA

# Fit PCA with its default settings on the padded training matrix only;
# fit() returns the estimator itself, so it can be bound in one step.
pca = PCA().fit(X_train)

# Leave the estimator as the last expression so the cell displays its repr.
pca

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
import pickle
# Serialize the fitted PCA estimator to an in-memory bytes object.
pca_pickle = pickle.dumps(pca)

In [9]:
# Persist the fitted PCA estimator itself. Previously the already-serialized
# bytes in `pca_pickle` were pickled a second time, so reading the file back
# required pickle.loads(pickle.load(f)). Now a single pickle.load(f) returns
# the PCA object.
# NOTE(review): any existing reader that unwraps two layers must drop the
# outer pickle.loads call.
with open('pca.pickle', 'wb') as handle:
    pickle.dump(pca, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
# Print the fitted PCA's explained_variance_ratio_ array (one value per component).
print(pca.explained_variance_ratio_)

[0.04613539 0.02262788 0.01926799 0.0155462  0.0140898  0.01357794
 0.01328014 0.01307882 0.0129339  0.01287642 0.01287076 0.01283081
 0.01278237 0.01276266 0.01270188 0.01266341 0.01264085 0.01257985
 0.01257123 0.01246616 0.01244786 0.01240732 0.01234973 0.01230171
 0.0122542  0.01219278 0.01214587 0.01204022 0.01202794 0.01193335
 0.01192263 0.01182904 0.01174967 0.01171773 0.01161138 0.01159511
 0.01147512 0.0113227  0.0112253  0.01109778 0.01104041 0.01093852
 0.01080225 0.0107271  0.01059973 0.01047585 0.01033959 0.01023544
 0.01010517 0.00996293 0.00984386 0.0097303  0.00958001 0.00943182
 0.00935602 0.00922877 0.00907238 0.00895314 0.008857   0.0087271
 0.00860544 0.00848885 0.00834924 0.00821025 0.00808963 0.00799395
 0.00787149 0.00774483 0.00761908 0.00748702 0.0074027  0.00724897
 0.00716123 0.00700688 0.00688127 0.00681126 0.00670296 0.00655537
 0.00644768 0.00634899 0.00625355 0.00612975 0.00601842 0.00593245
 0.00580479 0.00571289 0.00562016 0.00549369 0.00537668 0.00528

### PCA Applied

In [14]:
# Project both splits with the PCA fitted on X_train. The estimator was
# constructed without n_components, so no component is dropped here.
X_train_pca, X_test_pca = [pca.transform(split) for split in (X_train, X_test)]

In [15]:
# Store the PCA-projected matrices alongside the raw token matrices in ./data.
np.save(file='./data/X_train_pca.npy', arr=X_train_pca)
np.save(file='./data/X_test_pca.npy', arr=X_test_pca)