In [5]:
# imports
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [6]:
# data import
df = pd.read_csv("amazon_data.csv")
df = df.drop(columns=["asin", "imgUrl", "productURL"],axis=1)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
1,"Anker Soundcore mini, Super-Portable Bluetooth...",4.7,98099,23.99,True,0,Hi-Fi Speakers
2,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
3,"Echo Dot with clock (5th generation, 2022 rele...",4.7,7205,31.99,False,0,Hi-Fi Speakers
4,Introducing Echo Pop | Full sound compact Wi-F...,4.6,1881,17.99,False,0,Hi-Fi Speakers
...,...,...,...,...,...,...,...
2222737,"Motorbike Armour, Motorbike Clothing, Cycling ...",0.0,0,22.34,False,0,Motorbike Clothing
2222738,PROFIRST Waterproof Motorcycle 2 Piece Ladies ...,0.0,0,97.99,False,0,Motorbike Clothing
2222739,Men’s Motorcycle Motorbike Biker Trousers Kevl...,0.0,0,52.99,False,0,Motorbike Clothing
2222740,Texpeed Mens Motorcycle Motorbike Biker Trouse...,4.3,404,79.99,False,0,Motorbike Clothing


## Pre-Processing

### Shuffle rows
Rows have to be shuffled otherwise there will be issues due the test-train split. Data is currently sorted by categoryName, this will cause problems later unless shuffled before modelling.

In [7]:
index_array = np.arange(len(df))
np.random.shuffle(index_array)

df = df.iloc[index_array].reset_index(drop=True)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,Uvistat Sun Cream SPF30 125 ml,4.6,129,13.99,False,0,Skin Care
1,Underwear Men's Sexy Set Sexy Boxer Shorts Tro...,0.0,0,2.59,False,0,Sports & Outdoors
2,Heavy Duty Corner Steel Shelving Garage Rackin...,4.3,67,119.99,False,0,Storage & Home Organisation
3,8T0919603G For A4 S4 RS4 8K A5 S5 RS5 Q5 8R 8T...,0.0,0,129.98,False,0,External Optical Drives
4,"Jaragar Golf Swing Trainer, Power Flex Golf Sw...",4.2,49,33.14,False,0,Sports & Outdoors
...,...,...,...,...,...,...,...
2222737,"FOLOSAFENAR Golf Bags, Locked with a Lock Larg...",0.0,0,32.59,False,0,Sports & Outdoors
2222738,"Wrestling Shoes for Men, Non-Slip Boxing Shoes...",0.0,0,69.99,False,0,Boxing Shoes
2222739,TOYANDONA 6 Pcs children's crown headband baby...,0.0,0,20.37,False,0,Sports & Outdoors
2222740,"Scarpa Mojito Kid, Unisex Kids Trail Running S...",0.0,0,81.85,False,0,Sports & Outdoors


### Drop data
Unfortunately, my computer is unable to process the 2+ million rows of data; therefore, I will limit the data to 800,000 rows, randomly selected to prevent bias.

In [8]:
df = df.iloc[:800000,:]

### isBestSeller: bool -> int

In [9]:
dict_map = {True: 1, False: 0}
df['isBestSeller'] = df['isBestSeller'].map(dict_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['isBestSeller'] = df['isBestSeller'].map(dict_map)


### categoryName: string -> float  -- FeatureHasher

In [10]:
n_features = len(df.categoryName.unique())

In [11]:
from sklearn.feature_extraction import FeatureHasher

# Ensure everything is in string format for hashing.
categories = df.categoryName.astype(str)  

# Convert to column to list of lists for iterating w/ FeatureHasher
categories = [[category] for category in categories]

# FeatureHasher
hasher = FeatureHasher(n_features=n_features, input_type="string")
X_category = hasher.transform(categories).toarray().astype("float32")

# sparse matrix -> array -> DataFrame
hashed_df = pd.DataFrame(X_category, columns=[f"hash_{i}" for i in range(n_features)])

# Concatenate dataframes
data = pd.concat([df, hashed_df], axis=1)
data = data.drop(axis=1,columns="categoryName")

### title: string -> int -- Word2Vec

In [12]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

In [None]:
nltk.download('punkt')

# Tokenize product names
product_names = df['title']
tokenized_product_names = [word_tokenize(name.lower()) for name in product_names]

# Train Word2Vec model
model_w2v = Word2Vec(sentences=tokenized_product_names, vector_size=100, window=5, min_count=1, workers=4)

# Save model
model_w2v.save('word2vec_model.model')

In [13]:
model_path = "word2vec_model.model"
model_w2v = Word2Vec.load(model_path)

# Define a function to get the word vectors for the first 5 words of a product name
def get_word_vectors(product_name):
    try:
        five_words = word_tokenize(product_name.lower())[:5]
        vectors = [model_w2v.wv[word] for word in five_words]
        
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan
    except KeyError:
        return np.nan      # Handle the case where a word is not in the vocabulary

data['average_vector'] = data['title'].apply(get_word_vectors)

# Expand the average vector into several columns
for i in range(model_w2v.vector_size):
    data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


In [14]:
numerical = data.drop(axis=1, columns=["title", "average_vector"])
numerical

Unnamed: 0,stars,reviews,price,isBestSeller,boughtInLastMonth,hash_0,hash_1,hash_2,hash_3,hash_4,...,embedding_dim_91,embedding_dim_92,embedding_dim_93,embedding_dim_94,embedding_dim_95,embedding_dim_96,embedding_dim_97,embedding_dim_98,embedding_dim_99,embedding_dim_100
0,4.6,129,13.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.512237,1.246293,1.336042,-0.499967,-0.045397,0.725670,0.649035,-0.883846,-0.192357,0.428974
1,0.0,0,2.59,0,0,0.0,0.0,0.0,0.0,0.0,...,0.381603,-0.698765,-0.646395,-0.487055,0.810207,-1.735097,-0.314623,-1.637798,-2.911309,0.460558
2,4.3,67,119.99,0,0,0.0,0.0,0.0,0.0,0.0,...,1.442481,-0.532286,1.870287,0.347537,0.159064,0.301447,-2.918429,0.566702,0.868300,-0.044470
3,0.0,0,129.98,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.623309,-0.648426,0.287849,-1.181001,-0.648739,1.003400,-0.426561,-0.084841,0.463334,0.283238
4,4.2,49,33.14,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.339577,1.559958,0.060626,1.034951,1.017364,-0.564131,-1.173586,0.468170,1.890968,0.118000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,4.7,4,4.39,0,0,0.0,0.0,0.0,0.0,0.0,...,1.815826,-0.022067,0.224914,-0.635839,-0.731933,-0.184009,-1.362830,0.130098,-1.211426,0.867083
799996,0.0,0,19.50,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.594159,-0.007387,-0.576267,0.733034,0.559051,-0.354597,-1.681033,-0.793832,0.167814,0.899996
799997,4.7,4,3.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.145289,0.502903,0.013630,-0.531748,-1.104955,-0.677679,-2.700907,1.063005,-0.750902,-0.196482
799998,0.0,0,51.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.006706,-1.237052,0.270167,1.464110,0.449700,0.008551,0.912924,-1.487889,-1.604521,0.292503


In [15]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Columns: 401 entries, stars to embedding_dim_100
dtypes: float32(396), float64(2), int64(3)
memory usage: 1.2 GB


## Data Split and Transformation

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [47]:
y = numerical.price.values
X = numerical.drop(axis=1,columns="price").values

X_train = X[0:-80000]
y_train = y[0:-80000]

X_test = X[-80000:]
y_test = y[-80000:]

In [67]:
# Log transformation
y_train_transformed = np.log1p(y_train)  

## Modelling

In [19]:
# import modules
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [20]:
model = RandomForestRegressor()
model = GradientBoostingRegressor()

In [69]:
# Linear Regression
LR = LinearRegression()
LR.fit(X_train, y_train_transformed)

y_pred_LR = LR.predict(X_test)

# Inverse-transform to return values back to original scale
y_pred_LR = np.expm1(y_pred_LR)

In [65]:
tester = pd.DataFrame({"productName": df.title.iloc[-80000:],
                        "rating": df.stars.iloc[-80000:],
                        "Reviews": df.reviews.iloc[-80000:],
                        "actualPrice": y_test,
                        "y_pred_LR": y_pred_LR})
tester

Unnamed: 0,productName,rating,Reviews,actualPrice,y_pred_LR
720000,Ravensden Soft Toy Otter Standing 45cm Eco Col...,5.0,4,18.99,14.442851
720001,"Yolev 24PCS Fruit Fly Sticky Traps, for Fruit ...",4.1,16,3.99,8.360139
720002,ultimatesalestore Multi-Compartment Shelf Grea...,4.5,317,21.49,39.041551
720003,NOAGENJT Plain Black T Shirt Mens Tshirt Summe...,0.0,0,5.99,8.287716
720004,PURPLLE Vibration Helmet TK Laryngeal Control ...,0.0,0,7.23,39.231816
...,...,...,...,...,...
799995,Raguso Mini Picture Case 3 Inch Mini Polaroid ...,4.7,4,4.39,22.149087
799996,DJMJHG seamless gym clothing for women workout...,0.0,0,19.50,19.282056
799997,Tosakey 2Pcs Air Fryer Liners Silicone Ninja A...,4.7,4,3.99,16.044297
799998,YGCLOTHES Women's Heated Vest with 3 Heating L...,0.0,0,51.99,34.692624


In [76]:
# LASSO
lassoReg = Lasso(alpha=0.1)
lassoReg.fit(X_train, y_train_transformed)
y_pred_LASSO = lassoReg.predict(X_test).astype("float32")

MemoryError: Unable to allocate 2.15 GiB for an array with shape (720000, 400) and data type float64