In [1]:
# imports
import pandas as pd, numpy as np

In [2]:
# data import
df = pd.read_csv("amazon_data.csv")
df = df.drop(columns=["asin", "imgUrl", "productURL"],axis=1)
df = df[df["reviews"] > 0]
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
1,"Anker Soundcore mini, Super-Portable Bluetooth...",4.7,98099,23.99,True,0,Hi-Fi Speakers
2,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
3,"Echo Dot with clock (5th generation, 2022 rele...",4.7,7205,31.99,False,0,Hi-Fi Speakers
4,Introducing Echo Pop | Full sound compact Wi-F...,4.6,1881,17.99,False,0,Hi-Fi Speakers
...,...,...,...,...,...,...,...
2222733,Speed MaxX BODY ARMOUR CE MOTORBIKE/MOTORCYCLE...,3.6,66,49.99,False,0,Motorbike Clothing
2222734,Motorcycle Clothing Suit - Motorbike Suit With...,4.1,5,158.99,False,0,Motorbike Clothing
2222735,GREAT BIKERS GEAR - Bobber Cafe Brat Style Lea...,3.6,12,14.99,False,0,Motorbike Clothing
2222740,Texpeed Mens Motorcycle Motorbike Biker Trouse...,4.3,404,79.99,False,0,Motorbike Clothing


## Pre-Processing

### Shuffle rows
Rows have to be shuffled otherwise there will be issues due the test-train split. Data is currently sorted by categoryName, this will cause problems later unless shuffled before modelling.

In [3]:
index_array = np.arange(len(df))
np.random.shuffle(index_array)

df = df.iloc[index_array].reset_index(drop=True)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,Studio Boketto Funny And Rude LOL 40th Birthda...,4.5,69,3.25,False,0,Office Paper Products
1,"Spear & Jackson Paving and Patio Cleaner, 1.3m...",4.0,356,16.99,False,300,Agricultural Equipment & Supplies
2,Rossignol Boy's Girl 1/2 Zip Fleece Technical ...,5.0,1,39.99,False,0,Sports & Outdoors
3,Silent Monsters Mouse Mat Small 24 x 20 cm / 1...,4.6,5715,7.99,False,0,"Keyboards, Mice & Input Devices"
4,Ring Floodlight Cam Wired Pro | Latest Model |...,5.0,2,189.00,False,0,Surveillance Cameras
...,...,...,...,...,...,...,...
1048317,Chaos Chinook Unisex Balaclava,3.0,16,19.12,False,0,Sports & Outdoors
1048318,Huxters Funny Card for Mum – Premium Quality A...,4.6,3842,3.99,False,0,Office Paper Products
1048319,Xiatiaosann RJ11 to RJ45 Ethernet Cable Adapte...,5.0,5,9.99,False,0,"Telephones, VoIP & Accessories"
1048320,BOGI 17oz Insulated Water Bottle Double Wall V...,4.5,8747,9.89,False,100,Sports & Outdoors


### isBestSeller: bool -> int

In [4]:
dict_map = {True: 1, False: 0}
df['isBestSeller'] = df['isBestSeller'].map(dict_map)

### categoryName: string -> float  -- FeatureHasher

In [5]:
n_features = len(df.categoryName.unique())

In [6]:
from sklearn.feature_extraction import FeatureHasher

# Ensure everything is in string format for hashing.
categories = df.categoryName.astype(str)  

# Convert to column to list of lists for iterating w/ FeatureHasher
categories = [[category] for category in categories]

# FeatureHasher
hasher = FeatureHasher(n_features=n_features, input_type="string")
X_category = hasher.transform(categories).toarray().astype("float32")

# sparse matrix -> array -> DataFrame
hashed_df = pd.DataFrame(X_category, columns=[f"hash_{i}" for i in range(n_features)])

# Concatenate dataframes
data = pd.concat([df, hashed_df], axis=1)
data = data.drop(axis=1,columns="categoryName")

### title: string -> int -- Word2Vec

In [7]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [8]:
model_path = "Models/word2vec_model.model"
model_w2v = Word2Vec.load(model_path)

# Define a function to get the word vectors for the first 5 words of a product name
def get_word_vectors(product_name):
    try:
        five_words = word_tokenize(product_name.lower())[:5]
        vectors = [model_w2v.wv[word] for word in five_words]
        
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan
    except KeyError:
        return np.nan      # Handle the case where a word is not in the vocabulary

data['average_vector'] = data['title'].apply(get_word_vectors)

# Expand the average vector into several columns
for i in range(model_w2v.vector_size):
    data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


In [9]:
numerical = data.drop(axis=1, columns=["title", "average_vector"])
numerical

Unnamed: 0,stars,reviews,price,isBestSeller,boughtInLastMonth,hash_0,hash_1,hash_2,hash_3,hash_4,...,embedding_dim_91,embedding_dim_92,embedding_dim_93,embedding_dim_94,embedding_dim_95,embedding_dim_96,embedding_dim_97,embedding_dim_98,embedding_dim_99,embedding_dim_100
0,4.5,69,3.25,0,0,0.0,0.0,0.0,0.0,0.0,...,-1.311622,0.840530,-0.029333,-0.317172,-0.263444,0.616612,-1.040384,1.310569,0.146163,0.076832
1,4.0,356,16.99,0,300,0.0,0.0,0.0,0.0,0.0,...,0.798857,0.238643,1.124427,0.222928,0.361501,-1.126004,-0.926842,-0.032637,0.517319,0.413753
2,5.0,1,39.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.454549,0.790870,0.288572,-1.719378,2.737636,-1.064807,0.549627,-1.057401,-2.085778,1.553097
3,4.6,5715,7.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.259878,-0.366005,-0.284666,-1.854801,0.191409,0.232027,-1.037798,1.395128,-0.527646,1.314183
4,5.0,2,189.00,0,0,0.0,0.0,0.0,0.0,0.0,...,0.879743,0.191627,-0.450113,-0.135945,-0.766313,1.797278,-1.646969,0.175847,-0.105215,-1.220792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048317,3.0,16,19.12,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.254574,-0.419862,-0.922883,-1.854843,1.233468,-0.664141,0.699439,-0.355017,0.116157,-0.075391
1048318,4.6,3842,3.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-1.349776,0.065905,1.439999,-2.430358,-1.277970,-0.537201,-0.337828,0.859752,-0.162354,-0.033589
1048319,5.0,5,9.99,0,0,0.0,0.0,0.0,0.0,0.0,...,3.084158,1.007024,1.796641,-0.634729,0.026946,-0.215315,-1.002872,1.358340,-1.383252,-2.073856
1048320,4.5,8747,9.89,0,100,0.0,0.0,0.0,0.0,0.0,...,-0.886025,-0.893399,-0.032808,-0.498111,0.049511,-1.238632,-0.909675,-0.576970,-0.389340,0.700601


In [10]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048322 entries, 0 to 1048321
Columns: 401 entries, stars to embedding_dim_100
dtypes: float32(396), float64(2), int64(3)
memory usage: 1.6 GB


## Data Split and Transformation

In [11]:
y = numerical.price.values
X = numerical

X_train = X[0:-40000].drop(axis=1,columns="price").values
y_train = y[0:-40000]

X_test = X[-40000:].drop(axis=1,columns="price").values
y_test = y[-40000:]

In [12]:
# Log transformation
y_train_transformed = np.log1p(y_train)  

## Modelling

In [13]:
# import modules
from sklearn.linear_model import LinearRegression, ElasticNet
import pickle

### LinearRegression()

In [14]:
# Linear Regression
LR = LinearRegression()
LR.fit(X_train, y_train_transformed)

y_pred_LR = LR.predict(X_test)

# Inverse-transform to return values back to original scale
y_pred_LR = np.expm1(y_pred_LR)

In [18]:
pkl_out = open('Models/linear_regression.pkl' , 'wb')
pickle.dump(LR, pkl_out)
pkl_out.close()

Linear Regression: 24.6 seconds

### ElasticNet()

In [16]:
EN = ElasticNet()
EN.fit(X_train, y_train)

y_pred_EN = EN.predict(X_test)

In [17]:
pkl_out = open('Models/elastic_net.pkl' , 'wb')
pickle.dump(EN, pkl_out)
pkl_out.close()

### Comparison

In [66]:
compare = pd.DataFrame({"productName": df.title.iloc[-40000:],
                        "rating": df.stars.iloc[-40000:],
                        "Reviews": df.reviews.iloc[-40000:],
                        "actualPrice": y_test,
                        "y_pred_LR": y_pred_LR,
                        "y_pred_EN": y_pred_EN
                        }).reset_index()
compare.sample(10).round(2)

Unnamed: 0,index,productName,rating,Reviews,actualPrice,y_pred_LR,y_pred_EN
10930,1019252,ORIGINAL GOLF GLOBE GAME - Water Globe Golf-Ba...,4.7,295,56.08,28.72,30.74
28136,1036458,"Toddlers and Baby Boys' Rompers, Pack of 3",4.8,15835,27.7,13.51,-14.51
36844,1045166,"Laptop Bag 17 Inch, Large Laptop Messenger Sho...",4.6,2002,29.99,28.86,129.6
20488,1028810,"Thsucords 8K 4K HDMI Cable 1M, High Speed Brai...",4.5,294,7.79,21.78,146.11
3578,1011900,MEGA Pokémon Action Figure Building Toys for K...,4.5,15,18.77,20.18,52.27
21421,1029743,Janod - Pure Wooden Train - 2-in-1 Pull-Along ...,4.6,174,26.02,25.23,17.58
15697,1024019,Refrze Computer Keyboard Stand-PC Keyboard Sta...,4.7,2461,18.34,54.17,113.95
8006,1016328,Kuryakyn 3262 Motorcycle Lighting Accessory: O...,4.2,23,176.24,31.61,45.79
30902,1039224,Chear Shea Butter Cream 500ml with Olive & Alo...,4.1,10,7.95,10.14,17.28
19656,1027978,OCDSLYGB Ku-romi Plush Toys Cartoon Little Dev...,4.5,3,9.99,10.61,15.71


## Evaluation

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mse = mean_squared_error(y_test, y_pred_LR)
print(f'Mean Squared Error LinearRegression: {mse}')

Mean Squared Error LinearRegression: 14582.455117647049


In [None]:
mse = mean_squared_error(y_test, y_pred_EN)
print(f'Mean Squared Error ElasticNet: {mse}')

Mean Squared Error ElasticNet: 14540.771295788732
