In [1]:
# imports
import pandas as pd, numpy as np

In [2]:
# data import
df = pd.read_csv("amazon_data.csv")
df = df.drop(columns=["asin", "imgUrl", "productURL"],axis=1)
df = df[df["reviews"] > 0]
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
1,"Anker Soundcore mini, Super-Portable Bluetooth...",4.7,98099,23.99,True,0,Hi-Fi Speakers
2,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
3,"Echo Dot with clock (5th generation, 2022 rele...",4.7,7205,31.99,False,0,Hi-Fi Speakers
4,Introducing Echo Pop | Full sound compact Wi-F...,4.6,1881,17.99,False,0,Hi-Fi Speakers
...,...,...,...,...,...,...,...
2222733,Speed MaxX BODY ARMOUR CE MOTORBIKE/MOTORCYCLE...,3.6,66,49.99,False,0,Motorbike Clothing
2222734,Motorcycle Clothing Suit - Motorbike Suit With...,4.1,5,158.99,False,0,Motorbike Clothing
2222735,GREAT BIKERS GEAR - Bobber Cafe Brat Style Lea...,3.6,12,14.99,False,0,Motorbike Clothing
2222740,Texpeed Mens Motorcycle Motorbike Biker Trouse...,4.3,404,79.99,False,0,Motorbike Clothing


## Pre-Processing

### Shuffle rows
Rows have to be shuffled otherwise there will be issues due the test-train split. Data is currently sorted by categoryName, this will cause problems later unless shuffled before modelling.

In [3]:
index_array = np.arange(len(df))
np.random.shuffle(index_array)

df = df.iloc[index_array].reset_index(drop=True)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,Fluid Audio DS5: Decoupling adjustable monitor...,4.4,394,62.00,False,0,Recording & Computer
1,Panana Round White Colored Top Small Medium Ki...,4.5,42,99.99,False,0,Dining Room Furniture
2,CJ Wildbird Foods Ltd National Trust Ultimate ...,4.0,1,5.99,False,0,Bird & Wildlife Care
3,rOtring Tikky Ballpoint Pens with Rubberised G...,4.6,30,9.69,False,0,"Pens, Pencils & Writing Supplies"
4,Askham Mens Ultra Warm Wind and Water Resistan...,4.7,86,59.00,False,0,Men
...,...,...,...,...,...,...,...
1048317,Ben Sayers Men's M8 Golf Package Set,4.2,22,219.00,False,0,Sports & Outdoors
1048318,Grandson 9th Birthday Card & Badge - Gold Foil...,4.6,243,3.79,False,0,Office Paper Products
1048319,Gravity LTS 01 B SET 1 - Adjustable Stand for ...,4.5,26,77.00,False,0,Furniture & Lighting
1048320,Lottie Pool Party Doll | Bath | Fun Bathtub To...,4.2,697,23.99,False,0,Dolls & Accessories


### isBestSeller: bool -> int

In [4]:
dict_map = {True: 1, False: 0}
df['isBestSeller'] = df['isBestSeller'].map(dict_map)

### categoryName: string -> float  -- FeatureHasher

In [5]:
n_features = len(df.categoryName.unique())

In [6]:
from sklearn.feature_extraction import FeatureHasher

# Ensure everything is in string format for hashing.
categories = df.categoryName.astype(str)  

# Convert to column to list of lists for iterating w/ FeatureHasher
categories = [[category] for category in categories]

# FeatureHasher
hasher = FeatureHasher(n_features=n_features, input_type="string")
X_category = hasher.transform(categories).toarray().astype("float32")

# sparse matrix -> array -> DataFrame
hashed_df = pd.DataFrame(X_category, columns=[f"hash_{i}" for i in range(n_features)])

# Concatenate dataframes
data = pd.concat([df, hashed_df], axis=1)
data = data.drop(axis=1,columns="categoryName")

### title: string -> int -- Word2Vec

In [7]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [8]:
model_path = "word2vec_model.model"
model_w2v = Word2Vec.load(model_path)

# Define a function to get the word vectors for the first 5 words of a product name
def get_word_vectors(product_name):
    try:
        five_words = word_tokenize(product_name.lower())[:5]
        vectors = [model_w2v.wv[word] for word in five_words]
        
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan
    except KeyError:
        return np.nan      # Handle the case where a word is not in the vocabulary

data['average_vector'] = data['title'].apply(get_word_vectors)

# Expand the average vector into several columns
for i in range(model_w2v.vector_size):
    data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


In [9]:
numerical = data.drop(axis=1, columns=["title", "average_vector"])
numerical

Unnamed: 0,stars,reviews,price,isBestSeller,boughtInLastMonth,hash_0,hash_1,hash_2,hash_3,hash_4,...,embedding_dim_91,embedding_dim_92,embedding_dim_93,embedding_dim_94,embedding_dim_95,embedding_dim_96,embedding_dim_97,embedding_dim_98,embedding_dim_99,embedding_dim_100
0,4.4,394,62.00,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.784108,1.727504,0.398827,-0.347906,-0.105918,1.031320,-0.388690,0.067079,-0.492236,-0.779401
1,4.5,42,99.99,0,0,0.0,0.0,0.0,0.0,0.0,...,0.559722,-0.244209,-0.674413,-1.115070,0.446629,-1.400964,-0.645207,-0.634883,-1.278760,1.123274
2,4.0,1,5.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.491073,0.132468,0.966013,-0.762000,0.854790,-0.099999,-0.818676,1.121775,-0.140859,-1.006717
3,4.6,30,9.69,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.673670,-0.542926,-0.795402,-0.988001,-1.232087,-1.118050,-0.841062,-0.647553,0.511001,0.562902
4,4.7,86,59.00,0,0,0.0,0.0,0.0,0.0,0.0,...,1.572842,0.551809,-0.479322,-0.521322,-0.541257,-0.381233,-0.096763,-1.590582,-2.148561,-0.314447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048317,4.2,22,219.00,0,0,0.0,0.0,0.0,0.0,0.0,...,0.357379,-0.748968,0.548331,-1.006143,0.890936,-0.005505,0.080176,-1.411288,-0.289495,-0.674728
1048318,4.6,243,3.79,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.747892,0.094252,1.050392,-2.305690,-0.660265,-1.698762,0.359324,1.127983,-0.007587,0.243509
1048319,4.5,26,77.00,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.596305,0.534012,-0.544972,-0.914177,0.311290,0.725389,-1.185010,0.413854,-1.877134,-1.040140
1048320,4.2,697,23.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.215904,2.243337,-0.133761,-1.200215,1.309054,-0.861897,-0.981664,-1.352625,-0.055042,0.513641


In [10]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048322 entries, 0 to 1048321
Columns: 401 entries, stars to embedding_dim_100
dtypes: float32(396), float64(2), int64(3)
memory usage: 1.6 GB


## Data Split and Transformation

In [11]:
y = numerical.price.values
X = numerical

X_train = X[0:-40000].drop(axis=1,columns="price").values
y_train = y[0:-40000]

X_test = X[-40000:].drop(axis=1,columns="price").values
y_test = y[-40000:]

In [12]:
# Log transformation
y_train_transformed = np.log1p(y_train)  

## Modelling

In [13]:
# import modules
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet

### LinearRegression()

In [16]:
# Linear Regression
LR = LinearRegression()
LR.fit(X_train, y_train_transformed)

y_pred_LR = LR.predict(X_test)

# Inverse-transform to return values back to original scale
y_pred_LR = np.expm1(y_pred_LR)

Linear Regression: 24.6 seconds

### ElasticNet()

In [18]:
EN = ElasticNet()
EN.fit(X_train, y_train)

y_pred_EN = EN.predict(X_test)

### Comparison

In [20]:
compare = pd.DataFrame({"productName": df.title.iloc[-40000:],
                        "rating": df.stars.iloc[-40000:],
                        "Reviews": df.reviews.iloc[-40000:],
                        "actualPrice": y_test,
                        "y_pred_LR": y_pred_LR,
                        "y_pred_EN": y_pred_EN
                        }).reset_index()
compare

Unnamed: 0,index,productName,rating,Reviews,actualPrice,y_pred_LR,y_pred_EN
0,1008322,Direct Wicker Set of 2 43 x 43cm Waterproof St...,4.4,99,9.99,12.620329,-4.915920
1,1008323,Johnsons Seeds - Pictorial Pack - Flower - Cat...,4.2,57,2.65,6.136978,18.826689
2,1008324,"Elizabeth Arden Eight Hour SPF 15 Lip Stick, 3...",4.7,323,13.14,19.207453,48.496381
3,1008325,All Trade Direct 100 x Red 6.3mm Female Fully ...,4.6,80,4.13,6.170773,21.874062
4,1008326,ViVo © 12kg Concrete Black Heavy Duty Garden B...,4.0,37,21.99,41.870787,79.169934
...,...,...,...,...,...,...,...
39995,1048317,Ben Sayers Men's M8 Golf Package Set,4.2,22,219.00,44.471925,74.403187
39996,1048318,Grandson 9th Birthday Card & Badge - Gold Foil...,4.6,243,3.79,8.512100,49.231402
39997,1048319,Gravity LTS 01 B SET 1 - Adjustable Stand for ...,4.5,26,77.00,39.028568,48.266537
39998,1048320,Lottie Pool Party Doll | Bath | Fun Bathtub To...,4.2,697,23.99,15.609805,19.590886


## Evaluation

In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
mse = mean_squared_error(y_test, y_pred_LR)
print(f'Mean Squared Error LinearRegression: {mse}')

Mean Squared Error LinearRegression: 14582.455117647049


In [24]:
mse = mean_squared_error(y_test, y_pred_EN)
print(f'Mean Squared Error ElasticNet: {mse}')

Mean Squared Error ElasticNet: 14540.771295788732
