In [1]:
# imports
import pandas as pd, numpy as np

In [4]:
# data import
df = pd.read_csv("amazon_data.csv")
df = df.drop(columns=["asin", "imgUrl", "productURL"],axis=1)
df = df[df["reviews"] > 0]
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
1,"Anker Soundcore mini, Super-Portable Bluetooth...",4.7,98099,23.99,True,0,Hi-Fi Speakers
2,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
3,"Echo Dot with clock (5th generation, 2022 rele...",4.7,7205,31.99,False,0,Hi-Fi Speakers
4,Introducing Echo Pop | Full sound compact Wi-F...,4.6,1881,17.99,False,0,Hi-Fi Speakers
...,...,...,...,...,...,...,...
2222733,Speed MaxX BODY ARMOUR CE MOTORBIKE/MOTORCYCLE...,3.6,66,49.99,False,0,Motorbike Clothing
2222734,Motorcycle Clothing Suit - Motorbike Suit With...,4.1,5,158.99,False,0,Motorbike Clothing
2222735,GREAT BIKERS GEAR - Bobber Cafe Brat Style Lea...,3.6,12,14.99,False,0,Motorbike Clothing
2222740,Texpeed Mens Motorcycle Motorbike Biker Trouse...,4.3,404,79.99,False,0,Motorbike Clothing


## Pre-Processing

### Shuffle rows
Rows have to be shuffled otherwise there will be issues due the test-train split. Data is currently sorted by categoryName, this will cause problems later unless shuffled before modelling.

In [5]:
index_array = np.arange(len(df))
np.random.shuffle(index_array)

df = df.iloc[index_array].reset_index(drop=True)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,MEIRIYFA USB C to 6.35mm Male 1/4 TS Guitar Ca...,4.4,9,13.80,False,0,Guitars & Gear
1,"Domaier - Electric Knife Sharpener, Two Stage ...",4.1,280,34.99,False,0,Small Kitchen Appliances
2,RØDE Podcaster End-address Broadcast Dynamic U...,4.0,1,133.99,False,0,Microphones
3,Giorgio Armani SI Eau De Parfum Spray 50ml (1....,4.7,1313,73.60,False,100,Fragrances
4,4 Pack Christmas Battery Fairy Lights 5m Dark ...,4.4,155,29.99,False,0,String Lights
...,...,...,...,...,...,...,...
1048317,JOYIN MIlitary Vehicle Toy Set of Friction Pow...,4.4,582,22.99,False,0,Kids' Play Figures
1048318,Whiplash [DVD] [2015],4.7,1658,5.73,False,0,Portable Sound & Video Products
1048319,Just Treats White Chocolate Snowies (500g Shar...,4.6,922,9.79,False,200,Grocery
1048320,Tayo the little Bus Shooting Cars Tayo Rogi Ra...,4.8,11,29.90,False,0,Games & Game Accessories


### isBestSeller: bool -> int

In [6]:
dict_map = {True: 1, False: 0}
df['isBestSeller'] = df['isBestSeller'].map(dict_map)

### categoryName: string -> float  -- FeatureHasher

In [7]:
n_features = len(df.categoryName.unique())

In [8]:
from sklearn.feature_extraction import FeatureHasher

# Ensure everything is in string format for hashing.
categories = df.categoryName.astype(str)  

# Convert to column to list of lists for iterating w/ FeatureHasher
categories = [[category] for category in categories]

# FeatureHasher
hasher = FeatureHasher(n_features=n_features, input_type="string")
X_category = hasher.transform(categories).toarray().astype("float32")

# sparse matrix -> array -> DataFrame
hashed_df = pd.DataFrame(X_category, columns=[f"hash_{i}" for i in range(n_features)])

# Concatenate dataframes
data = pd.concat([df, hashed_df], axis=1)
data = data.drop(axis=1,columns="categoryName")

### title: string -> int -- Word2Vec

In [9]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

In [None]:
nltk.download('punkt')

# Tokenize product names
product_names = df['title']
tokenized_product_names = [word_tokenize(name.lower()) for name in product_names]

# Train Word2Vec model
model_w2v = Word2Vec(sentences=tokenized_product_names, vector_size=100, window=5, min_count=1, workers=4)

# Save model
model_w2v.save('word2vec_model.model')

In [11]:
model_path = "word2vec_model.model"
model_w2v = Word2Vec.load(model_path)

# Define a function to get the word vectors for the first 5 words of a product name
def get_word_vectors(product_name):
    try:
        five_words = word_tokenize(product_name.lower())[:5]
        vectors = [model_w2v.wv[word] for word in five_words]
        
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan
    except KeyError:
        return np.nan      # Handle the case where a word is not in the vocabulary

data['average_vector'] = data['title'].apply(get_word_vectors)

# Expand the average vector into several columns
for i in range(model_w2v.vector_size):
    data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


In [12]:
numerical = data.drop(axis=1, columns=["title", "average_vector"])
numerical

Unnamed: 0,stars,reviews,price,isBestSeller,boughtInLastMonth,hash_0,hash_1,hash_2,hash_3,hash_4,...,embedding_dim_91,embedding_dim_92,embedding_dim_93,embedding_dim_94,embedding_dim_95,embedding_dim_96,embedding_dim_97,embedding_dim_98,embedding_dim_99,embedding_dim_100
0,4.4,9,13.80,0,0,0.0,0.0,0.0,0.0,0.0,...,0.826105,1.156073,1.842180,0.316668,-1.054151,-0.643152,-0.430519,1.449162,-2.041727,-1.901704
1,4.1,280,34.99,0,0,0.0,0.0,0.0,0.0,0.0,...,0.765967,-0.260327,-0.557638,0.450153,-1.615695,-1.092276,0.047348,0.535927,0.516813,1.865460
2,4.0,1,133.99,0,0,0.0,0.0,0.0,0.0,0.0,...,0.105971,0.585576,-0.779985,-0.316929,-0.222126,0.947067,-0.985516,0.476288,0.233794,-1.786444
3,4.7,1313,73.60,0,100,0.0,0.0,0.0,0.0,0.0,...,-1.148339,0.189675,0.250219,-0.314164,2.997337,-0.560797,0.667171,1.111190,-0.438426,-0.762076
4,4.4,155,29.99,0,0,0.0,0.0,0.0,0.0,0.0,...,0.820917,0.443626,1.799702,-0.380921,0.057904,-1.012501,-0.324906,-1.019856,-0.505947,0.048398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048317,4.4,582,22.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.138903,-0.013354,1.061646,-0.272234,1.112503,-1.054935,-1.921008,-0.211019,-0.291236,0.842217
1048318,4.7,1658,5.73,0,0,0.0,0.0,0.0,0.0,0.0,...,0.991059,0.400077,1.824975,-0.355487,-1.803872,-2.186057,-0.196495,-0.343205,0.769424,-3.054296
1048319,4.6,922,9.79,0,200,0.0,0.0,0.0,0.0,0.0,...,-0.571287,0.188052,1.330669,-2.124945,-1.004256,-1.316806,0.324159,1.343417,-0.739022,1.038567
1048320,4.8,11,29.90,0,0,0.0,0.0,0.0,0.0,0.0,...,-1.884923,0.383576,0.856193,-1.387399,0.631696,-0.067903,-0.478689,1.642612,0.797117,0.332386


In [13]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048322 entries, 0 to 1048321
Columns: 401 entries, stars to embedding_dim_100
dtypes: float32(396), float64(2), int64(3)
memory usage: 1.6 GB


## Data Split and Transformation

In [15]:
y = numerical.price.values
X = numerical.drop(axis=1,columns="price").values

X_train = X[0:-40000]
y_train = y[0:-40000]

X_test = X[-40000:]
y_test = y[-40000:]

MemoryError: Unable to allocate 3.12 GiB for an array with shape (400, 1048322) and data type float64

In [None]:
# Log transformation
y_train_transformed = np.log1p(y_train)  

## Modelling

In [None]:
# import modules
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet

### LinearRegression()

In [None]:
# Linear Regression
LR = LinearRegression()
LR.fit(X_train, y_train_transformed)

y_pred_LR = LR.predict(X_test).astype("float32")

# Inverse-transform to return values back to original scale
y_pred_LR = np.expm1(y_pred_LR)

Linear Regression: 24.6 seconds

### Lasso()

In [None]:
# LASSO
dfLasso = pd.DataFrame()
for i in range(0,10):
    LASSO = Lasso(alpha=(i+1)/100)
    LASSO.fit(X_train, y_train)

    dfLasso[f'LASSO_{(i+1)/100}'] = LASSO.predict(X_test)

In [18]:
dfLasso

Unnamed: 0,LASSO_0.01,LASSO_0.02,LASSO_0.03,LASSO_0.04,LASSO_0.05,LASSO_0.06,LASSO_0.07,LASSO_0.08,LASSO_0.09,LASSO_0.1
0,95.107851,98.421068,97.072227,96.023394,95.228373,94.565014,94.377392,94.289977,94.181816,94.149741
1,148.934893,148.049036,147.025183,145.938631,144.890447,143.947455,143.143419,142.330235,141.528568,140.705328
2,66.613944,66.696903,66.722384,66.673511,66.677848,66.669763,66.762913,66.904085,66.978130,67.007151
3,117.774763,118.916877,120.060734,121.187462,122.309228,123.453565,124.578231,125.618049,126.683478,127.742552
4,121.103378,121.312502,121.396175,121.434040,121.385748,121.297185,121.440738,121.591548,121.685578,121.773124
...,...,...,...,...,...,...,...,...,...,...
79995,178.802110,176.253553,173.814913,171.330731,168.864698,166.360021,163.780091,161.194718,158.703330,156.152567
79996,10.759294,12.685708,14.694625,16.850160,19.062535,21.299442,20.216222,19.205420,18.369867,17.632388
79997,-50.216998,-48.457892,-46.255751,-43.930179,-41.391109,-38.691646,-36.015251,-33.284237,-30.696494,-28.094571
79998,150.201081,155.606233,160.612497,165.471357,170.371057,175.317317,180.123924,184.914092,188.422305,188.872843


### ElasticNet()

In [19]:
EN = ElasticNet()
EN.fit(X_train, y_train)

y_pred_EN = EN.predict(X_test)

### Comparison

In [25]:
compare = pd.DataFrame({"productName": df.title.iloc[-80000:],
                        "rating": df.stars.iloc[-80000:],
                        "Reviews": df.reviews.iloc[-80000:],
                        "actualPrice": y_test,
                        "y_pred_LR": y_pred_LR,
                        "y_pred_EN": y_pred_EN
                        }).reset_index()
compare

Unnamed: 0,index,productName,rating,Reviews,actualPrice,y_pred_LR,y_pred_EN
0,720000,"Sanderson Cluny Cushion, Floral Bird Print Pil...",0.0,0,32.00,25.706312,127.028894
1,720001,Premium Lightweight Golf Cart Bag Golf Stand B...,0.0,0,621.99,55.426556,157.941635
2,720002,Rogelli Women's Eabel Running Short Sleeve T-S...,3.0,2,30.49,27.303505,70.575540
3,720003,"Fishing Tackle Advent Calendar, Christmas Coun...",0.0,0,20.89,29.437006,138.597538
4,720004,NIKE Men's Df FLX T-Shirt,4.3,8,23.81,70.822723,85.640577
...,...,...,...,...,...,...,...
79995,799995,Patriot Supersonic Rage Pro 128GB USB 3.2 Gen ...,4.0,583,42.44,60.155571,141.663033
79996,799996,YanBan 2pcs Hot Tub Cartridge Filters Compatib...,4.4,69,7.99,13.884083,16.266421
79997,799997,ORANGE SPAIN - Prepaid SIM Card (Tu Mundo) 27 ...,3.9,422,16.19,22.144299,-4.098054
79998,799998,Tachometer Odometer Motorcycle Instrument C70 ...,0.0,0,30.40,52.938950,203.190187


## Evaluation

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_LR)
print(f'Mean Squared Error LinearRegression: {mse}')

Mean Squared Error LinearRegression: 98547.53338305709


In [28]:
for i in range(dfLasso.shape[1]):
    mse = mean_squared_error(y_test, dfLasso.iloc[:,i])
    print(f'Mean Squared Error LASSO alpha={(i+1)/100}: {mse}')

Mean Squared Error LASSO alpha=0.01: 88137.39987212194
Mean Squared Error LASSO alpha=0.02: 88147.0185215972
Mean Squared Error LASSO alpha=0.03: 88166.4894799096
Mean Squared Error LASSO alpha=0.04: 88191.8880527946
Mean Squared Error LASSO alpha=0.05: 88219.95570268591
Mean Squared Error LASSO alpha=0.06: 88251.76417008176
Mean Squared Error LASSO alpha=0.07: 88284.77816276558
Mean Squared Error LASSO alpha=0.08: 88322.43521133813
Mean Squared Error LASSO alpha=0.09: 88360.47979638947
Mean Squared Error LASSO alpha=0.1: 88401.53156854017
