In [1]:
# imports
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [3]:
# data import
df = pd.read_csv("amazon_data.csv")
df = df.drop(columns=["asin", "imgUrl", "productURL"],axis=1)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
1,"Anker Soundcore mini, Super-Portable Bluetooth...",4.7,98099,23.99,True,0,Hi-Fi Speakers
2,"Echo Dot (5th generation, 2022 release) | Big ...",4.7,15308,21.99,False,0,Hi-Fi Speakers
3,"Echo Dot with clock (5th generation, 2022 rele...",4.7,7205,31.99,False,0,Hi-Fi Speakers
4,Introducing Echo Pop | Full sound compact Wi-F...,4.6,1881,17.99,False,0,Hi-Fi Speakers
...,...,...,...,...,...,...,...
2222737,"Motorbike Armour, Motorbike Clothing, Cycling ...",0.0,0,22.34,False,0,Motorbike Clothing
2222738,PROFIRST Waterproof Motorcycle 2 Piece Ladies ...,0.0,0,97.99,False,0,Motorbike Clothing
2222739,Men’s Motorcycle Motorbike Biker Trousers Kevl...,0.0,0,52.99,False,0,Motorbike Clothing
2222740,Texpeed Mens Motorcycle Motorbike Biker Trouse...,4.3,404,79.99,False,0,Motorbike Clothing


## Pre-Processing

### Shuffle rows
Rows have to be shuffled otherwise there will be issues due the test-train split. Data is currently sorted by categoryName, this will cause problems later unless shuffled before modelling.

In [4]:
index_array = np.arange(len(df))
np.random.shuffle(index_array)

df = df.iloc[index_array].reset_index(drop=True)
df

Unnamed: 0,title,stars,reviews,price,isBestSeller,boughtInLastMonth,categoryName
0,Stainless steel necklace for women with red gl...,0.0,0,15.54,False,0,Handmade
1,SKYRICH HJTX14AHQ-FP Starter Battery,4.0,43,124.36,False,0,Motorbike Batteries
2,351 Unisex Crew Socks Mexico Hispanic Mexico I...,0.0,0,8.98,False,0,Sports & Outdoors
3,Ballet Dance Shoes Cotton Drawstring Bag (Medium),5.0,4,6.99,False,0,Handmade Gifts
4,"Screen Receiver Dongle, Firm ABS Display Recei...",0.0,0,19.39,False,0,Media Streaming Devices
...,...,...,...,...,...,...,...
2222737,WJJ Digital Golf Range Finder,0.0,0,85.05,False,0,Sports & Outdoors
2222738,USB C to 3.5mm Audio Aux Jack Cable 1.2m [HiFi...,4.3,1777,12.99,False,0,Hi-Fi & Home Audio Accessories
2222739,"Bimormat XL Gaming Map Mouse Mat, Large Mouse ...",4.7,36,6.99,False,0,"Keyboards, Mice & Input Devices"
2222740,Teddy Fleece Jacket Women's Long Colour Block ...,0.0,0,13.84,False,0,Sports & Outdoors


### Drop data
Unfortunately, my computer is unable to process the 2+ million rows of data; therefore, I will limit the data to 800,000 rows, randomly selected to prevent bias.

In [5]:
df = df.iloc[:800000,:]

### isBestSeller: bool -> int

In [6]:
dict_map = {True: 1, False: 0}
df['isBestSeller'] = df['isBestSeller'].map(dict_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['isBestSeller'] = df['isBestSeller'].map(dict_map)


### categoryName: string -> float  -- FeatureHasher

In [7]:
n_features = len(df.categoryName.unique())

In [8]:
from sklearn.feature_extraction import FeatureHasher

# Ensure everything is in string format for hashing.
categories = df.categoryName.astype(str)  

# Convert to column to list of lists for iterating w/ FeatureHasher
categories = [[category] for category in categories]

# FeatureHasher
hasher = FeatureHasher(n_features=n_features, input_type="string")
X_category = hasher.transform(categories).toarray().astype("float32")

# sparse matrix -> array -> DataFrame
hashed_df = pd.DataFrame(X_category, columns=[f"hash_{i}" for i in range(n_features)])

# Concatenate dataframes
data = pd.concat([df, hashed_df], axis=1)
data = data.drop(axis=1,columns="categoryName")

### title: string -> int -- Word2Vec

In [9]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

In [None]:
nltk.download('punkt')

# Tokenize product names
product_names = df['title']
tokenized_product_names = [word_tokenize(name.lower()) for name in product_names]

# Train Word2Vec model
model_w2v = Word2Vec(sentences=tokenized_product_names, vector_size=100, window=5, min_count=1, workers=4)

# Save model
model_w2v.save('Models/word2vec_model.model')

In [10]:
model_path = "Models/word2vec_model.model"
model_w2v = Word2Vec.load(model_path)

# Define a function to get the word vectors for the first 5 words of a product name
def get_word_vectors(product_name):
    try:
        five_words = word_tokenize(product_name.lower())[:5]
        vectors = [model_w2v.wv[word] for word in five_words]
        
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.nan
    except KeyError:
        return np.nan      # Handle the case where a word is not in the vocabulary

data['average_vector'] = data['title'].apply(get_word_vectors)

# Expand the average vector into several columns
for i in range(model_w2v.vector_size):
    data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)
  data[f'embedding_dim_{i + 1}'] = data['average_vector'].apply(lambda x: x[i] if isinstance(x, np.ndarray) else np.nan)


In [11]:
numerical = data.drop(axis=1, columns=["title", "average_vector"])
numerical

Unnamed: 0,stars,reviews,price,isBestSeller,boughtInLastMonth,hash_0,hash_1,hash_2,hash_3,hash_4,...,embedding_dim_91,embedding_dim_92,embedding_dim_93,embedding_dim_94,embedding_dim_95,embedding_dim_96,embedding_dim_97,embedding_dim_98,embedding_dim_99,embedding_dim_100
0,0.0,0,15.54,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.089945,-0.342366,0.175389,0.113756,-0.012288,-2.203947,-0.573082,-1.397789,0.591498,0.485868
1,4.0,43,124.36,0,0,0.0,0.0,0.0,0.0,0.0,...,0.176543,-0.040026,0.239494,0.772760,0.616561,-1.128274,1.480288,-1.645328,0.783598,-0.512152
2,0.0,0,8.98,0,0,0.0,0.0,0.0,0.0,0.0,...,-1.741854,-0.996599,-0.555048,-1.936778,1.694185,-0.825065,0.662055,-1.162920,-0.346586,-0.389268
3,5.0,4,6.99,0,0,0.0,0.0,0.0,0.0,0.0,...,0.896354,1.387792,-2.036050,0.492084,3.083260,-2.110203,0.155824,-1.655594,-1.259413,2.168156
4,0.0,0,19.39,0,0,0.0,0.0,0.0,0.0,0.0,...,1.182369,1.785287,0.845487,0.146539,-0.703357,2.441475,-1.069510,-0.390987,0.331024,-1.837203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,4.0,583,42.44,0,0,0.0,0.0,0.0,0.0,0.0,...,0.882507,-0.478334,0.408631,-0.554997,-1.186026,0.933150,-1.188907,0.954343,-0.095035,-0.756942
799996,4.4,69,7.99,0,0,0.0,0.0,0.0,0.0,0.0,...,-0.158506,0.473827,-0.183446,0.720124,-0.369839,1.050586,-1.233975,0.998851,-1.099051,0.615409
799997,3.9,422,16.19,0,0,0.0,0.0,0.0,0.0,0.0,...,0.528936,0.149583,1.647077,0.196005,-1.036805,-0.699227,-0.717188,-0.910288,0.438531,-0.114222
799998,0.0,0,30.40,0,0,0.0,0.0,0.0,0.0,0.0,...,-1.564633,0.079645,1.527272,0.603190,0.195488,0.018023,0.597337,-1.516656,1.816189,0.206976


In [12]:
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Columns: 401 entries, stars to embedding_dim_100
dtypes: float32(396), float64(2), int64(3)
memory usage: 1.2 GB


## Data Split and Transformation

In [13]:
y = numerical.price.values
X = numerical.drop(axis=1,columns="price").values

X_train = X[0:-80000]
y_train = y[0:-80000]

X_test = X[-80000:]
y_test = y[-80000:]

In [14]:
# Log transformation
y_train_transformed = np.log1p(y_train)  

## Modelling

In [15]:
# import modules
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

### LinearRegression()

In [16]:
# Linear Regression
LR = LinearRegression()
LR.fit(X_train, y_train_transformed)

y_pred_LR = LR.predict(X_test).astype("float32")

# Inverse-transform to return values back to original scale
y_pred_LR = np.expm1(y_pred_LR)

Linear Regression: 24.6 seconds

### Lasso()

In [17]:
# LASSO
dfLasso = pd.DataFrame()
for i in range(0,10):
    LASSO = Lasso(alpha=(i+1)/100)
    LASSO.fit(X_train, y_train)

    dfLasso[f'LASSO_{(i+1)/100}'] = LASSO.predict(X_test)

In [18]:
dfLasso

Unnamed: 0,LASSO_0.01,LASSO_0.02,LASSO_0.03,LASSO_0.04,LASSO_0.05,LASSO_0.06,LASSO_0.07,LASSO_0.08,LASSO_0.09,LASSO_0.1
0,95.107851,98.421068,97.072227,96.023394,95.228373,94.565014,94.377392,94.289977,94.181816,94.149741
1,148.934893,148.049036,147.025183,145.938631,144.890447,143.947455,143.143419,142.330235,141.528568,140.705328
2,66.613944,66.696903,66.722384,66.673511,66.677848,66.669763,66.762913,66.904085,66.978130,67.007151
3,117.774763,118.916877,120.060734,121.187462,122.309228,123.453565,124.578231,125.618049,126.683478,127.742552
4,121.103378,121.312502,121.396175,121.434040,121.385748,121.297185,121.440738,121.591548,121.685578,121.773124
...,...,...,...,...,...,...,...,...,...,...
79995,178.802110,176.253553,173.814913,171.330731,168.864698,166.360021,163.780091,161.194718,158.703330,156.152567
79996,10.759294,12.685708,14.694625,16.850160,19.062535,21.299442,20.216222,19.205420,18.369867,17.632388
79997,-50.216998,-48.457892,-46.255751,-43.930179,-41.391109,-38.691646,-36.015251,-33.284237,-30.696494,-28.094571
79998,150.201081,155.606233,160.612497,165.471357,170.371057,175.317317,180.123924,184.914092,188.422305,188.872843


### ElasticNet()

In [19]:
EN = ElasticNet()
EN.fit(X_train, y_train)

y_pred_EN = EN.predict(X_test)

### Comparison

In [25]:
compare = pd.DataFrame({"productName": df.title.iloc[-80000:],
                        "rating": df.stars.iloc[-80000:],
                        "Reviews": df.reviews.iloc[-80000:],
                        "actualPrice": y_test,
                        "y_pred_LR": y_pred_LR,
                        "y_pred_EN": y_pred_EN
                        }).reset_index()
compare

Unnamed: 0,index,productName,rating,Reviews,actualPrice,y_pred_LR,y_pred_EN
0,720000,"Sanderson Cluny Cushion, Floral Bird Print Pil...",0.0,0,32.00,25.706312,127.028894
1,720001,Premium Lightweight Golf Cart Bag Golf Stand B...,0.0,0,621.99,55.426556,157.941635
2,720002,Rogelli Women's Eabel Running Short Sleeve T-S...,3.0,2,30.49,27.303505,70.575540
3,720003,"Fishing Tackle Advent Calendar, Christmas Coun...",0.0,0,20.89,29.437006,138.597538
4,720004,NIKE Men's Df FLX T-Shirt,4.3,8,23.81,70.822723,85.640577
...,...,...,...,...,...,...,...
79995,799995,Patriot Supersonic Rage Pro 128GB USB 3.2 Gen ...,4.0,583,42.44,60.155571,141.663033
79996,799996,YanBan 2pcs Hot Tub Cartridge Filters Compatib...,4.4,69,7.99,13.884083,16.266421
79997,799997,ORANGE SPAIN - Prepaid SIM Card (Tu Mundo) 27 ...,3.9,422,16.19,22.144299,-4.098054
79998,799998,Tachometer Odometer Motorcycle Instrument C70 ...,0.0,0,30.40,52.938950,203.190187


## Evaluation

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_LR)
print(f'Mean Squared Error LinearRegression: {mse}')

Mean Squared Error LinearRegression: 98547.53338305709


In [28]:
for i in range(dfLasso.shape[1]):
    mse = mean_squared_error(y_test, dfLasso.iloc[:,i])
    print(f'Mean Squared Error LASSO alpha={(i+1)/100}: {mse}')

Mean Squared Error LASSO alpha=0.01: 88137.39987212194
Mean Squared Error LASSO alpha=0.02: 88147.0185215972
Mean Squared Error LASSO alpha=0.03: 88166.4894799096
Mean Squared Error LASSO alpha=0.04: 88191.8880527946
Mean Squared Error LASSO alpha=0.05: 88219.95570268591
Mean Squared Error LASSO alpha=0.06: 88251.76417008176
Mean Squared Error LASSO alpha=0.07: 88284.77816276558
Mean Squared Error LASSO alpha=0.08: 88322.43521133813
Mean Squared Error LASSO alpha=0.09: 88360.47979638947
Mean Squared Error LASSO alpha=0.1: 88401.53156854017
