In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error


In [10]:
# Load the cleaned order records. index_col=False tells pandas not to use
# the first CSV column as the row index.
df = pd.read_csv(
    "clean_data.csv",
    index_col=False,
)

In [11]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
df.shape

(185950, 6)

In [12]:
# Convert the 'Order Date' column to pandas datetimes so that date parts
# (year, month, day, ...) can be read through the .dt accessor.
df['Order Date'] = df['Order Date'].pipe(pd.to_datetime)

In [13]:
def augmented_data(data):
    """Add calendar, revenue and city columns to an orders frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a datetime-typed 'Order Date' column (the ``.dt``
        accessor is used on it), 'Quantity Ordered' and 'Price Each'
        columns that can be multiplied together, and a 'Purchase Address'
        column of strings laid out as "street, city, ST zip".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. The columns 'year', 'month',
        'day', 'dayofweek', 'sales' (quantity * unit price) and 'cities'
        (formatted as "City (ST)") are added to it in place.

    Raises
    ------
    IndexError
        If an address has fewer than three comma-separated parts, or its
        third part contains no state token.
    """
    def city_label(address):
        # Split once and strip every piece: the text after each comma starts
        # with a space, which previously leaked into the city name
        # (" Seattle") and was silently relied on to locate the state.
        parts = [part.strip() for part in address.split(',')]
        state = parts[2].split()[0]
        return f"{parts[1]} ({state})"

    order_dates = data['Order Date'].dt
    data['year'] = order_dates.year
    data['month'] = order_dates.month
    data['day'] = order_dates.day
    data['dayofweek'] = order_dates.dayofweek
    data['sales'] = data['Quantity Ordered'] * data['Price Each']

    data['cities'] = data['Purchase Address'].apply(city_label)

    return data


# Derive the date-part, sales and city columns, then preview the first rows.
sales_data = df.pipe(augmented_data)
sales_data.head()


Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,year,month,day,dayofweek,sales,cities
0,236670,Wired Headphones,2,11.99,2019-08-31 22:21:00,"359 Spruce St, Seattle, WA 98101",2019,8,31,5,23.98,Seattle (WA)
1,236671,Bose SoundSport Headphones,1,99.99,2019-08-15 15:11:00,"492 Ridge St, Dallas, TX 75001",2019,8,15,3,99.99,Dallas (TX)
2,236672,iPhone,1,700.0,2019-08-06 14:40:00,"149 7th St, Portland, OR 97035",2019,8,6,1,700.0,Portland (OR)
3,236673,AA Batteries (4-pack),2,3.84,2019-08-29 20:59:00,"631 2nd St, Los Angeles, CA 90001",2019,8,29,3,7.68,Los Angeles (CA)
4,236674,AA Batteries (4-pack),2,3.84,2019-08-15 19:53:00,"736 14th St, New York City, NY 10001",2019,8,15,3,7.68,New York City (NY)


In [14]:
# Keep every order whose year is not 2020.
# NOTE(review): presumably needed because the later grouping keys on
# month/day only, so 2020 rows would be merged into 2019 buckets - confirm.
sales_data = sales_data.loc[sales_data['year'].ne(2020)]
sales_data

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,year,month,day,dayofweek,sales,cities
0,236670,Wired Headphones,2,11.99,2019-08-31 22:21:00,"359 Spruce St, Seattle, WA 98101",2019,8,31,5,23.98,Seattle (WA)
1,236671,Bose SoundSport Headphones,1,99.99,2019-08-15 15:11:00,"492 Ridge St, Dallas, TX 75001",2019,8,15,3,99.99,Dallas (TX)
2,236672,iPhone,1,700.00,2019-08-06 14:40:00,"149 7th St, Portland, OR 97035",2019,8,6,1,700.00,Portland (OR)
3,236673,AA Batteries (4-pack),2,3.84,2019-08-29 20:59:00,"631 2nd St, Los Angeles, CA 90001",2019,8,29,3,7.68,Los Angeles (CA)
4,236674,AA Batteries (4-pack),2,3.84,2019-08-15 19:53:00,"736 14th St, New York City, NY 10001",2019,8,15,3,7.68,New York City (NY)
...,...,...,...,...,...,...,...,...,...,...,...,...
185945,319666,Lightning Charging Cable,1,14.95,2019-12-11 20:58:00,"14 Madison St, San Francisco, CA 94016",2019,12,11,2,14.95,San Francisco (CA)
185946,319667,AA Batteries (4-pack),2,3.84,2019-12-01 12:01:00,"549 Willow St, Los Angeles, CA 90001",2019,12,1,6,7.68,Los Angeles (CA)
185947,319668,Vareebadd Phone,1,400.00,2019-12-09 06:43:00,"273 Wilson St, Seattle, WA 98101",2019,12,9,0,400.00,Seattle (WA)
185948,319669,Wired Headphones,1,11.99,2019-12-03 10:39:00,"778 River St, Dallas, TX 75001",2019,12,3,1,11.99,Dallas (TX)


In [15]:
# Total units ordered per product for each (month, day) pair, flattened back
# into ordinary columns.
grouped_data = (
    sales_data
    .groupby(by=['month', 'day', 'Product'])['Quantity Ordered']
    .sum()
    .reset_index()
)

grouped_data


Unnamed: 0,month,day,Product,Quantity Ordered
0,1,1,20in Monitor,4
1,1,1,27in 4K Gaming Monitor,7
2,1,1,27in FHD Monitor,8
3,1,1,34in Ultrawide Monitor,18
4,1,1,AA Batteries (4-pack),33
...,...,...,...,...
6807,12,31,ThinkPad Laptop,26
6808,12,31,USB-C Charging Cable,103
6809,12,31,Vareebadd Phone,10
6810,12,31,Wired Headphones,90


In [18]:
# Give every product a 1-based integer code in order of first appearance.
# Built with enumerate() so that no loop counter named `id` is left behind
# shadowing the builtin id() for the rest of the notebook.
product_id = {
    name: code
    for code, name in enumerate(grouped_data['Product'].unique(), start=1)
}

grouped_data['product_id'] = grouped_data['Product'].map(product_id)


In [24]:
# Features: the encoded product plus the calendar month and day.
# Target: the quantity ordered for that product on that day.

X = grouped_data.loc[:, ['product_id', 'month', 'day']]
y = grouped_data.loc[:, 'Quantity Ordered']

In [30]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
)

In [31]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,product_id,month,day
1677,7,4,1
2982,19,6,9
118,6,1,7
2699,16,5,25
3145,14,6,18
...,...,...,...
5699,16,11,2
2550,19,5,17
537,4,1,30
1220,18,3,7


In [32]:
# fit model on training data
model = XGBRegressor()
model.fit(X_train, y_train)


In [33]:
# Predict quantities for the held-out rows, then round each prediction to
# the nearest whole number of units.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))


In [35]:
# Evaluate predictions.
# Exact-match rate of the rounded predictions. This is a classification
# metric: it scores a prediction that is off by one unit the same as one
# that is off by a hundred, so it is kept only for comparison with earlier
# runs.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# The model is a regressor, so also report the size of the error: RMSE of
# the raw (unrounded) predictions, in the same units as 'Quantity Ordered'.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %.2f" % rmse)

# TODO: try different models (neural networks, etc.)

Accuracy: 9.16%
