<a href="https://colab.research.google.com/github/gr3g-z/Data_Analysis/blob/main/StockX_Study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p align="center">
<img src = "https://upload.wikimedia.org/wikipedia/commons/9/95/Stockx_logo.png" width="15%"/>
</p>

# StockX Data Analysis

## Objectives:
* What shoes are most popular?
* Which shoes have the best/worst profit margins?
* What factors affect profit margin?
* Is it possible to predict the sale price of a shoe at a given time?

### About Dataset:
- The dataset currently consists of a single file of sales provided by StockX: ~100,000 shoe sales (99,956 rows) from 50 different models (Nike x Off-White and Yeezy).

-----

In [1]:
# libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [2]:
# Mount Google Drive at /content/drive; the CSV path used by this notebook lives under that mount.
# google.colab is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by pandas and plotly;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [4]:
# Location of the StockX sales CSV inside the mounted Google Drive.
path = '/content/drive/MyDrive/csv_docs/StockX-Data-Contest-2019-3.csv'

In [5]:
# Load the raw sales table, relying on pandas' default type inference for every column.
dataframe = pd.read_csv(path)

In [6]:
dataframe # raw data

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,"$1,097",$220,9/24/16,11.0,California
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,$685,$220,11/23/16,11.0,California
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,$690,$220,11/23/16,11.0,California
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,"$1,075",$220,11/23/16,11.5,Kentucky
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,$828,$220,2/11/17,11.0,Rhode Island
...,...,...,...,...,...,...,...,...
99951,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$565,$220,12/26/18,8.0,Oregon
99952,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$598,$220,12/26/18,8.5,California
99953,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$605,$220,12/26/18,5.5,New York
99954,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,$650,$220,12/26/18,11.0,California


In [7]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99956 entries, 0 to 99955
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Order Date    99956 non-null  object 
 1   Brand         99956 non-null  object 
 2   Sneaker Name  99956 non-null  object 
 3   Sale Price    99956 non-null  object 
 4   Retail Price  99956 non-null  object 
 5   Release Date  99956 non-null  object 
 6   Shoe Size     99956 non-null  float64
 7   Buyer Region  99956 non-null  object 
dtypes: float64(1), object(7)
memory usage: 6.1+ MB


In [8]:
# Strip the currency formatting ("$1,097" -> "1097") so the column can be cast to int.
# regex=False makes '$' and ',' literal characters on every pandas version; read as a
# regular expression, '$' is the end-of-string anchor rather than a dollar sign.
dataframe['Sale Price'] = (
    dataframe['Sale Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [9]:
# Strip the currency formatting from the retail price, mirroring the Sale Price cleanup.
# regex=False keeps '$' a literal character (as a regex it is the end-of-string anchor);
# the thousands separator is removed too so a price of 1,000 or more still casts to int.
dataframe['Retail Price'] = (
    dataframe['Retail Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [10]:
dataframe

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097,220,9/24/16,11.0,California
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685,220,11/23/16,11.0,California
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690,220,11/23/16,11.0,California
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075,220,11/23/16,11.5,Kentucky
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828,220,2/11/17,11.0,Rhode Island
...,...,...,...,...,...,...,...,...
99951,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,565,220,12/26/18,8.0,Oregon
99952,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,598,220,12/26/18,8.5,California
99953,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,605,220,12/26/18,5.5,New York
99954,2/13/19,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,650,220,12/26/18,11.0,California


In [11]:
# Cast the cleaned retail-price strings to integers so they can be used numerically.
dataframe['Retail Price'] = dataframe['Retail Price'].astype(int)

In [12]:
# Cast the cleaned sale-price strings to integers so they can be used numerically.
dataframe['Sale Price'] = dataframe['Sale Price'].astype(int)

In [13]:
# Parse the month/day/two-digit-year release dates into datetime64 values.
# errors='coerce' turns any value that does not match the format into NaT instead of raising,
# so a malformed date would pass silently; check for NaT afterwards.
dataframe['Release Date'] = pd.to_datetime(dataframe['Release Date'], format='%m/%d/%y', errors='coerce')

In [14]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99956 entries, 0 to 99955
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Order Date    99956 non-null  object        
 1   Brand         99956 non-null  object        
 2   Sneaker Name  99956 non-null  object        
 3   Sale Price    99956 non-null  int64         
 4   Retail Price  99956 non-null  int64         
 5   Release Date  99956 non-null  datetime64[ns]
 6   Shoe Size     99956 non-null  float64       
 7   Buyer Region  99956 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 6.1+ MB


In [15]:
dataframe["Sale Price"].mean() #mean

446.63471927648163

In [16]:
# Build the working frame: a new DataFrame with the order date removed.
# NOTE(review): the stated objective of predicting the price "at a given time" probably
# needs the order date; confirm that dropping it is intended.
df = dataframe.drop(columns=['Order Date'])

-----

#### Clean DataFrame

In [17]:
df # clean dataset

Unnamed: 0,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097,220,2016-09-24,11.0,California
1,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685,220,2016-11-23,11.0,California
2,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690,220,2016-11-23,11.0,California
3,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075,220,2016-11-23,11.5,Kentucky
4,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828,220,2017-02-11,11.0,Rhode Island
...,...,...,...,...,...,...,...
99951,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,565,220,2018-12-26,8.0,Oregon
99952,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,598,220,2018-12-26,8.5,California
99953,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,605,220,2018-12-26,5.5,New York
99954,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,650,220,2018-12-26,11.0,California


In [18]:
# Sale-price distribution per brand, with one coloured box per sneaker model.
fig = px.box(
    df,
    x='Brand',
    y="Sale Price",
    color="Sneaker Name",
    width=1500,
)
fig

In [19]:
# Every individual sale plotted per sneaker model, coloured by brand.
fig = px.scatter(
    df,
    x='Sneaker Name',
    y="Sale Price",
    color="Brand",
    height=800,
)
fig

In [20]:
# Sale-price distribution per brand, with one coloured box per retail price point.
fig = px.box(
    df,
    x='Brand',
    y="Sale Price",
    color="Retail Price",
)
fig

In [21]:
# TODO: heat map of the regions with the most purchases

### Price Prediction

In [22]:
df.head()

Unnamed: 0,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097,220,2016-09-24,11.0,California
1,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685,220,2016-11-23,11.0,California
2,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690,220,2016-11-23,11.0,California
3,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075,220,2016-11-23,11.5,Kentucky
4,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828,220,2017-02-11,11.0,Rhode Island


In [37]:
def fatorizar(df):
    """Replace every object-dtype column of ``df`` with integer category codes.

    Each object column is encoded independently with ``pd.factorize``: distinct
    values are numbered 0, 1, 2, ... in order of first appearance, and missing
    values receive the sentinel code -1.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same ``df`` object, with its object columns replaced by integer
        codes. Columns of any other dtype are left untouched, and a frame with
        no object columns is returned unchanged.
    """
    object_columns = df.select_dtypes(['object']).columns
    for column in object_columns:
        # Assign the raw code array so the codes are written positionally and
        # do not depend on index alignment (the frame's index may not be 0..n-1).
        df[column] = pd.factorize(df[column])[0]
    return df

In [39]:
# Keep a reference to the sneaker-name column.
# NOTE(review): presumably saved so the names can be recovered after the object columns
# are factorized; `aux` is not used in the visible cells, so confirm.
aux = df['Sneaker Name']

In [25]:
# features: drop the order date and Buyer Region
# target: resale price

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# X holds the attributes considered relevant for the prediction;
# y is the target we want to predict (the resale price).

# 'Sale Price' is dropped from X as well: leaving the target among the features
# leaks the answer into the model and makes any fitted score meaningless.
X = df.drop(['Brand', 'Buyer Region', 'Retail Price', 'Sneaker Name', 'Sale Price'], axis=1)  # Example; consider including more relevant attributes
# A linear model needs numeric inputs, so encode the release date as whole days since 1970-01-01.
X['Release Date'] = (X['Release Date'] - pd.Timestamp('1970-01-01')).dt.days
y = df['Sale Price']

In [27]:
# Split the data into training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create and train the linear regression model
#model = LinearRegression()
#model.fit(X_train, y_train)

# Run the prediction
#y_pred = model.predict(X_test)

# Evaluate the model's performance
#mse = mean_squared_error(y_test, y_pred)
#print(f'MSE: {mse}')

# You can now use 'model' to make predictions on new data