## Для загрузки датасетов из Kaggle по API необходимо положить файл kaggle.json в домашний каталог пользователя
Команды ниже создают каталог ~/.kaggle, копируют в него kaggle.json и оставляют права на чтение и запись только владельцу файла.

In [9]:
# Create the per-user Kaggle config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the API token from the home directory into that config directory.
!cp ~/kaggle.json ~/.kaggle/
# Restrict the token to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

## Импорт библиотек и скачивание архива с датасетом

In [8]:
import kaggle.cli
import sys
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
from datetime import date, datetime

dataset = "vivek468/superstore-dataset-final"
# The archive is named after the dataset slug (the part after the owner);
# derive it once so the extract and read steps cannot drift apart.
archive_name = f"{dataset.split('/')[1]}.zip"

# kaggle.cli.main() takes its arguments from sys.argv, so substitute them
# temporarily and always restore the interpreter's real argv afterwards.
saved_argv = sys.argv
sys.argv = [sys.argv[0]] + f"datasets download {dataset}".split(" ")
try:
    kaggle.cli.main()
finally:
    sys.argv = saved_argv

# The context manager closes the archive even if extraction fails.
with ZipFile(archive_name) as zfile:
    zfile.extractall()

# The frame is read from the zip archive itself (same source as before).
# NOTE(review): encoding_errors='ignore' silently drops undecodable bytes;
# confirm the file's real encoding and pass `encoding=` instead if possible.
data = pd.read_csv(archive_name, encoding_errors='ignore')

Downloading superstore-dataset-final.zip to /Users/alexey/Documents/Repositories/GitHub/search_centriods


100%|██████████| 550k/550k [00:01<00:00, 533kB/s]





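## Приводим названия столбцов к нижнему регистру, а поля с датами — к типу datetime
Пробелы в названиях столбцов заменяем на подчёркивания. Затем отобразим несколько строк датасета и его описательную статистику.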

In [10]:
# Normalize column names: lower-case, with spaces replaced by underscores.
data.columns = data.columns.str.lower().str.replace(' ', '_', regex=False)

# Parse the month/day/year date strings in one vectorized pass per column.
# Unlike a per-row strptime, missing values become NaT instead of raising.
# The explicit astype keeps the nanosecond dtype the original code produced.
for date_column in ('order_date', 'ship_date'):
    data[date_column] = pd.to_datetime(
        data[date_column], format="%m/%d/%Y"
    ).astype('datetime64[ns]')

# Show a short preview; describe() is the cell's last expression, so the
# notebook renders the summary statistics as the cell output.
display(data.head(3))
data.describe()

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714


Unnamed: 0,row_id,postal_code,sales,quantity,discount,profit
count,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55190.379428,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


## Проставим широту и долготу, зная город и страну

In [None]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# geopy requires an application-specific user agent for Nominatim.
geolocator = Nominatim(user_agent="superstore-centroids-notebook")
# Throttle to one request per second instead of firing requests back to back.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Geocode each distinct (city, country) pair once and reuse the result for
# every row, instead of issuing two requests per row.
# NOTE(review): city names can repeat within a country; if the dataset has a
# state/region column, adding it to the query would disambiguate - confirm.
coordinates = {}
for city, country in zip(data['city'], data['country']):
    if (city, country) in coordinates:
        continue
    # Put the country into the query text itself: geocode()'s second
    # argument is `exactly_one`, not a country.
    location = geocode(f"{city}, {country}")
    if location is None:
        # Unknown place: store NaN rather than failing on `None.latitude`.
        coordinates[(city, country)] = (float('nan'), float('nan'))
    else:
        coordinates[(city, country)] = (location.latitude, location.longitude)

row_keys = list(zip(data['city'], data['country']))
data['latitude'] = [coordinates[key][0] for key in row_keys]
data['longitude'] = [coordinates[key][1] for key in row_keys]




## Сохраним обработанные данные

In [None]:
# Preview the first three rows, then write the processed frame to a CSV file
# in the working directory (to_csv defaults are used, so the index is written).
output_csv = 'superstore-dataset-processed.csv'
display(data.head(3))
data.to_csv(output_csv)