# Data Preparation and Persistence
---
In this notebook, CSV data is prepared and then written to a database for persistence.

### Transactions_By_Dept Data
---
###### source: https://data.world/zpencer/transaction-itemset

In [95]:
import pandas as pd
from sqlalchemy import create_engine, BigInteger, Integer, String

# Load the raw CSV and give its four columns consistent names
transactions_by_dept_path = 'data/transactions_by_dept.csv'
transactions_df = pd.read_csv(transactions_by_dept_path)
transactions_df.columns = ['TransactionId', 'Department', 'ItemId', 'SalesUnits']

# Build a lookup from each distinct original TransactionId to a compact
# surrogate Id taken from the lookup's default integer index
transaction_id_map = pd.DataFrame(
    {'TransactionId': transactions_df['TransactionId'].unique()}
)
transaction_id_map['Id'] = transaction_id_map.index

# Swap the original TransactionId for the surrogate Id, then key the frame
# on (TransactionId, ItemId)
transactions_df = (
    transactions_df
    .merge(transaction_id_map, on='TransactionId')
    .drop('TransactionId', axis='columns')
    .rename(columns={'Id': 'TransactionId'})
    .set_index(['TransactionId', 'ItemId'])
)

# Data is now ready to be written to database for persistence

Unnamed: 0_level_0,Unnamed: 1_level_0,Department,SalesUnits
TransactionId,ItemId,Unnamed: 2_level_1,Unnamed: 3_level_1
0,250,0261:HOSIERY,2
0,102,0634:VITAMINS & HLTH AIDS,1
0,158,0879:PET SUPPLIES,2
0,175,0973:CANDY,2
0,176,0982:SPIRITS,1


In [97]:
# Connection settings for the SQL Server database on this machine
host = 'localhost'
database = 'data'
driver = 'SQL+Server+Native+Client+11.0' # ODBC driver name ('+' stands in for spaces); found in ODBC Data Source Administrator app
connection_url = "mssql+pyodbc://{0}/{1}?driver={2}".format(host, database, driver)
engine = create_engine(connection_url)

# Destination schema and table
schema = 'dbo'
table_name = 'TransactionsByDept'

# Explicit SQL column types for the index levels and data columns
# (not always necessary)
data_types = {
    'TransactionId': BigInteger,
    'ItemId': BigInteger,
    'Department': String,
    'SalesUnits': BigInteger,
}

# Persist the frame, writing both index levels as ordinary columns.
# if_exists='replace' means an existing table of the same name is replaced.
transactions_df.to_sql(
    name=table_name,
    con=engine,
    schema=schema,
    index=True,
    index_label=['TransactionId', 'ItemId'],
    if_exists='replace',
    dtype=data_types,
)

### Extended_Bakery Data
---
###### source: https://github.com/Stocco/Data-Mining-the-Extended-Bakery

In [None]:
import pandas as pd

# Location of the Extended_Bakery CSV.
# TODO: this cell only defines the path; the data is not loaded or persisted yet.
extended_bakery_path = 'data/75000-out1.csv'


### Order_Products Data
---
###### source: https://www.kaggle.com/c/instacart-market-basket-analysis/data

In [None]:
# File paths for the Order_Products CSVs (prior orders, train orders, products).
# TODO: this cell only defines the paths; nothing is loaded or persisted yet.
order_products_prior_path = 'data/order_products__prior.csv'
order_products_train_path = 'data/order_products__train.csv'
products_path = 'data/products.csv'