## Data required by recommender
> in the format of: user item rating; tab-delimited, without header

> train/test: MinMax scaling or not; number of purchases or total spending in purchases; log transformation or not;

16 output files with naming convention:
     >   [scale_,][train_, test_][num, dol][_log,].txt
     

In [2]:
""" Retriving data from mongodb"""
import pandas as pd
from pymongo import MongoClient

def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to one database.

    Args:
        host: Hostname or IP address of the MongoDB server.
        port: Port of the MongoDB server.
        username: Login name; a falsy value connects without credentials.
        password: Login password; a falsy value connects without credentials.
        db: Name of the database to return. When credentials are supplied
            it is also placed in the path of the connection URI.

    Returns:
        The object obtained by indexing the ``MongoClient`` with ``db``.
    """
    if username and password:
        # Function-scope import keeps this helper self-contained.
        from urllib.parse import quote_plus

        # Credentials must be percent-escaped; otherwise characters such as
        # '@', ':' or '/' inside them corrupt the connection URI.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db]

def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Run ``find`` on a MongoDB collection and return the result as a DataFrame.

    Args:
        db: Name of the database.
        collection: Name of the collection to query.
        query: Filter document passed to ``find``. ``None`` (the default) is
            replaced by an empty filter ``{}``.
        host: Server host, forwarded to ``_connect_mongo``.
        port: Server port, forwarded to ``_connect_mongo``.
        username: Login name, forwarded to ``_connect_mongo``.
        password: Login password, forwarded to ``_connect_mongo``.
        no_id: When true, drop the ``_id`` column from the result if present.

    Returns:
        A pandas DataFrame built from the list of documents the cursor yields.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if query is None:
        query = {}
    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    cursor = database[collection].find(query)
    # list() drains the cursor, so the whole result set is held in memory.
    df = pd.DataFrame(list(cursor))
    # An empty result has no columns at all, so only drop `_id` when it exists.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

# Load the "Orders" collection of database "BT4221_DB" into a DataFrame,
# report its row count and preview two random rows.
orders = read_mongo(db="BT4221_DB", collection="Orders")
print(orders.shape[0])
orders.sample(n=2)

13382011


Unnamed: 0,HH_ID,CompanyID,OrderNum,OrderDate,Dollars,PaymentType,Channel
6467378,763686,744,4696493,20050322,158,H,C
12437255,2166067,54,13874484,20070521,160,,O


In [7]:
"""training data, first 30 months, HH_ID-CompanyID level data"""
# Keep orders dated on or before 20070630 and collapse them to one row per
# (HH_ID, CompanyID) pair: count of distinct OrderNum and sum of Dollars.
train = (
    orders.loc[orders["OrderDate"] <= 20070630]
    .groupby(["HH_ID", "CompanyID"])
    .agg({"OrderNum": "nunique", "Dollars": "sum"})
    .reset_index()
)
print(train.shape[0])
train.sample(n=1)

5157084


Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars
4138108,1682436,734,2,77


In [11]:
"""rating as log() """
import numpy as np
# Add natural-log versions of both rating measures as new columns of `train`.
# NOTE(review): the test-period cell further down uses log(x + 1) rather than
# log(x); confirm the two transforms are meant to differ.
for source_col, log_col in (("Dollars", "logDollar"), ("OrderNum", "logOrder")):
    train[log_col] = np.log(train[source_col])
train.sample(n=1)

Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars,logDollar,logOrder
3849536,1480751,798,1,156,5.049856,0.0


In [15]:
"""MinMax Scalling"""
from sklearn.preprocessing import MinMaxScaler
# Rescale the four rating columns of a copy of `train` with a MinMaxScaler
# configured for the range (1, 5); `train` itself is left untouched.
cols = ["OrderNum", "Dollars", "logOrder", "logDollar"]
scaler = MinMaxScaler(feature_range=(1, 5))
scale_train = train.copy()
# Fit on the rating columns, then transform those same columns.
scale_train[cols] = scaler.fit(train[cols]).transform(train[cols])
scale_train.sample(n=1)

Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars,logDollar,logOrder
2781550,925745,742,1.029126,1.007652,2.983882,2.293506


In [18]:
""" save as format required by recommender library"""
# `csv` provides the QUOTE_NONE constant used below. No other visible cell
# imports it, so import it here to keep this cell runnable after a restart.
import csv

# Write one tab-separated, header-less file per rating column, once from the
# raw frame and once from the MinMax-scaled frame ("scale_" prefix).
for col in ["OrderNum", "Dollars", "logOrder", "logDollar"]:
    cols = ["CompanyID", "HH_ID", col]
    # File name pieces are derived from the column name:
    # "...Dollar" -> "dol", otherwise "num"; "log..." adds the "_log" suffix.
    measure = "dol" if "Dollar" in col else "num"
    transformation = "_log" if "log" in col else ""
    file_name = "train_" + measure + transformation + ".txt"
    print(file_name)
    train[cols].to_csv(file_name, index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE)
    scale_train[cols].to_csv("scale_" + file_name, index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE)

train_num.txt
train_dol.txt
train_num_log.txt
train_dol_log.txt


In [19]:
""" Customers contributing to company 36's revenue in testing period """
company = 36
# Orders dated after 20070630 that belong to `company`, collapsed to one row
# per (HH_ID, CompanyID) pair: count of distinct OrderNum and sum of Dollars.
test = (
    orders.loc[(orders["OrderDate"] > 20070630) & (orders["CompanyID"] == company)]
    .groupby(["HH_ID", "CompanyID"])
    .agg({"OrderNum": "nunique", "Dollars": "sum"})
    .reset_index()
)
print(test.shape[0])
test.sample(n=1)

82328


Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars
33283,864034,36,5,553


In [41]:
""" Recomended Customers """
# Candidate households: every HH_ID present in `train` minus those that
# already have a `train` row for `company`.
recommended_hh = pd.DataFrame(
    {"HH_ID": list(set(train["HH_ID"]) - set(train.loc[train["CompanyID"] == company, "HH_ID"]))}
)
# The left join keeps every candidate; candidates without a row in `test`
# come out with NaN in the measure columns.
test_output = recommended_hh.merge(test, how="left", on=["HH_ID"])
test_output["HH_ID"] = test_output["HH_ID"].astype(int).astype("str")
test_output["CompanyID"] = str(company)
# Replace the NaN values with 0, then order rows by descending Dollars.
test_output = test_output.fillna(0)
test_output = test_output.sort_values(by="Dollars", ascending=False)
test_output = test_output.reset_index(drop=True)
print(test_output.shape[0])
test_output.head(n=3)

1950780


Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars
0,1913674,36,18.0,2323.0
1,1240677,36,4.0,1718.0
2,140008,36,3.0,1155.0


In [47]:
"""rating as log(measure+1) """
import numpy as np
# Add log(x + 1) versions of both rating measures to `test_output`.
# The + 1 keeps zero-valued measures finite (log(1) == 0 instead of -inf).
for source_col, log_col in (("Dollars", "logDollar"), ("OrderNum", "logOrder")):
    test_output[log_col] = np.log(test_output[source_col] + 1)
test_output.sample(n=1)

Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars,logDollar,logOrder
220296,1754861,36,0.0,0.0,0.0,0.0


In [48]:
"""MinMax Scalling """
from sklearn.preprocessing import MinMaxScaler
# Rescale the four rating columns of a copy of `test_output` with a
# MinMaxScaler configured for the range (1, 5).
# NOTE(review): a new scaler is fitted on the test frame here instead of
# reusing the one fitted on `train`, so scaled train and test ratings are not
# on a shared scale; confirm this is intended.
cols = ["OrderNum", "Dollars", "logOrder", "logDollar"]
scaler = MinMaxScaler(feature_range=(1, 5))
scale_test_output = test_output.copy()
scale_test_output[cols] = scaler.fit(test_output[cols]).transform(test_output[cols])
scale_test_output.sample(n=1)

Unnamed: 0,HH_ID,CompanyID,OrderNum,Dollars,logDollar,logOrder
1843894,883680,36,1.0,1.0,1.0,1.0


In [52]:
""" save to format required by recommender library"""
# `csv` provides the QUOTE_NONE constant used below. No other visible cell
# imports it, so import it here to keep this cell runnable after a restart.
import csv

# Write one tab-separated, header-less file per rating column, once from the
# raw test frame and once from the MinMax-scaled one ("scale_" prefix).
for col in ["OrderNum", "Dollars", "logOrder", "logDollar"]:
    cols = ["CompanyID", "HH_ID", col]
    # File name pieces are derived from the column name:
    # "...Dollar" -> "dol", otherwise "num"; "log..." adds the "_log" suffix.
    measure = "dol" if "Dollar" in col else "num"
    transformation = "_log" if "log" in col else ""
    file_name = "test_" + measure + transformation + ".txt"
    print(file_name)
    test_output[cols].to_csv(file_name, index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE)
    scale_test_output[cols].to_csv("scale_" + file_name, index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE)

test_num.txt
test_dol.txt
test_num_log.txt
test_dol_log.txt


In [53]:
""" save as format required by recommender library"""
# `csv` provides the QUOTE_NONE constant used below. No other visible cell
# imports it, so import it here to keep this cell runnable after a restart.
import csv

# Same export as the earlier training-file cell: one tab-separated,
# header-less file per rating column, from both `train` and `scale_train`
# ("scale_" prefix). Running it again overwrites the files of the same name.
for col in ["OrderNum", "Dollars", "logOrder", "logDollar"]:
    cols = ["CompanyID", "HH_ID", col]
    # File name pieces are derived from the column name:
    # "...Dollar" -> "dol", otherwise "num"; "log..." adds the "_log" suffix.
    measure = "dol" if "Dollar" in col else "num"
    transformation = "_log" if "log" in col else ""
    file_name = "train_" + measure + transformation + ".txt"
    print(file_name)
    train[cols].to_csv(file_name, index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE)
    scale_train[cols].to_csv("scale_" + file_name, index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE)

train_num.txt
train_dol.txt
train_num_log.txt
train_dol_log.txt
