In [None]:
import numpy as np
import pandas as pd
import string
import re
from glob import glob
from functools import reduce
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import fetcher as ft
import slreg as slr


# Shared NLTK normalizers, built once at import time and reused by
# ``process_string`` for every token it stems and lemmatizes.
sno = SnowballStemmer(language='english')
wnl = WordNetLemmatizer()


def process_string(s):
    """Normalize a Series of free text into space-separated stemmed terms.

    Each string element is stripped, lower-cased, cleared of control
    whitespace, digits and punctuation, split on spaces, filtered against the
    NLTK English stop words, then stemmed (``sno``) and lemmatized (``wnl``).
    Duplicate terms are dropped.

    Parameters
    ----------
    s : Series
        Raw text, one document per element.  Non-string elements (e.g. NaN)
        are mapped to the empty string.

    Returns
    -------
    processed : Series
        Same index as ``s``; each element is the unique terms joined by a
        single space, in order of first occurrence.
    """
    # Patterns are applied with ``re`` directly rather than through
    # ``Series.str.replace`` so they are always treated as regular
    # expressions, independent of the pandas default for ``regex``.
    control_chars = re.compile(r'[\t\n\r\f\v]')
    digits = re.compile(r'\d+')
    # capture commas followed by any number of spaces
    comma_run = re.compile(r', *')
    space_run = re.compile(r' +')
    punct_table = str.maketrans({x: None for x in string.punctuation})
    # Look the stop words up once per call instead of once per token.
    stop_words = set(stopwords.words('english'))

    def normalize(text):
        if not isinstance(text, str):
            return ''
        text = text.strip().lower()
        text = control_chars.sub('', text)
        text = digits.sub('', text)
        text = comma_run.sub(' ', text)
        text = text.translate(punct_table)
        # apply SnowballStemmer then WordNetLemmatizer to singularize missed
        # words; empty tokens (left by leading/trailing spaces) are not terms.
        terms = (wnl.lemmatize(sno.stem(tok))
                 for tok in space_run.split(text)
                 if tok and tok not in stop_words)
        # dict.fromkeys de-duplicates like a set but keeps a stable order.
        return ' '.join(dict.fromkeys(terms))

    # A plain element-wise apply also works when ``s`` is not object dtype
    # (e.g. a float Series holding only NaN), where ``.str`` would raise.
    return s.apply(normalize)


def n_common_terms(string, term_list):
    """Count the distinct space-separated tokens of ``string`` found in ``term_list``.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by single spaces.
    term_list : iterable of str
        Terms to look for.

    Returns
    -------
    int
        Size of the intersection between the token set and the term set.
    """
    tokens = set(string.split(' '))
    shared = tokens.intersection(term_list)
    return len(shared)


def compatibility(s, term_list, func='mean'):
    """Score each element of ``s`` by its term overlap with ``term_list``.

    Every element of ``s`` is a ``'|'``-separated list of texts.  Each text is
    scored with ``n_common_terms`` against ``term_list`` and the scores of one
    element are reduced to a single number.

    Parameters
    ----------
    s : Series
        ``'|'``-delimited texts, one element per row.  Non-string elements
        (e.g. NaN) contribute no scores, so their result is NaN.
    term_list : iterable of str
        Terms to match against.
    func : {'mean', 'max', 'min'}
        Row-wise reduction applied to the per-text scores.

    Returns
    -------
    comp : Series
        One reduced score per element of ``s``, sharing its index.

    Raises
    ------
    KeyError
        If ``func`` is not one of the supported reductions.
    """
    # The scores form a (ragged, NaN-padded) DataFrame, so the reductions are
    # the DataFrame ones rather than the Series ones.
    reducer = {'mean': pd.DataFrame.mean,
               'max': pd.DataFrame.max,
               'min': pd.DataFrame.min}[func]
    rows = [[n_common_terms(text, term_list) for text in cell.split('|')]
            if isinstance(cell, str) else []
            for cell in s]
    counts = pd.DataFrame(rows, index=s.index)
    return reducer(counts, axis=1)

In [None]:
# Locations of the three input tables (machine-specific absolute paths).
_INQUIRY_CSV = 'C:/Users/2093/Desktop/Data Center/03. Data/05. TAITRA/FCRM/data/inquiry_compressed_with_country.csv'
_CATALOGUE_CSV = 'C:/Users/2093/Desktop/Data Center/03. Data/05. TAITRA/TT/processed_ctlg.csv'
_EXPORT_CSV = 'C:/Users/2093/Desktop/Data Center/03. Data/05. TAITRA/TT/export_compressed.csv'

# Buyer inquiries, indexed by their (parsed) creation date.
inqs = pd.read_csv(
    _INQUIRY_CSV,
    index_col='creation_date',
    parse_dates=True,
    encoding='utf-8',
    dtype={'code_val': str, 'dept': 'category'},
)

# Supplier catalogue, indexed by ``ban``.
# NOTE(review): ``ban`` is read as str for ``ex`` below but gets no dtype
# here -- confirm both indexes end up comparable downstream.
ctlg = pd.read_csv(
    _CATALOGUE_CSV,
    index_col='ban',
    parse_dates=['mod_date'],
    dtype={'code_val': str},
)

# Export records, indexed by ``ban`` (kept as a string).
ex = pd.read_csv(
    _EXPORT_CSV,
    index_col='ban',
    dtype={'ban': str},
)

In [None]:
def compute_features(inq):
    """Return DataFrame of calculated features.

    Parameters
    ----------
    inq : Series
        A single buyer inquiry.  The fields ``code_val``, ``prod_name``,
        ``prod_desc`` and ``buyer_country`` are read.

    Returns
    -------
    features : DataFrame
        One row per supplier returned by ``ft.fetch_suppliers``.  Currently
        there are 28 features:

        1) ``n_items`` : number of items in the supplier's catalogue.

        2) ``{type_}comp_{combination}`` : product compatibility. There are
           three types (mean -- no prefix, ``max_``, ``min_``) and eight
           possible combinations: a supplier text column (``i`` item name,
           ``d`` item description, ``k`` keyword, ``h`` commodity name)
           followed by an inquiry field (``p`` product name, ``d`` product
           description).

        3) ``recency`` : date difference between last modified date and today.

        4) ``n_comms`` : number of unique commodities exported by the supplier
           (with non-empty description).

        5) ``isexporter`` : whether the supplier has shipped to buyer's
           country in recent years.

        ``n_items``, ``recency``, ``n_comms`` and ``isexporter`` are copied
        unchanged from the supplier/export tables.
    """
    code, ctry = inq['code_val'], inq['buyer_country']
    # ``split()`` (rather than ``split(' ')``) yields no terms at all for an
    # empty inquiry field, instead of the single bogus term ''.
    prod_ls = process_string(pd.Series(inq['prod_name']))[0].split()
    desc_ls = process_string(pd.Series(inq['prod_desc']))[0].split()

    supp = ft.fetch_suppliers(ctlg, code)
    export = ft.fetch_export(ex, supp.index, ctry)
    supp = pd.concat([supp, export], axis=1)

    # Supplier text columns and the letter that identifies them in the
    # feature names.
    text_tags = {'item_name': 'i', 'item_desc': 'd', 'keyword': 'k',
                 'comm_name': 'h'}
    inquiry_terms = (('p', prod_ls), ('d', desc_ls))
    reductions = (('comp', 'mean'), ('max_comp', 'max'), ('min_comp', 'min'))
    source_order = ('n_items', 'item_name', 'item_desc', 'keyword',
                    'recency', 'n_comms', 'comm_name', 'isexporter')

    # Each feature is built and named explicitly.  This avoids depending on
    # how ``DataFrame.transform`` labels several anonymous functions and on a
    # positional rename of its output.
    names, columns = [], []
    for source in source_order:
        if source not in text_tags:
            names.append(source)
            columns.append(supp[source])
            continue
        for term_tag, terms in inquiry_terms:
            for prefix, func in reductions:
                names.append('{}_{}{}'.format(prefix, text_tags[source], term_tag))
                columns.append(compatibility(supp[source], terms, func))

    features = pd.concat(columns, axis=1, keys=names)
    return features


def estimate_feature_dist(inqs):
    """Estimate and save population mean and standard deviation for each feature.

    Features are computed for every inquiry with ``compute_features``, stacked
    row-wise, and the per-feature mean and standard deviation are written to
    ``feature_distribution.csv`` (columns ``mean`` and ``std``, one row per
    feature).

    Parameters
    ----------
    inqs : DataFrame
        Each row represents an inquiry.

    Raises
    ------
    ValueError
        If ``inqs`` has no rows.
    """
    frames = [compute_features(row) for _, row in inqs.iterrows()]
    if not frames:
        raise ValueError('inqs contains no inquiries; cannot estimate '
                         'the feature distribution')
    # A single concat replaces the repeated ``DataFrame.append`` calls, which
    # copied the accumulated frame at every step and are no longer available
    # in pandas 2.
    stacked = pd.concat(frames)
    dist = pd.concat([stacked.mean(), stacked.std()], axis=1)
    dist.columns = ['mean', 'std']
    dist.to_csv('feature_distribution.csv')


def normalize_features(X):
    """Return ``X`` standardized with the module-level ``mean`` and ``std``.

    Parameters
    ----------
    X : DataFrame
        Raw features.

    Returns
    -------
    DataFrame
        ``X`` with ``mean`` subtracted and the result divided by ``std``.
    """
    centered = X - mean
    scaled = centered / std
    return scaled


# Some useful values
alpha = 0.01  # step size handed to slr.gradient_descent
dist = pd.read_csv('feature_distribution.csv', index_col=0)
mean, std = dist['mean'], dist['std']
# Number of features, taken from the saved distribution (one row per feature)
# so that theta always matches the normalized feature matrix.
n = len(dist)

# For each incoming inquiry Series ``inq``, run:
# ================================================================
# Load current theta or initialize if not exists
if glob('theta.txt'):
    with open('theta.txt', 'r') as f:
        # split() tolerates a trailing newline or blank lines in the file
        theta = np.array([float(x) for x in f.read().split()])
else:
    # one weight per feature plus the intercept
    theta = np.zeros(n + 1)

# Get data ready
X = compute_features(inq)
X = normalize_features(X)
X['intercept'] = 1

# Predict probabilities
X['prob'] = slr.predict_prob(X, theta)

# Get top 10 suppliers
top10 = X.sort_values('prob', ascending=False).drop('prob', axis=1).head(10)

# Fetch user response Series ``y``

# Save y together with ``inq`` (broadcasted) and 10 ``BAN_REAL``s

# Update theta with one gradient-descent step per returned supplier
# (at most 10; fewer when the inquiry matched fewer suppliers)
for i in range(len(top10)):
    # Each row holds the features plus the intercept, so let numpy infer the
    # width instead of hard-coding it.
    x = top10.iloc[i].values.reshape((1, -1))
    theta, J = slr.gradient_descent(x, y[[i]], theta, alpha)

with open('theta.txt', 'w') as f:
    f.write('\n'.join([str(x) for x in theta]))
# ================================================================