In [90]:
from typing import Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

In [87]:
# Register tqdm's pandas integration so that `progress_apply` (used further
# down to assign orders to grid points) is available on DataFrames.
tqdm.pandas()

# 1 - Validation framework

In [2]:
# Raw competition tables.
train_customers = pd.read_csv('../data/train_customers.csv')
test_customers = pd.read_csv('../data/test_customers.csv')
vendors = pd.read_csv('../data/vendors.csv')

# Pre-processed orders; the first CSV column is used as the index.
orders = pd.read_csv(
    './data-processed/processed_orders.csv',
    index_col=0,
)

In [3]:
orders.head()

Unnamed: 0,customer_id,latitude,longitude,vendor_id
0,FIC3GGQ,0.661219,0.426691,78
1,FIC3GGQ,0.661219,0.426691,78
2,FIC3GGQ,0.661219,0.426691,78
3,KOX4A1S,0.483372,0.560075,78
4,KOX4A1S,0.483372,0.560075,78


## Train and test data exploring

In [4]:
# Customer id columns of the train and test customer tables.
train_ids, test_ids = (
    customers['akeed_customer_id']
    for customers in (train_customers, test_customers)
)

Number of train customers

In [5]:
len(train_ids)

34674

Number of unique train customers

In [6]:
len(set(train_ids))

34523

Number of test customers

In [7]:
len(test_ids)

9768

Number of unique test customers

In [8]:
len(set(test_ids))

9753

Number of customers present in both the train and test sets

In [9]:
len(set(train_ids).intersection(test_ids))

0

Number of orders

In [10]:
len(orders)

95060

Number of orders for train customers

In [11]:
len(orders[orders['customer_id'].isin(train_ids)])

92732

Number of orders for test customers

In [12]:
len(orders[orders['customer_id'].isin(test_ids)])

0

We do not have any order data for the test users, so we need to make a new train/test split.  
But since we are going to recommend restaurants for a given ordering location and we ignore user data, we will use only the orders data.  
So we need to split the orders data that we have.

## Recommendations framework  
Code of classes available in ./src.py

Because our input dimension is location, we will aggregate individual location points into bigger points that are the centers of a grid laid over the location space.

In [110]:
def get_grid(
    orders: pd.DataFrame,
    x_n_points: int,
    y_n_points: int
) -> Tuple[pd.DataFrame, float, float]:
    """Build a regular rectangular grid over the bounding box of the orders.

    Parameters
    ----------
    orders : pd.DataFrame
        Must contain numeric ``longitude`` and ``latitude`` columns.
    x_n_points : int
        Number of grid points along the longitude axis (>= 1).
    y_n_points : int
        Number of grid points along the latitude axis (>= 1).

    Returns
    -------
    grid_points : pd.DataFrame
        One row per grid point with columns ``id`` (0 .. n-1), ``longitude``
        and ``latitude``. Rows are ordered longitude-major: all latitudes for
        the first longitude, then all latitudes for the second, and so on.
    x_diff : float
        Longitude extent divided by ``x_n_points``.
    y_diff : float
        Latitude extent divided by ``y_n_points``.

    Raises
    ------
    ValueError
        If ``x_n_points`` or ``y_n_points`` is smaller than 1.

    Notes
    -----
    ``x_diff`` / ``y_diff`` are ``extent / n_points``, which is slightly
    smaller than the actual distance between neighbouring grid points
    (``extent / (n_points - 1)``, because both ends are included).
    """
    if x_n_points < 1 or y_n_points < 1:
        raise ValueError('x_n_points and y_n_points must both be >= 1')

    max_x = orders['longitude'].max()
    min_x = orders['longitude'].min()
    max_y = orders['latitude'].max()
    min_y = orders['latitude'].min()

    x_diff = (max_x - min_x) / x_n_points
    y_diff = (max_y - min_y) / y_n_points

    x_range = np.linspace(min_x, max_x, x_n_points)
    y_range = np.linspace(min_y, max_y, y_n_points)

    # Cartesian product of both axes: each longitude is repeated for every
    # latitude, and the latitude axis is tiled once per longitude.
    grid_points = pd.DataFrame({
        'longitude': np.repeat(x_range, y_n_points),
        'latitude': np.tile(y_range, x_n_points),
    })
    grid_points = grid_points.reset_index().rename(columns={'index': 'id'})

    return grid_points, x_diff, y_diff

In [111]:
grid_points, x_diff, y_diff = get_grid(orders, 30, 30)

In [112]:
grid_points.head()

Unnamed: 0,id,longitude,latitude
0,0,-0.319538,-2.880815
1,1,-0.319538,-2.641947
2,2,-0.319538,-2.403079
3,3,-0.319538,-2.164211
4,4,-0.319538,-1.925343


In [113]:
fig = go.Figure()

# Plot only a sample of the orders to keep the figure light. The sample is
# seeded so the figure is reproducible, and capped at the number of available
# rows so the cell also works on small frames.
sub_orders = orders.sample(min(1000, len(orders)), random_state=123)
fig.add_trace(
    go.Scatter(
        x=sub_orders['longitude'],
        y=sub_orders['latitude'],
        mode='markers',
        name='Orders'
    )
)

fig.add_trace(
    go.Scatter(
        x=grid_points['longitude'],
        y=grid_points['latitude'],
        name='Grid points',
        mode='markers'
    )
)

fig.update_layout(
    title='Grid points',
    xaxis_title='longitude',
    yaxis_title='latitude'
)
# Zoom on a hard-coded window; anything outside these ranges is not shown.
fig.update_xaxes(range=[-0.5, 1])
fig.update_yaxes(range=[-3, 4.2])

fig.show()

In [80]:
def find_nearest_point(
    location: Tuple[float, float],
    points: pd.DataFrame,
    x_diff: float,
    y_diff: float
) -> int:
    """Return the ``id`` of the grid point closest to ``location``.

    Parameters
    ----------
    location : Tuple[float, float]
        ``(longitude, latitude)`` of the place to look up.
    points : pd.DataFrame
        Grid points with columns ``id``, ``longitude`` and ``latitude``.
        The frame is not modified.
    x_diff, y_diff : float
        Grid step along the longitude / latitude axis. Only points within
        two steps of ``location`` on each axis are searched; if that window
        contains no point, every point is searched instead.

    Returns
    -------
    int
        ``id`` of the nearest point by Euclidean distance in the
        (longitude, latitude) plane.

    Raises
    ------
    ValueError
        If ``points`` is empty.
    """
    if points.empty:
        raise ValueError('points must contain at least one grid point')

    x, y = location[0], location[1]

    # Restrict the search to a small window around the location.
    in_window = (
        (points['longitude'] < x + 2 * x_diff)
        & (points['longitude'] > x - 2 * x_diff)
        & (points['latitude'] < y + 2 * y_diff)
        & (points['latitude'] > y - 2 * y_diff)
    )
    # The window is only a shortcut: fall back to all points if it is empty.
    candidates = points[in_window] if in_window.any() else points

    # Squared distance has the same argmin as the distance itself, so the
    # square root is not needed. Computed column-wise, without writing a
    # helper column into a slice of the caller's frame.
    squared_distance = (
        (candidates['longitude'] - x) ** 2
        + (candidates['latitude'] - y) ** 2
    ).to_numpy()

    nearest_position = int(np.argmin(squared_distance))
    # Read from the integer ``id`` column directly so the result is not
    # upcast to float, then convert to a plain Python int.
    return int(candidates['id'].iloc[nearest_position])

In [119]:
grid_points, x_diff, y_diff = get_grid(orders, 100, 100)

In [120]:
# Map every order to the id of its nearest grid point (row by row, with a
# progress bar).
orders['point'] = orders.progress_apply(
    lambda order: find_nearest_point(
        location=(order['longitude'], order['latitude']),
        points=grid_points,
        x_diff=x_diff,
        y_diff=y_diff,
    ),
    axis=1,
)

100%|██████████| 95060/95060 [05:51<00:00, 270.61it/s]


In [121]:
# Store the grid-point ids with an integer dtype.
# NOTE(review): the row-wise lookup above appears to hand the ids back as
# floats, which is presumably why this cast is needed — confirm.
orders['point'] = orders['point'].astype('int')
orders.head()

Unnamed: 0,customer_id,latitude,longitude,vendor_id,point
0,FIC3GGQ,0.661219,0.426691,78,5651
1,FIC3GGQ,0.661219,0.426691,78,5651
2,FIC3GGQ,0.661219,0.426691,78,5651
3,KOX4A1S,0.483372,0.560075,78,6748
4,KOX4A1S,0.483372,0.560075,78,6748


In [130]:
# Count the orders assigned to each aggregated point. Plotting the raw
# `point` column would show the distribution of point ids, not of order counts.
orders_per_point = (
    orders.groupby('point')
    .size()
    .reset_index(name='orders_number')
)

px.violin(
    orders_per_point, y='orders_number', box=True,
    title='Distribution of orders number for aggregated points'
)

In [129]:
# Number of distinct vendors that received at least one order from each
# aggregated point.
to_plot = (
    orders.groupby('point')['vendor_id']
    .nunique()
    .reset_index(name='vendors_number')
)

px.violin(
    to_plot, y='vendors_number', box=True,
    title="Distribution of unique vendors number for aggregated points"
)

Number of unique order locations

In [132]:
# Pair up the coordinate columns directly instead of building each tuple with
# a row-wise `apply`.
orders['place'] = list(zip(orders['longitude'], orders['latitude']))
orders['place'].nunique()

28623

Number of aggregated points with assigned orders

In [133]:
len(orders['point'].unique())

1792

Then we will predict recommendations for each aggregated point and validate the predictions using the orders data from the individual location points assigned to that aggregated point.  
During validation we will ignore aggregated points with no assigned orders.

In [79]:
from abc import ABC, abstractmethod
from typing import Tuple, List
from sklearn.model_selection import train_test_split

# TODO

class Model(ABC):
    """Abstract base class for location-based vendor recommenders.

    Subclasses implement ``fit`` and ``get_ranking``; ``predict`` returns the
    top of the ranking. ``is_fitted`` starts as ``False`` and is never set to
    ``True`` here, so concrete ``fit`` implementations presumably have to set
    it themselves, otherwise ``predict`` keeps raising.
    """

    def __init__(
        self,
        vendors: pd.DataFrame
    ) -> None:
        """Store the vendors table and mark the model as not fitted."""
        self.vendors = vendors
        self.is_fitted = False

    @abstractmethod
    def fit(
        self,
        train_orders: pd.DataFrame, # columns: point, vendor_id
        points: pd.DataFrame # columns: id, longitude, latitude (presumably the grid from get_grid)
    ) -> None:
        """Train the model on orders that were assigned to grid points."""
        pass

    @abstractmethod
    def get_ranking(
        self,
        location: Tuple[float, float]
    ) -> List[int]:
        """Return the full ranking for ``location``, best first."""
        pass

    def predict(
        self,
        location: Tuple[float, float],
        n_recomendations: int
    ) -> List[int]:
        """Return the first ``n_recomendations`` entries of the ranking.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet (``is_fitted`` is falsy).
        """
        if not self.is_fitted:
            # RuntimeError subclasses Exception, so callers that catch the
            # previously raised plain Exception keep working.
            raise RuntimeError('First fit the model!')

        ranking = self.get_ranking(location)
        return ranking[0:n_recomendations]


In [23]:
class Validator:

    def __init__(
        self,
        model: Model,
        orders: pd.DataFrame,
        vendors: pd.DataFrame,
        grid_x_n_points: int,
        grid_y_n_points: int,
        test_size: float = 0.3,
        random_seed: int = 123
    ):
        """Build the grid and assign every order to its nearest grid point.

        Parameters
        ----------
        model : Model
            Recommender to validate.
        orders : pd.DataFrame
            Orders with ``longitude`` and ``latitude`` columns. The frame is
            copied; the caller's frame is left untouched.
        vendors : pd.DataFrame
            Vendors table (stored, not used yet).
        grid_x_n_points, grid_y_n_points : int
            Grid resolution passed to ``get_grid``.
        test_size : float
            Stored for the train/test split, which is not implemented yet.
        random_seed : int
            Stored for the train/test split, which is not implemented yet.
        """
        self.model = model
        self.vendors = vendors
        self.test_size = test_size
        self.random_seed = random_seed

        # make grid
        grid_points, x_diff, y_diff = get_grid(
            orders,
            grid_x_n_points,
            grid_y_n_points
        )
        self.grid_points = grid_points

        # find nearest point for orders; work on a copy so that the caller's
        # DataFrame (and any 'point' column it already has) is not overwritten
        orders = orders.copy()
        orders['point'] = orders.progress_apply(
            lambda row: find_nearest_point(
                (row['longitude'], row['latitude']),
                grid_points, x_diff, y_diff
            ),
            axis=1
        ).astype('int')
        self.orders = orders

        # TODO: split points for test and train sets
        # (validate() reads self.X_test and self.y_test, which are not set yet)

        # TODO: fit model


    def validate(
        self,
        n_recomendations
    ):

        n_relevant_items = 0
        n_recommended_items = 0
        n_possible_relevant_items = 0

        for i in range(len(self.X_test)):

            X = self.X_test.iloc[i]
            y = self.y_test[i]