**Business Problem**

New users on Airbnb can book a place to stay in 34,000+ cities across 190+ countries. By accurately predicting where a new user will book their first travel experience, Airbnb can share more personalized content with their community, decrease the average time to first booking, and better forecast demand.

**Data Description**

In this challenge, you are given a list of users along with their demographics, web session records, and some summary statistics. You are asked to predict which country a new user's first booking destination will be. All the users in this dataset are from the USA.

There are 12 possible outcomes for the destination country: 'US', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL', 'DE', 'AU', 'NDF' (no destination found), and 'other'. Please note that 'NDF' is different from 'other': 'other' means there was a booking, but it was to a country not included in the list, while 'NDF' means there wasn't a booking at all.

The training and test sets are split by dates. In the test set, you will predict all the new users with first activities after 7/1/2014 (note: this is updated on 12/5/15 when the competition restarted). In the sessions dataset, the data only dates back to 1/1/2014, while the users dataset dates back to 2010. 




**Solution**
1. Generate predictions, load them into a database, and use data visualization to inspect the results

2. API: 
    * Input: id and customer features
    * Output: id + predict + probability

In [18]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn import model_selection as ms
from sklearn import preprocessing as pp
from imblearn.over_sampling import BorderlineSMOTE
import joblib

from sklearn.ensemble import ExtraTreesClassifier
pd.set_option('mode.chained_assignment', None)

class Airbnb:
    """Pipeline steps for predicting a new user's first booking destination.

    The methods are meant to be called in order: ``load_data`` ->
    ``transform_data`` -> ``feature_engineering`` -> ``data_preprocessing``
    -> ``balance_data`` -> ``train_model`` -> ``dump_model`` for training, and
    the first four followed by ``predict`` for inference.

    None of the transformation steps mutates the frames passed to it; each
    returns new objects, so a step can safely be re-run on the same input.
    """

    def load_data(self):
        """Read the training users and the web-session logs from ``../data``.

        Returns:
            A ``(users, sessions)`` tuple of DataFrames.
        """
        df_raw = pd.read_csv('../data/train_users_2.csv', low_memory=True)
        df_sessions = pd.read_csv('../data/sessions.csv')
        return df_raw, df_sessions

    def transform_data(self, df: pd.DataFrame, df_sessions: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
        """Clean the users and sessions frames.

        Args:
            df: Users frame with at least ``age``, ``date_account_created``
                and ``timestamp_first_active`` (``%Y%m%d%H%M%S``).
            df_sessions: Session log; rows containing any missing value are
                discarded.

        Returns:
            ``(users, sessions)`` -- new frames; the inputs are left untouched.

        Note:
            Users whose (imputed) age is outside 16..99 are removed, so the
            result can have fewer rows than ``df``. If every ``age`` is
            missing the mean is NaN and the integer cast fails.
        """
        # Work on a copy so the caller's frame keeps `date_first_booking` and
        # its original `age`; this makes the step re-runnable.
        df = df.copy()

        # Missing ages are replaced by the mean of the ages that are present.
        df['age'] = df['age'].fillna(df['age'].mean())

        # Rows with a missing `first_affiliate_tracked` are intentionally kept:
        # data_preprocessing only maps the literal 'untracked' to 0, so a
        # missing value ends up encoded as tracked (1).

        # Sessions: keep only fully populated rows.
        df_sessions = df_sessions.dropna()

        df['date_account_created'] = pd.to_datetime(df['date_account_created'])
        df['timestamp_first_active'] = pd.to_datetime(
            df['timestamp_first_active'], format='%Y%m%d%H%M%S'
        )

        # `date_first_booking` is not available for new users; tolerate inputs
        # (e.g. API payloads) that never carried the column.
        df = df.drop(columns='date_first_booking', errors='ignore')

        # Truncates the fractional part introduced by the mean imputation.
        df['age'] = df['age'].astype('int64')

        # Discard implausible ages.
        df = df[(df['age'] > 15) & (df['age'] < 100)]

        return df, df_sessions

    def _add_session_count(self, df, df_sessions, mask, name):
        """Left-join onto ``df`` the per-user number of ``df_sessions[mask]`` rows as column ``name``."""
        counts = (
            df_sessions[mask]
            .groupby('user_id')
            .size()
            .reset_index(name=name)
            .rename(columns={'user_id': 'id'})
        )
        df = pd.merge(df, counts, on='id', how='left')
        # Users without matching sessions get 0. Assign the result back rather
        # than filling "in place" on a column selection, which does not
        # propagate to the frame under pandas Copy-on-Write.
        df[name] = df[name].fillna(0)
        return df

    def feature_engineering(self, df: pd.DataFrame, df_sessions: pd.DataFrame) -> pd.DataFrame:
        """Derive calendar features and per-user session counts.

        Args:
            df: Users frame as returned by ``transform_data``.
            df_sessions: Sessions frame with ``user_id``, ``action`` and
                ``action_type`` columns.

        Returns:
            A new frame (fresh ``RangeIndex``) with the added columns
            ``first_active``, ``days_from_active_to_account_created``,
            year/month/day/day_of_week/week_of_year for both dates,
            ``n_clicks`` and ``n_reviews``.
        """
        df = df.copy()

        # Date part (midnight) of the first-activity timestamp.
        df['first_active'] = df['timestamp_first_active'].dt.normalize()

        # Time between first activity and account creation, in whole days.
        df['days_from_active_to_account_created'] = (
            df['date_account_created'] - df['first_active']
        ).dt.days

        # Same five calendar parts for both dates; first_active first so the
        # resulting column order is stable.
        for source, suffix in (
            ('first_active', 'first_active'),
            ('date_account_created', 'account_created'),
        ):
            stamps = df[source].dt
            df[f'year_{suffix}'] = stamps.year
            df[f'month_{suffix}'] = stamps.month
            df[f'day_{suffix}'] = stamps.day
            df[f'day_of_week_{suffix}'] = stamps.dayofweek
            df[f'week_of_year_{suffix}'] = stamps.isocalendar().week

        # Number of sessions with action_type == 'click' per user.
        df = self._add_session_count(
            df, df_sessions, df_sessions['action_type'] == 'click', 'n_clicks'
        )
        # Number of sessions with action == 'reviews' per user.
        df = self._add_session_count(
            df, df_sessions, df_sessions['action'] == 'reviews', 'n_reviews'
        )

        return df

    def data_preprocessing(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode and rescale the engineered features into model inputs.

        Args:
            df: Frame as returned by ``feature_engineering``.

        Returns:
            A new frame holding ``id`` (and ``country_destination`` when
            present) plus numeric features only.

        Note:
            NOTE(review): the frequency encodings and the MinMaxScaler are fit
            on whatever frame is passed in, so at inference time they are
            re-estimated from the request batch instead of reusing the
            training statistics -- confirm whether that is intended.
        """
        df = df.copy()

        # language to binary, either is english or not
        df['language_en'] = np.where(df['language'] == 'en', 1, 0)

        # signup to binary, either is web or not
        df['signup_on_web'] = np.where(df['signup_app'] == 'Web', 1, 0)

        # first_affiliate_tracked to binary; anything but 'untracked'
        # (including a missing value) counts as tracked
        df['tracked'] = np.where(df['first_affiliate_tracked'] == 'untracked', 0, 1)

        # binary features from first_device_type
        df['first_device_apple'] = np.where(
            df['first_device_type'].isin(['Mac Desktop', 'iPhone', 'iPad']), 1, 0
        )
        df['first_device_desktop'] = np.where(
            df['first_device_type'].isin(['Mac Desktop', 'Desktop', 'Windows Desktop']), 1, 0
        )

        # Frequency encoding: replace each category by its relative frequency.
        for column in ('affiliate_channel', 'affiliate_provider', 'first_browser'):
            df[column] = df[column].map(df[column].value_counts(normalize=True))

        # Rescale to [0, 1].
        columns_to_rescale = ["age", "signup_flow", "n_reviews", "n_clicks"]
        scaler = pp.MinMaxScaler()
        df[columns_to_rescale] = scaler.fit_transform(df[columns_to_rescale])

        # Cyclical (sin/cos) encoding of the account-creation calendar parts:
        # (source column, output prefix, period).
        # NOTE(review): periods 30 and 52 are shorter than the longest month
        # (31 days) and ISO year (53 weeks) -- kept as-is to preserve features.
        cyclical_features = (
            ('month_account_created', 'month_account_created', 12),
            ('week_of_year_account_created', 'week_account_created', 52),
            ('day_account_created', 'day_account_created', 30),
            ('day_of_week_account_created', 'day_of_week_account_created', 7),
        )
        for source, prefix, period in cyclical_features:
            # Cast first: the ISO week column uses a nullable integer dtype.
            angle = df[source].astype('float64') * (2 * np.pi / period)
            df[f'{prefix}_sin'] = np.sin(angle)
            df[f'{prefix}_cos'] = np.cos(angle)

        # Raw calendar parts are no longer needed once encoded.
        temporal_columns = [
            "days_from_active_to_account_created",
            "year_first_active",
            "month_first_active",
            "day_first_active",
            "day_of_week_first_active",
            "week_of_year_first_active",
            "year_account_created",
            "month_account_created",
            "day_account_created",
            "day_of_week_account_created",
            "week_of_year_account_created",
        ]

        # Original categoricals (now encoded or unused) and original dates.
        cols_drop = [
            'gender', 'signup_method', 'language', 'first_affiliate_tracked',
            'signup_app', 'first_device_type', 'date_account_created',
            'timestamp_first_active', 'first_active',
        ]
        return df.drop(columns=temporal_columns + cols_drop)

    def balance_data(self, X_imb, y_imb, random_state=None):
        """Over-sample the minority destinations with BorderlineSMOTE.

        Args:
            X_imb: Feature matrix.
            y_imb: Target labels (pandas Series of country codes).
            random_state: Optional seed forwarded to BorderlineSMOTE for
                reproducible resampling.

        Returns:
            ``(X_resampled, y_resampled)``.
        """
        class_counts = y_imb.value_counts()
        # value_counts() sorts descending, so the first entry is the majority.
        majority_value = int(class_counts.iloc[0])

        # Desired size of each class as a fraction of the majority class.
        target_ratios = {
            'NDF': 1,
            'US': 0.5,
            'other': 0.18,
            'FR': 0.15,
            'IT': 0.13,
            'GB': 0.13,
            'ES': 0.13,
            'CA': 0.13,
            'DE': 0.13,
            'NL': 0.09,
            'AU': 0.09,
            'PT': 0.09,
        }

        # Over-sampling can only grow a class and only knows classes present
        # in y: never request fewer samples than a class already has, and skip
        # labels that do not occur.
        sampling_strategy = {
            label: max(int(majority_value * ratio), int(class_counts[label]))
            for label, ratio in target_ratios.items()
            if class_counts.get(label, 0) > 0
        }

        smote = BorderlineSMOTE(
            sampling_strategy=sampling_strategy, random_state=random_state
        )

        # Resample the data that was passed in (not notebook-level globals).
        X_resampled, y_resampled = smote.fit_resample(X_imb, y_imb)

        return X_resampled, y_resampled

    def train_model(self, model, X, y):
        """Fit ``model`` on a fixed 80% split of ``(X, y)`` and return it.

        The remaining 20% is split off (``random_state=42``) and not used by
        this method.
        """
        X_train, _X_holdout, y_train, _y_holdout = ms.train_test_split(
            X, y, train_size=0.8, random_state=42
        )

        model.fit(X_train, y_train)

        return model

    def dump_model(self, model):
        """Persist ``model`` under ``../models/`` as both pickle and joblib files."""
        os.makedirs('../models/', exist_ok=True)

        # Context manager guarantees the handle is flushed and closed.
        with open('../models/extratrees.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        joblib.dump(model, "../models/extratrees.joblib", compress=3)

    def load_model(self):
        """Load and return the model written by ``dump_model``.

        Only load artifacts you produced yourself: joblib files are
        pickle-based and can execute code when loaded.
        """
        return joblib.load("../models/extratrees.joblib")

    def predict(self, model, X_test):
        """Return ``model.predict_proba(X_test)`` (per-class probabilities)."""
        return model.predict_proba(X_test)

### Pipeline - Train and save model

In [19]:
# Training run, part 1: load the raw CSVs, clean them, derive features.
pipeline = Airbnb()

# Reads the training users and the session logs from ../data.
df_raw, df_sessions = pipeline.load_data()

# Imputes and filters age, parses the date columns, drops incomplete session rows.
df, df_sessions = pipeline.transform_data(df_raw, df_sessions)

# Adds calendar features plus per-user click and review counts.
df = pipeline.feature_engineering(df, df_sessions)


KeyboardInterrupt



In [None]:
# Training run, part 2: encode categoricals, rescale numerics, drop raw columns.
df= pipeline.data_preprocessing(df)

In [87]:
# Training run, part 3: split features from target, balance, fit and persist.
# `id` is an identifier, not a feature, so it is removed together with the target.
X = df.drop(['country_destination', 'id'], axis=1)
y = df['country_destination']

# Over-sample the minority destinations with BorderlineSMOTE.
X_balanced, y_balanced = pipeline.balance_data(X, y)

extra_trees = ExtraTreesClassifier(n_estimators=10)

# train_model fits on an 80% split of the balanced data and returns the fitted model.
extra_trees = pipeline.train_model(extra_trees, X_balanced, y_balanced)

# Writes the fitted model under ../models/ as both .pkl and .joblib.
pipeline.dump_model(extra_trees)

### Predict

In [20]:
# Prediction setup: fresh pipeline object plus the model saved by dump_model.
import json

pipeline = Airbnb()

# Same path that dump_model writes the joblib artifact to.
model = joblib.load('../models/extratrees.joblib')

In [21]:
#### input test
# Stand-in for an API request: the first rows of the test users, serialised
# as a JSON string with one object per user (orient='records').
test = pd.read_csv('../data/test_users.csv')
json_data = test.head().to_json(orient='records')


In [22]:
# Parse the JSON string back into Python objects (a list of per-user dicts).
json_data = json.loads(json_data)

In [23]:
# Rebuild a DataFrame from the parsed payload.
# NOTE(review): with an empty payload `df_raw` is not reassigned, so whatever
# an earlier cell left in it would be reused below -- confirm that is acceptable.
if json_data:
    df_raw = pd.DataFrame(json_data)

In [24]:
# Only the sessions frame is needed here; load_data also reads the training
# users CSV, whose result is discarded.
_, df_sessions = pipeline.load_data()

In [None]:
# Run the request rows through the same steps used for training.
# NOTE(review): transform_data removes users whose age is outside 16..99, so
# some requested ids may get no prediction -- confirm this is intended.
df, df_sessions = pipeline.transform_data(df_raw, df_sessions)

df = pipeline.feature_engineering(df, df_sessions)

# NOTE(review): data_preprocessing fits its frequency encodings and MinMaxScaler
# on the frame it receives, i.e. on this request batch rather than on the
# training data -- verify the resulting features match what the model saw.
df= pipeline.data_preprocessing(df)
# `id` is kept in df for the output but is not a model feature.
X = df.drop('id', axis=1)

In [32]:
# One row of class probabilities per user; columns are ordered like model.classes_.
predicted_country_destinations_proba = pipeline.predict(model, X)

classes = model.classes_
# Column index of the most likely destination and the probability it received.
predicted_classes = predicted_country_destinations_proba.argmax(axis=1)
class_proba = predicted_country_destinations_proba.max(axis=1)
# Translate the winning column indices into country labels.
predicted_class_names = list(classes[predicted_classes])

# Attach the prediction and its probability next to each user's id.
df['country_destination'] = predicted_class_names
df['proba'] = class_proba
