In [1]:
import pandas as pd
import numpy as np

import pyarrow

from sklearn.svm import SVC

from pickle import dump
from pickle import load

## Load the data

In [2]:
# Read the client table from the parquet file using the pyarrow engine
df = pd.read_parquet(
    path='client_database.parquet',
    engine='pyarrow',
)

## Prepare data for prediction

In [3]:
# Gather all data cleaning operations in one function
def clean_data(df):
    """Return a cleaned copy of the raw client table.

    The cleaning steps are:

    1. Drop the 'number', 'name', 'offer_code' and 'customer_code' columns.
    2. Fill missing values in 'age', 'offer_value', 'salary', 'phone_calls',
       'estimated_expenses' and 'emails' with that column's median.
    3. Discard rows where 'center' or 'customer_type' is missing.

    The medians are computed before any row is discarded. The input frame is
    not modified; the surviving rows keep their original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw client data containing all the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If any of the columns named above is absent from ``df``.
    """
    unneeded_columns = ['number', 'name', 'offer_code', 'customer_code']
    median_filled_columns = ['age', 'offer_value', 'salary', 'phone_calls',
                             'estimated_expenses', 'emails']
    required_columns = ['center', 'customer_type']

    # drop() without inplace returns a new frame, so the caller's data stays intact
    cleaned = df.drop(columns=unneeded_columns)

    # Take the medians of just the columns being filled: a frame-wide
    # df.median() fails on pandas 2.x when string columns are present, and
    # recomputing it for every column is wasted work.
    medians = cleaned[median_filled_columns].median()

    # fillna with a Series only touches the columns named in its index
    cleaned = cleaned.fillna(medians)

    # Rows without a center or customer type cannot be used
    cleaned = cleaned.dropna(subset=required_columns)

    return cleaned

In [4]:
from sklearn import preprocessing

# Gather all data preprocessing operations in one function
def preprocess_data(df):
    """Encode, filter and rescale the cleaned client data.

    The steps are:

    1. Columns that are not float64 and hold more than two distinct values
       are one-hot encoded.
    2. 'center' (B/A), 'gender' (male/female) and 'accepted' (yes/no) are
       encoded as 1/0.
    3. Columns are put into a fixed order; a KeyError is raised if an
       expected column (including any one-hot column) is missing.
    4. Columns whose absolute correlation with 'accepted' is below 0.01 are
       dropped.
    5. All remaining columns, 'accepted' included, are min-max scaled.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned client data.

    Returns
    -------
    pandas.DataFrame
        A new frame of scaled values with a default integer index.
    """
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never written to.
    df = df.copy()

    # Everything that is not float64 is treated as categorical; categorical
    # columns with more than two distinct values get one-hot encoded.
    float_columns = set(df.select_dtypes(include=['float64']).columns)
    cat_multi = [column for column in df.columns
                 if column not in float_columns and df[column].nunique() > 2]

    # Encode binary categorical features. Assigning the result back replaces
    # the chained `df[col].replace(..., inplace=True)` form, which does not
    # update the frame under pandas Copy-on-Write.
    binary_encodings = {
        'center': {'B': 1, 'A': 0},
        'gender': {'male': 1, 'female': 0},
        'accepted': {'yes': 1, 'no': 0},
    }
    for column, mapping in binary_encodings.items():
        df[column] = df[column].replace(mapping)

    # One-hot-encoding
    df = pd.get_dummies(df, columns=cat_multi)

    # Re-order columns
    ordered_columns = ['accepted', 'gender', 'age', 'phone_calls', 'emails',
                       'salary', 'offer_value', 'estimated_expenses', 'center',
                       'customer_type_C', 'customer_type_Q', 'customer_type_S',
                       'offer_class_High', 'offer_class_Medium', 'offer_class_Premium']
    df = df[ordered_columns]

    # Remove columns not correlated enough with the target feature.
    # NOTE(review): this selection is recomputed from the data passed in and
    # needs the 'accepted' column, so the surviving feature set can differ
    # from the one the saved model was trained on -- confirm this is intended.
    not_enough_threshold = 0.01
    target_corr = df.corr()['accepted']
    weak_columns = target_corr[target_corr.abs() < not_enough_threshold].index
    df = df.drop(columns=list(weak_columns))

    # Rescale data between 0 and 1.
    # NOTE(review): the scaler is fitted on this data rather than loaded from
    # training, so the scaling may not match what the model saw -- confirm.
    scaler = preprocessing.MinMaxScaler()
    x_scaled = scaler.fit_transform(df.values)  # df.values is a numpy array

    # Rebuilding the frame from the array resets the index to 0..n-1
    return pd.DataFrame(x_scaled, columns=df.columns)

In [5]:
# Clean the raw table, then encode/scale it, in a single pass
df = preprocess_data(clean_data(df))

## Load the model

In [6]:
# Load the model from disk
filename = 'finalized_model.sav'
# NOTE(review): unpickling can execute arbitrary code -- only load model
# files that come from a trusted source.
# The context manager closes the file handle once the model is read.
with open(filename, 'rb') as model_file:
    model = load(model_file)

## Predict

In [7]:
# Keep the target column aside and remove it from the feature frame
y = df['accepted']
del df['accepted']

In [8]:
# Run the loaded model on the feature frame (the 'accepted' target was popped off above)
predictions = model.predict(df)

## Return list of customers to which an offer should be sent

In [9]:
# Display the raw prediction array
predictions

array([1., 0., 1., ..., 0., 0., 0.])

In [10]:
# Wrap the prediction array in a Series with a default 0..n-1 index
predicted = pd.Series(data=predictions)

In [11]:
# Put the true labels and the model output side by side with the features
df = df.assign(actual=y, predicted=predicted)
df.head()

Unnamed: 0,gender,age,phone_calls,emails,salary,estimated_expenses,customer_type_C,customer_type_Q,customer_type_S,offer_class_High,offer_class_Medium,offer_class_Premium,actual,predicted
0,0.0,0.177419,0.0,0.0,0.412503,0.517787,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,1.0,0.193548,0.125,0.2,0.295806,0.287459,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.193548,0.125,0.3,0.295806,0.089689,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.193548,0.125,0.2,0.295806,0.376244,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.112903,0.125,0.2,0.295806,0.173802,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [12]:
# Number of rows where the prediction disagrees with the actual label
int((df['actual'] != df['predicted']).sum())

255

In [13]:
# Derive the counts from the frame instead of hard-coding numbers copied from
# an earlier output, so the figure stays correct when the data or model changes.
n_total = len(df)
n_mismatched = int((df['actual'] != df['predicted']).sum())
print('Accuracy comparing to actual values: ', 100 - (n_mismatched * 100 / n_total))

Accuracy comparing to actual values:  80.14018691588785


## Save predictions to CSV

In [14]:
# Write the scaled features together with the 'actual' and 'predicted' columns.
# `index` is a boolean flag: pass False explicitly to leave the row index out.
df.to_csv('final_prediction.csv', index=False, header=True)