In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from geopy.distance import geodesic

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

import sklearn.cluster as cluster
from sklearn.cluster import KMeans

#from sklearn.mixture import GMM

from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import dendrogram, linkage

import warnings
from matplotlib import pyplot as plt
import itertools
#Skip to reading Step 2. way down 

warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv("merged_training_dataset.csv")

In [3]:
# Keep only the rows whose order_status is 'delivered'; every other status is
# dropped before any feature is computed.
dataset = dataset[dataset['order_status'] == 'delivered']
len(dataset)

115728

In [4]:
dataset.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'quantity', 'product_id', 'seller_id', 'shipping_limit_date',
       'product_price', 'freight_value', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'product_category', 'review_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'seller_lat', 'sellet_lng', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'customer_lat', 'customer_lng'],
      dtype='object')

In [5]:
# Rows missing any of these fields cannot be turned into features later
# (timestamps feed the time deltas, lat values feed the distance, and the
# category feeds the product clustering), so they are removed here.
# Re-assignment is used instead of inplace=True so the cell reads as a plain
# transformation of `dataset`.
required_columns = [
    'order_delivered_customer_date',
    'order_delivered_carrier_date',
    'order_approved_at',
    'seller_lat',
    'customer_lat',
    'product_category',
]
dataset = dataset.dropna(subset=required_columns)
len(dataset)

113487

In [6]:
dataset.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,quantity,product_id,...,seller_city,seller_state,seller_lat,sellet_lng,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_lat,customer_lng
0,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,delivered,2017-09-13T08:59:02Z,2017-09-13T09:45:35Z,2017-09-19T18:34:16Z,2017-09-20T23:43:48Z,2017-09-29T00:00:00Z,1,4244733e06e7ecb4970a6e2683c13e61,...,volta redonda,SP,-22.498183,-44.123614,871766c5855e863f6eccc05f988b23cb,28013,campos dos goytacazes,RJ,-21.758076,-41.312633
1,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,delivered,2017-04-26T10:53:06Z,2017-04-26T11:05:13Z,2017-05-04T14:35:00Z,2017-05-12T16:04:24Z,2017-05-15T00:00:00Z,1,e5f2d52b802189ee658865ca93d83a8f,...,sao paulo,SP,-23.566258,-46.518417,eb28e67c4c0b83846050ddfb8a35d051,15775,santa fe do sul,SP,-20.212393,-50.941471
2,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,delivered,2018-01-14T14:33:31Z,2018-01-14T14:48:30Z,2018-01-16T12:36:48Z,2018-01-22T13:19:16Z,2018-02-05T00:00:00Z,1,c777355d18b72b67abbeef9df44fd0fd,...,borda da mata,MG,-22.264094,-46.158564,3818d81c6709e39d06b2738a8d3a2474,35661,para de minas,MG,-19.860439,-44.597972
3,00024acbcdf0a6daa1e931b038114c75,d4eb9395c8c0431ee92fce09860c5a06,delivered,2018-08-08T10:00:35Z,2018-08-08T10:10:18Z,2018-08-10T13:28:00Z,2018-08-14T13:32:39Z,2018-08-20T00:00:00Z,1,7634da152a4610f1595efa32f14722fc,...,franca,SP,-20.548228,-47.395897,af861d436cfc08b2c2ddefd0ba074622,12952,atibaia,SP,-23.144923,-46.53983
4,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,delivered,2017-02-04T13:57:51Z,2017-02-04T14:10:13Z,2017-02-16T09:46:09Z,2017-03-01T16:42:31Z,2017-03-17T00:00:00Z,1,ac6c3623068f30de03045865e4e10089,...,loanda,PR,-22.931427,-53.133759,64b576fb70d441e8f1b2d7d446e483c5,13226,varzea paulista,SP,-23.249008,-46.824961


In [7]:
# Parse every timestamp column. Item assignment (dataset[col] = ...) is used
# rather than attribute assignment: with attribute assignment a misspelled
# column name silently creates a new attribute instead of failing.
datetime_columns = [
    'order_purchase_timestamp',
    'order_estimated_delivery_date',
    'order_delivered_customer_date',
    'order_approved_at',
    'shipping_limit_date',
    'order_delivered_carrier_date',
    'review_creation_date',
    'review_answer_timestamp',
]
for column_name in datetime_columns:
    dataset[column_name] = pd.to_datetime(dataset[column_name])

# Normalise the textual "missing" markers to a single "NIL" category.
# Assign the result back instead of calling replace(inplace=True) on the
# column accessor, which is not guaranteed to write through to `dataset`.
dataset['product_category'] = dataset['product_category'].replace(
    ['None', 'NaN', np.nan], "NIL"
)

# Freight cost per unit, rounded to 2 decimals. Vectorised division replaces
# the row-by-row apply(); the resulting values are the same.
dataset['freight_rate'] = (dataset['freight_value'] / dataset['quantity']).round(2)

#dataset.pivot(index='', columns='name', values='dollars')

In [8]:
# Key that identifies one order line for one payment type.
columns = [
    'order_id', 'product_id', 'seller_id', 'customer_id',
    'customer_unique_id', 'review_id', 'quantity', 'payment_type',
]

# Total paid per key, broadcast back onto every row of that key ...
payment_groups = dataset.groupby(columns)['payment_value']
dataset['payment_sum'] = payment_groups.transform('sum')

# ... then keep a single representative row per key.
dataset = dataset.drop_duplicates(subset=columns, keep="first")

In [9]:
# Spread payment_sum into one column per payment_type (missing combinations
# become 0). pivot_table aggregates with the mean by default; the preceding
# drop_duplicates leaves one row per (key, payment_type), so the mean is that
# single payment_sum value.
dataset2 = pd.pivot_table(dataset, index=['order_id', 'product_id', 'seller_id', 'customer_id', 'customer_unique_id','review_id', 'quantity']
               , columns='payment_type', values='payment_sum', fill_value = 0).reset_index()

In [10]:
# Collapse to one row per order line (payment type no longer part of the key).
# The key list is rebuilt with a filter instead of list.remove(), so running
# this cell a second time does not raise ValueError.
columns = [name for name in columns if name != "payment_type"]
dataset = dataset.drop_duplicates(subset=columns, keep="first")

In [11]:
# Attach the per-payment-type columns from dataset2. how="right" keeps every
# key combination present in dataset2.
dataset = pd.merge(dataset,dataset2, on = ['order_id', 'product_id', 'seller_id', 'customer_id', 'customer_unique_id','review_id', 'quantity'], how = "right")

In [12]:
dataset.drop(['payment_type', 'payment_sum'], axis = 1, inplace = True)

In [13]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108716 entries, 0 to 108715
Data columns (total 47 columns):
order_id                         108716 non-null object
customer_id                      108716 non-null object
order_status                     108716 non-null object
order_purchase_timestamp         108716 non-null datetime64[ns, UTC]
order_approved_at                108716 non-null datetime64[ns, UTC]
order_delivered_carrier_date     108716 non-null datetime64[ns, UTC]
order_delivered_customer_date    108716 non-null datetime64[ns, UTC]
order_estimated_delivery_date    108716 non-null datetime64[ns, UTC]
quantity                         108716 non-null int64
product_id                       108716 non-null object
seller_id                        108716 non-null object
shipping_limit_date              108716 non-null datetime64[ns, UTC]
product_price                    108716 non-null float64
freight_value                    108716 non-null float64
payment_sequential        

In [14]:
def split_train_test(data, test_ratio, seed=1234):
    """Shuffle the rows of ``data`` and split them into train and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected positionally with ``iloc``.
    test_ratio : float
        Fraction of rows placed in the test part, between 0 and 1 inclusive.
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    seed : int, default 1234
        Seed for the shuffle. The default reproduces the previously
        hard-coded value.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    # This seeds NumPy's *global* generator, exactly as before, so anything
    # drawing from it after this call is affected too.
    np.random.seed(seed)
    shuffled_indices = np.random.permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [15]:
# Hold out 20% of the rows as the test set; the rest is used for training.
train, test = split_train_test(dataset, 0.2)

print(len(train), len(test))

86973 21743


In [16]:
# Binary target: True when the review score is 3 or higher, False otherwise.
y_train = train['review_score'] >= 3 
y_test = test['review_score'] >= 3

In [17]:
def performance_metrics(y, y_pred, labels=(False, True)):
    """Return ``(accuracy, BER)`` for a binary classification result.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same length as ``y``.
    labels : tuple, default ``(False, True)``
        ``(negative_label, positive_label)``. The default also matches 0/1
        labels because ``0 == False`` and ``1 == True``.

    Returns
    -------
    accuracy : float
        Fraction of positions where ``y_pred`` equals ``y``.
    BER : float
        Balanced error rate, ``1 - 0.5 * (TPR + TNR)``. It is ``nan`` when
        ``y`` contains no positive or no negative example, because the
        corresponding rate is then undefined.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    actual = np.asarray(y)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError("y and y_pred must have the same length")
    if actual.size == 0:
        raise ValueError("y and y_pred must not be empty")

    negative_label, positive_label = labels
    actual_pos = actual == positive_label
    actual_neg = actual == negative_label
    predicted_pos = predicted == positive_label
    predicted_neg = predicted == negative_label

    # Counting with masks (instead of indexing a confusion matrix) keeps the
    # four cells well defined even when only one class shows up in the data;
    # a data-derived matrix collapses to 1x1 in that case.
    TP = int(np.sum(actual_pos & predicted_pos))
    FP = int(np.sum(actual_neg & predicted_pos))
    FN = int(np.sum(actual_pos & predicted_neg))
    TN = int(np.sum(actual_neg & predicted_neg))

    accuracy = float(np.mean(predicted == actual))
    true_positive_rate = TP / (TP + FN) if (TP + FN) else float("nan")
    true_negative_rate = TN / (TN + FP) if (TN + FP) else float("nan")
    BER = 1 - 0.5 * (true_positive_rate + true_negative_rate)
    return accuracy, BER

In [18]:
def hei_prod(train, cluster_counts=(2, 3, 7, 10)):
    """One-hot encode product categories by their cluster at several cut levels.

    Each product category is described by two numbers computed from ``train``:
    the count of ``order_purchase_timestamp`` values (Frequency) and the sum
    of ``payment_value`` (Monetary). Both are log-transformed, linked with
    SciPy's ``linkage(..., 'weighted')`` and the resulting tree is cut with
    ``fcluster(..., criterion='maxclust')`` once per entry of
    ``cluster_counts``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``product_category``, ``order_purchase_timestamp`` and
        ``payment_value``.
    cluster_counts : sequence of int, default ``(2, 3, 7, 10)``
        Maximum number of clusters for Level1, Level2, ... in that order.
        The default reproduces the previously hard-coded cuts.

    Returns
    -------
    pandas.DataFrame
        Column ``productcategory`` followed by the dummy columns
        ``Level<i>_<cluster id>`` for every level.
    """
    per_category = train.groupby('product_category')

    product_cat = per_category.order_purchase_timestamp.count().reset_index()
    product_cat.columns = ['product_category', 'Frequency']
    ms = per_category.payment_value.sum().reset_index()
    ms.columns = ['product_category', 'Monetary']
    df_prd = pd.merge(product_cat, ms, on='product_category')

    log_pd_data = pd.DataFrame({
        # +0.1 keeps the log finite when a category's payments sum to 0.
        'Monetary': np.log(df_prd['Monetary'] + 0.1),
        'Frequency': np.log(df_prd['Frequency']),
    })
    linked = linkage(log_pd_data, 'weighted')

    # One cut of the same tree per requested level. (A former k=5 cut was
    # computed but never used, so it is no longer evaluated.)
    level_table = {'productcategory': list(df_prd['product_category'])}
    level_names = []
    for position, k in enumerate(cluster_counts, start=1):
        level_name = 'Level%d' % position
        level_table[level_name] = fcluster(linked, k, criterion='maxclust')
        level_names.append(level_name)

    return pd.get_dummies(pd.DataFrame(level_table), columns=level_names)
    
def kclus_cust(train, random_state=None):
    """Assign every customer to one of two K-Means clusters on log-RFM values.

    Per ``customer_unique_id`` the function computes Recency (days between
    the customer's latest purchase and the latest purchase in ``train``),
    Frequency (count of ``order_purchase_timestamp``) and Monetary (sum of
    ``payment_value``), log-transforms them and fits ``KMeans`` with two
    clusters.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``customer_unique_id``, ``order_purchase_timestamp``
        (datetime) and ``payment_value``.
    random_state : int or None, default None
        Forwarded to ``KMeans``. ``None`` keeps the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``Customer`` and ``cluster``.
    """
    per_customer = train.groupby('customer_unique_id')

    recency_scores = per_customer.order_purchase_timestamp.max().reset_index()
    recency_scores.columns = ['customer_unique_id', 'MaxPurchaseDate']
    latest_purchase = recency_scores['MaxPurchaseDate'].max()
    recency_scores['Recency'] = (latest_purchase - recency_scores['MaxPurchaseDate']).dt.days

    frequency_scores = per_customer.order_purchase_timestamp.count().reset_index()
    frequency_scores.columns = ['customer_unique_id', 'Frequency']

    monetory_scores = per_customer.payment_value.sum().reset_index()
    monetory_scores.columns = ['customer_unique_id', 'Monetary']

    df_rfm = pd.merge(recency_scores, frequency_scores, on='customer_unique_id')
    df_rfm = pd.merge(df_rfm, monetory_scores, on='customer_unique_id')

    log_data = pd.DataFrame({
        # log(0) is undefined, so a small constant is added where 0 can occur.
        'Monetary': np.log(df_rfm['Monetary'] + 0.1),
        'Recency': np.log(df_rfm['Recency'] + 0.1),
        'Frequency': np.log(df_rfm['Frequency']),
    })

    # DataFrame.as_matrix() no longer exists in current pandas; .values
    # returns the same 2-D array and is available in every version.
    matrix = log_data.values

    kmeans = KMeans(init='k-means++', n_clusters=2, n_init=100, random_state=random_state)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    return pd.DataFrame({'Customer': df_rfm['customer_unique_id'], 'cluster': clusters})

In [19]:
# Cluster the product categories and keep the one-hot table for the feature
# builders below.
# NOTE(review): this is fitted on the full `dataset` (train and test rows),
# not on `train` alone — confirm that letting test rows shape this feature is
# intended.
one_hot_product = hei_prod(dataset)

#one_hot_customer = kclus_cust(dataset)

In [25]:
# Distinct product categories found in the data.
categories = dataset['product_category'].unique().tolist()
# Number of training rows per seller (columns: seller_id, Count); read by
# features() as the seller "popularity".
seller_popularities = pd.DataFrame(train.groupby('seller_id').size().reset_index(name = "Count"))


def features(data):
    """Build the numeric feature vector for a single order row.

    Parameters
    ----------
    data : pandas.Series
        One row of the prepared frame. The timestamp columns must already be
        datetimes and the pivoted payment columns (``credit_card``,
        ``debit_card``, ``voucher``, ``boleto``) must be present.

    Returns
    -------
    list
        ``[review_length, delivery_days, approving_days, processing_days,
        shipping_days, purchase_month, credit_pay, debit_pay, voucher_pay,
        boleto_pay, desc_length, photos, price, distance_miles,
        days_to_review, seller_popularity, freight_rate]``

    Notes
    -----
    Reads the module-level ``seller_popularities`` frame. This name is
    re-defined further down the notebook; the later definitions shadow this
    one once their cells run.
    """
    def _zero_if_null(value):
        # Missing numeric fields count as 0.
        return 0 if pd.isnull(value) else value

    review_text = data['review_comment_message']
    review_length = 0 if pd.isnull(review_text) else len(review_text)

    # Actual minus estimated delivery: negative when the parcel arrived early.
    delivery_time = data['order_delivered_customer_date'] - data['order_estimated_delivery_date']
    approving_time = data['order_approved_at'] - data['order_purchase_timestamp']
    processing_time = data['order_delivered_carrier_date'] - data['order_approved_at']
    # Shipping limit minus carrier hand-over: positive when handed over in time.
    shipping_time = data['shipping_limit_date'] - data['order_delivered_carrier_date']
    time_to_review = data['review_answer_timestamp'] - data['order_delivered_customer_date']
    purchase_month = data['order_purchase_timestamp'].month

    credit_pay = data['credit_card']
    debit_pay = data['debit_card']
    voucher_pay = data['voucher']
    boleto_wallet_pay = data['boleto']

    # The null-safe price used to be overwritten by the raw column value a few
    # lines later; only the null-safe value is kept now.
    price = _zero_if_null(data['product_price'])
    desc_length = _zero_if_null(data['product_description_lenght'])
    photos = _zero_if_null(data['product_photos_qty'])

    seller_coord = (data['seller_lat'], data['sellet_lng'])
    customer_coord = (data['customer_lat'], data['customer_lng'])
    distance = geodesic(seller_coord, customer_coord).miles

    # `value in series` tests the Series *index*, not its values, so the old
    # membership check never matched and popularity was always 0. Select the
    # matching rows directly and fall back to 0 for sellers absent from train.
    seller_counts = seller_popularities.loc[
        seller_popularities["seller_id"] == data["seller_id"], "Count"
    ]
    popularity = seller_counts.iloc[0] if len(seller_counts) else 0

    freight_rate = data["freight_rate"]

    # approving_time.days used to be listed twice; the duplicate column is gone.
    return [review_length, delivery_time.days, approving_time.days, processing_time.days,
            shipping_time.days, purchase_month, credit_pay, debit_pay, voucher_pay,
            boleto_wallet_pay, desc_length, photos, price, distance,
            time_to_review.days, popularity, freight_rate]

features(dataset.iloc[1,])

NameError: name 'feat1' is not defined

In [21]:
# Build one feature vector per training row (row-by-row, so this is slow on
# large frames).
X_train = [features(row) for _, row in train.iterrows()]

In [22]:
# Same extraction for the held-out rows.
X_test = [features(row) for _, row in test.iterrows()]

In [23]:
# Min-max scale the features. The scaler is fitted on the training matrix
# only and then applied unchanged to the test matrix.
min_max_scaler = preprocessing.MinMaxScaler()
std_X_train = min_max_scaler.fit_transform(X_train)
std_X_test = min_max_scaler.transform(X_test)

In [None]:
# Linear Classification models

def perform_regression(val, X_train, y_train, X_test, y_test, hyperparameters):
    """Plot 3-fold cross-validated accuracy against the regularisation value C.

    Parameters
    ----------
    val : int
        Model selector: 1 for ``LogisticRegression``, 2 for ``LinearSVC``.
    X_train, y_train : array-like
        Data passed to ``cross_val_score``.
    X_test, y_test : array-like
        Not used; kept so existing calls keep working.
    hyperparameters : sequence
        Values of ``C`` to evaluate, in plotting order.

    Returns
    -------
    None
        The function only draws and shows the plot.

    Raises
    ------
    ValueError
        If ``val`` is not 1 or 2 (previously this surfaced as an unbound
        ``model`` variable).
    """
    if val == 1:
        build_model = lambda c: LogisticRegression(C=c)
    elif val == 2:
        build_model = lambda c: LinearSVC(C=c)
    else:
        raise ValueError("val must be 1 (LogisticRegression) or 2 (LinearSVC), got %r" % (val,))

    # Mean accuracy over the 3 validation folds, in percent, one per C.
    cv_accuracy = [
        cross_val_score(build_model(c), X_train, y_train, scoring='accuracy', cv=3).mean() * 100
        for c in hyperparameters
    ]

    plt.title('Accuracy vs Hyperparameter value')
    # The curve is held-out fold accuracy, so it is labelled as such rather
    # than "train".
    plt.plot(hyperparameters, cv_accuracy, "b-+", label="cross-validation")
    plt.xlabel("Hyperparameter value")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

hyperparameters = [ 0.01, 0.1, 1, 10, 100]

In [None]:
perform_regression(1, std_X_train, y_train, std_X_test, y_test, hyperparameters)

In [None]:
# Fit logistic regression with C = 10 on the scaled training features.
log_reg = LogisticRegression(C = 10 )
log_reg.fit(std_X_train, y_train)

y_train_pred = log_reg.predict(std_X_train)

# (accuracy, BER) on the training rows; printed because only the last
# expression of a cell is displayed automatically.
print(performance_metrics(y_train, y_train_pred))
      
# (accuracy, BER) on the held-out rows.
y_test_pred = log_reg.predict(std_X_test)
performance_metrics(y_test, y_test_pred)

In [None]:
perform_regression(2, std_X_train, y_train, std_X_test, y_test, hyperparameters)

In [None]:
# Fit a LinearSVC with C = 10 on the scaled training features.
lin_SVM = LinearSVC(C=10)
lin_SVM.fit(std_X_train, y_train)

y_train_pred = lin_SVM.predict(std_X_train)

# (accuracy, BER) on the training rows.
print(performance_metrics(y_train, y_train_pred))
      
      
# (accuracy, BER) on the held-out rows.
y_test_pred = lin_SVM.predict(std_X_test)
performance_metrics(y_test, y_test_pred)

In [None]:
# Unfitted candidate models for the cross-validation cells below.
# Note: this rebinds `log_reg` and `lin_SVM`, discarding the fitted models the
# earlier cells stored under those names; `lin_SVM` is now an SVC with a
# linear kernel rather than a LinearSVC.
log_reg = LogisticRegression(C = 1.0, class_weight='balanced')
random_forest = RandomForestClassifier(n_estimators=5)
lin_SVM = SVC(kernel="linear", C=1.0)
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [None]:
accuracy = cross_val_score(random_forest, std_X_train, y_train, scoring='accuracy', cv = 5).mean() * 100
print("Accuracy of Random Forest is: " , accuracy)

In [None]:
grid_values = {'n_estimators': [1, 10, 20, 100, 200], 'max_features': [2, 3]}

random_forest = RandomForestClassifier()

grid_search = GridSearchCV(random_forest, param_grid = grid_values, scoring = 'accuracy', cv = 3, return_train_score=True)

grid_search.fit(std_X_train, y_train)

grid_search.best_params_



In [None]:
#Randomized Search for random forest model
# NOTE(review): the search itself is commented out below, so `random_grid` is
# built but never used; only a default RandomForestClassifier is fitted.

# NOTE(review): import placed mid-notebook; it belongs with the other imports
# in the first cell.
from sklearn.model_selection import RandomizedSearchCV


# Number of trees in random forest
n_estimators = [1, 10, 20, 100, 200]
# Number of features to consider at every split
# NOTE(review): 'auto' was deprecated and later removed for this parameter in
# newer scikit-learn releases — verify against the installed version before
# re-enabling the search.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [80, 90, 100]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


# Use the random grid to search for best hyperparameters
# First create the base model to tune
#rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(std_X_train, y_train)

#rf_random.best_params_

# Baseline: random forest with default settings, scored on its own training
# data (accuracy, BER).
random_forest = RandomForestClassifier()

random_forest.fit(std_X_train,y_train)
y_train_pred = random_forest.predict(std_X_train)
performance_metrics(y_train, y_train_pred)

In [None]:
random_forest = RandomForestClassifier(max_features = 3, n_estimators = 200, max_depth = 50)

random_forest.fit(std_X_train,y_train)

# Number the features 1..n using the length of the fitted importances.
# The previous hard-coded range(1, 41) only works for exactly 40 columns and
# raises a length-mismatch error for any other feature matrix.
importances = random_forest.feature_importances_
feature_importances = pd.DataFrame(
    importances,
    index = range(1, len(importances) + 1),
    columns=['importance'],
).sort_values('importance', ascending=False)

feature_importances

In [None]:
feature_importances.index

In [None]:
# Random forest on the second feature set.
# NOTE(review): std_X_train2 is only created in a cell further down the
# notebook, so this cell fails on a fresh top-to-bottom run.
random_forest = RandomForestClassifier(max_features = 2, n_estimators = 200, max_depth = 40)

random_forest.fit(std_X_train2,y_train)

# (accuracy, BER) on the training rows.
y_train_pred = random_forest.predict(std_X_train2)
performance_metrics(y_train, y_train_pred)


In [None]:
y_train_pred = random_forest.predict(std_X_train)
performance_metrics(y_train, y_train_pred)

In [None]:
y_test_pred = random_forest.predict(std_X_test2)
performance_metrics(y_test, y_test_pred)

In [None]:
# Manual grid over three random-forest settings; every combination is fitted
# once and scored on the train and test matrices of the second feature set.
n_estimators = [20, 100, 200]
max_features = [2, 3, 5]
max_depth = [20, 40, 50]

# All (n_estimators, max_features, max_depth) combinations.
thresholds_criteria = list(itertools.product(n_estimators, max_features, max_depth))

# Parallel lists, one entry per combination, turned into a frame below.
train_accuracy = []
test_accuracy = []
n_est = [] 
max_f = []
max_d = []

for n, m, d in thresholds_criteria:
    rf = RandomForestClassifier(max_features = m, n_estimators = n, max_depth = d)
    rf.fit(std_X_train2,y_train)
    y_train_pred = rf.predict(std_X_train2)
    acc, _ = performance_metrics(y_train, y_train_pred)
    train_accuracy.append(acc)
    y_test_pred = rf.predict(std_X_test2)
    acc, _ = performance_metrics(y_test, y_test_pred)
    test_accuracy.append(acc)
    n_est.append(n)
    max_f.append(m)
    max_d.append(d)
    
df = {"N-estimators": n_est,
      "Max_features": max_f,
      "Max_Depth": max_d,
      "Train_accuracy": train_accuracy,
      "Test Accuracy": test_accuracy}


df = pd.DataFrame(df)

#printing dataframe for summary
# NOTE(review): the ranking is by *test* accuracy, so picking settings from
# this table tunes on the test set; a validation split or cross-validation
# would keep the test score unbiased.
df.sort_values(
    by=["Test Accuracy", "Train_accuracy"], ascending=[False, False]
)[:7]

In [None]:
#printing dataframe for summary



In [None]:
df

In [None]:
# 5-fold cross-validated accuracy (percent) of the linear SVM.
accuracy = cross_val_score(lin_SVM, std_X_train, y_train, scoring='accuracy', cv = 5).mean() * 100
# The message previously named Random Forest although the score belongs to
# lin_SVM.
print("Accuracy of Linear SVM is: " , accuracy)

In [None]:
#Linear SVM model tuning
perform_regression(2, std_X_train, y_train, std_X_test, y_test, hyperparameters)

# NOTE(review): the sweep above uses LinearSVC while the model fitted here is
# SVC(kernel="linear"); these are different implementations — confirm the
# choice.
lin_SVM = SVC(kernel="linear", C=1.0)
lin_SVM.fit(std_X_train, y_train)

y_train_pred = lin_SVM.predict(std_X_train)
# Print the training metrics: only the last expression of a cell is displayed
# automatically, so the bare call used here before showed nothing.
print(performance_metrics(y_train, y_train_pred))

y_test_pred = lin_SVM.predict(std_X_test)
performance_metrics(y_test, y_test_pred)

In [None]:
accuracy = cross_val_score(knn_classifier, std_X_train, y_train, scoring='accuracy', cv = 5).mean() * 100
print("Accuracy of KNN is: ", accuracy)

In [None]:
knn_classifier.fit(std_X_train, y_train)
knn_classifier.score(std_X_test, y_test)

In [None]:
y_test_pred = random_forest.predict(std_X_test)
performance_metrics(y_test, y_test_pred)

In [None]:
# Trivial baseline: predict the positive class (1) for every test row.
performance_metrics(y_test, [1] * len(y_test))

In [None]:
def features(data):
    """Build the feature vector for one order row, including category clusters.

    Second feature set: the one-hot product-category cluster columns from the
    module-level ``one_hot_product`` table, followed by the order-level
    features.

    Parameters
    ----------
    data : pandas.Series
        One row of the prepared frame. The timestamp columns must already be
        datetimes and the pivoted payment columns (``credit_card``,
        ``debit_card``, ``voucher``, ``boleto``) must be present.

    Returns
    -------
    list
        The category's cluster indicator columns, then
        ``[review_length, delivery_days, approving_days, processing_days,
        shipping_days, purchase_month, credit_pay, debit_pay, voucher_pay,
        boleto_pay, desc_length, photos, price, distance_miles,
        days_to_review, seller_popularity, freight_rate]``

    Raises
    ------
    IndexError
        If the row's ``product_category`` has no entry in ``one_hot_product``.

    Notes
    -----
    Reads the module-level ``one_hot_product`` and ``seller_popularities``.
    """
    def _zero_if_null(value):
        # Missing numeric fields count as 0.
        return 0 if pd.isnull(value) else value

    # Cluster indicators of this row's category (every column after the
    # category name).
    category_row = one_hot_product[one_hot_product['productcategory'] == data['product_category']]
    feat1 = category_row.iloc[0, 1:].tolist()

    review_text = data['review_comment_message']
    review_length = 0 if pd.isnull(review_text) else len(review_text)

    # Actual minus estimated delivery: negative when the parcel arrived early.
    delivery_time = data['order_delivered_customer_date'] - data['order_estimated_delivery_date']
    approving_time = data['order_approved_at'] - data['order_purchase_timestamp']
    processing_time = data['order_delivered_carrier_date'] - data['order_approved_at']
    # Shipping limit minus carrier hand-over: positive when handed over in time.
    shipping_time = data['shipping_limit_date'] - data['order_delivered_carrier_date']
    time_to_review = data['review_answer_timestamp'] - data['order_delivered_customer_date']
    purchase_month = data['order_purchase_timestamp'].month

    credit_pay = data['credit_card']
    debit_pay = data['debit_card']
    voucher_pay = data['voucher']
    boleto_wallet_pay = data['boleto']

    # The null-safe price used to be overwritten by the raw column value a few
    # lines later; only the null-safe value is kept now.
    price = _zero_if_null(data['product_price'])
    desc_length = _zero_if_null(data['product_description_lenght'])
    photos = _zero_if_null(data['product_photos_qty'])

    seller_coord = (data['seller_lat'], data['sellet_lng'])
    customer_coord = (data['customer_lat'], data['customer_lng'])
    distance = geodesic(seller_coord, customer_coord).miles

    # `value in series` tests the Series *index*, not its values, so the old
    # membership check never matched and popularity was always 0.
    seller_counts = seller_popularities.loc[
        seller_popularities["seller_id"] == data["seller_id"], "Count"
    ]
    popularity = seller_counts.iloc[0] if len(seller_counts) else 0

    freight_rate = data["freight_rate"]

    # feat1 was computed but never returned, which made this feature set
    # identical to the first one; it is now prepended. The duplicated
    # approving_time.days entry is also removed.
    return feat1 + [review_length, delivery_time.days, approving_time.days, processing_time.days,
                    shipping_time.days, purchase_month, credit_pay, debit_pay, voucher_pay,
                    boleto_wallet_pay, desc_length, photos, price, distance,
                    time_to_review.days, popularity, freight_rate]

features(dataset.iloc[1,])

In [None]:
# Feature matrices for the second feature set, built with whichever
# features() definition was executed most recently.
X_train2 = [features(row) for _, row in train.iterrows()]
X_test2 = [features(row) for _, row in test.iterrows()]

In [None]:
# Fresh scaler for the second feature set, fitted on its training matrix only.
# This rebinds `min_max_scaler`, replacing the scaler fitted earlier.
min_max_scaler = preprocessing.MinMaxScaler()
std_X_train2 = min_max_scaler.fit_transform(X_train2)
std_X_test2 = min_max_scaler.transform(X_test2)

In [None]:
# Linear Classification models

def perform_regression(val, X_train, y_train, X_test, y_test, hyperparameters):
    """Plot 3-fold cross-validated accuracy against a hyperparameter sweep.

    Parameters
    ----------
    val : int
        Model selector: 1 for ``LogisticRegression(C=c)``, 2 for
        ``LinearSVC(C=c)``, 3 for ``KNeighborsClassifier(n_neighbors=c)``.
    X_train, y_train : array-like
        Data passed to ``cross_val_score``.
    X_test, y_test : array-like
        Not used; kept so existing calls keep working.
    hyperparameters : sequence
        Values to sweep, in plotting order (``C`` for models 1 and 2,
        ``n_neighbors`` for model 3).

    Returns
    -------
    None
        The function only draws and shows the plot.

    Raises
    ------
    ValueError
        If ``val`` is not 1, 2 or 3. Unknown selectors previously fell back
        to logistic regression without any notice.
    """
    model_builders = {
        1: lambda c: LogisticRegression(C=c),
        2: lambda c: LinearSVC(C=c),
        3: lambda c: KNeighborsClassifier(n_neighbors=c),
    }
    if val not in model_builders:
        raise ValueError("val must be 1, 2 or 3, got %r" % (val,))
    build_model = model_builders[val]

    # Mean accuracy over the 3 validation folds, in percent, one per value.
    cv_accuracy = [
        cross_val_score(build_model(c), X_train, y_train, scoring='accuracy', cv=3).mean() * 100
        for c in hyperparameters
    ]

    plt.title('Accuracy vs Hyperparameter value')
    # The curve is held-out fold accuracy, so it is labelled as such rather
    # than "train".
    plt.plot(hyperparameters, cv_accuracy, "b-+", label="cross-validation")
    plt.xlabel("Hyperparameter value")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

hyperparameters = [ 0.001, 0.01, 0.1, 1, 10, 100]

In [None]:
perform_regression(1, std_X_train3, y_train, std_X_test3, y_test, hyperparameters)

In [None]:
log_reg = LogisticRegression(C = 100)
log_reg.fit(std_X_train3, y_train)

y_train_pred = log_reg.predict(std_X_train3)

print(performance_metrics(y_train, y_train_pred))
      
      
y_test_pred = log_reg.predict(std_X_test3)
performance_metrics(y_test, y_test_pred)

In [None]:
perform_regression(2, std_X_train2, y_train, std_X_test2, y_test, hyperparameters)

In [None]:
lin_SVM = LinearSVC(C=100)
lin_SVM.fit(std_X_train2, y_train)

y_train_pred = lin_SVM.predict(std_X_train2)

print(performance_metrics(y_train, y_train_pred))
      
      
y_test_pred = lin_SVM.predict(std_X_test2)
performance_metrics(y_test, y_test_pred)

In [None]:
hyperparameters = [2, 3, 5, 7, 10]
perform_regression(3, std_X_train, y_train, std_X_test, y_test, hyperparameters)

In [None]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(std_X_train, y_train)

y_train_pred = knn_classifier.predict(std_X_train)

print(performance_metrics(y_train, y_train_pred))
      


In [None]:
y_test_pred = knn_classifier.predict(std_X_test)
performance_metrics(y_test, y_test_pred)

In [None]:
 def features(data):
     """Build the reduced (third) feature vector for one order row.

     Parameters
     ----------
     data : pandas.Series
         One row of the prepared frame.

     Returns
     -------
     list
         ``[review_length, desc_length, photos, price, seller_popularity,
         freight_rate]``

     Notes
     -----
     Reads the module-level ``seller_popularities`` frame. The time deltas,
     payment columns, distance and category clusters that used to be
     computed here were never part of the returned vector, so they are no
     longer evaluated.
     NOTE(review): confirm that the category cluster columns were meant to
     stay out of this feature set.
     """
     def _zero_if_null(value):
         # Missing numeric fields count as 0.
         return 0 if pd.isnull(value) else value

     review_text = data['review_comment_message']
     review_length = 0 if pd.isnull(review_text) else len(review_text)

     # The null-safe price used to be overwritten by the raw column value a
     # few lines later; only the null-safe value is kept now.
     price = _zero_if_null(data['product_price'])
     desc_length = _zero_if_null(data['product_description_lenght'])
     photos = _zero_if_null(data['product_photos_qty'])

     # `value in series` tests the Series *index*, not its values, so the old
     # membership check never matched and popularity was always 0.
     seller_counts = seller_popularities.loc[
         seller_popularities["seller_id"] == data["seller_id"], "Count"
     ]
     popularity = seller_counts.iloc[0] if len(seller_counts) else 0

     freight_rate = data["freight_rate"]

     return [review_length, desc_length,
             photos, price, popularity, freight_rate]

features(dataset.iloc[1,])

In [None]:
# Feature matrices for the third (reduced) feature set.
X_train3 = [features(row) for _, row in train.iterrows()]
X_test3 = [features(row) for _, row in test.iterrows()]

In [None]:
# Fresh scaler for the third feature set, fitted on its training matrix only.
# This rebinds `min_max_scaler` once more.
min_max_scaler = preprocessing.MinMaxScaler()
std_X_train3 = min_max_scaler.fit_transform(X_train3)
std_X_test3 = min_max_scaler.transform(X_test3)

In [24]:
# RBF-kernel SVM on the first feature set.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `rbf_svc` was never fitted in the saved state; consider training on a
# subsample if the fit is too slow on the full training set.
rbf_svc = SVC(kernel='rbf', gamma=0.7, C=100)

rbf_svc.fit(std_X_train, y_train)

KeyboardInterrupt: 

In [None]:
y_train_pred = rbf_svc.predict(std_X_train)

print(performance_metrics(y_train, y_train_pred))

y_test_pred = rbf_svc.predict(std_X_test)
performance_metrics(y_test, y_test_pred)