In [202]:
# Operating System
import os

#Pandas and Numpy
import pandas as pd
import numpy as np

# random generator
import random

# detect file encoding
import chardet

from sqlalchemy import create_engine

# HTTP requests
import requests

from sklearn.model_selection import train_test_split

from sklearn.pipeline import  Pipeline, make_pipeline, FeatureUnion, make_union
from sklearn.base import TransformerMixin, BaseEstimator

import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer, Binarizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.impute import SimpleImputer

from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, classification_report
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import confusion_matrix


#Plots
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
from scipy.stats import binom_test

# Strings
import re

# Import submission set

In [2]:
# Build df_submission from every .xlsx workbook found in the test directory.
# Each workbook is read from two sheets ('sheet_1' and 'sheet_2') that must share
# the same index; the sheets are merged column-wise and the per-file frames are
# stacked row-wise. The result is then compared with the ids of
# data/sample_submission.csv.
dir_path = os.path.join('data', 'transformed_data', 'data_test', 'test')
#print(dir_path)

df_submission = None
df_aux = None
file_counter = 0
merged_files = []  # one merged frame per workbook, concatenated once after the loop

for filename in os.listdir(dir_path):

    # Ignore anything that is not an Excel workbook. Without this guard a stray
    # file would reach the merge step with no (or a stale) merged frame.
    if not filename.endswith(".xlsx"):
        continue

    xlsx_path = os.path.join(dir_path, filename)
    #print(xlsx_path)

    # Read two sheets in each file
    df_list = []
    try:
        for sheet in ['sheet_1', 'sheet_2']:
            df_list.append(pd.read_excel(xlsx_path, sheet_name=sheet, index_col=0).sort_index())
    except Exception as exc:
        # Stop here, like the validation failures below, instead of carrying on
        # with an incomplete df_list (which would fail on df_list[1]).
        print(xlsx_path)
        print("{} reading error!".format(sheet))
        print(repr(exc))
        break

    # Confirm both sheets had the same elements
    if not df_list[0].index.equals(df_list[1].index):
        print("df_list[0].index doesn't match df_list[1].index")
        break

    # Merge sheets
    df_aux_merged = pd.concat(df_list, axis='columns', sort=False)
    file_counter += 1

    # Confirm the file has the same columns as the first file that was read
    if merged_files and not merged_files[0].columns.equals(df_aux_merged.columns):
        print(xlsx_path)
        print("File columns doesn't match with existing df_submission.")
        break

    merged_files.append(df_aux_merged)

if not merged_files:
    raise ValueError("No .xlsx file could be read from '{}'".format(dir_path))

# A single concat replaces the repeated DataFrame.append calls (removed in pandas 2.0)
df_submission = pd.concat(merged_files).sort_index()

# Confirm that it is equal to the sample submission ids
sample_submission_path = os.path.join('data', 'sample_submission.csv')
try:
    df_aux = pd.read_csv(sample_submission_path, index_col=0).sort_index()
except Exception as exc:
    # Keep df_aux defined so the comparison below can be skipped cleanly
    df_aux = None
    print("Unable to read '{}' file".format(sample_submission_path))
    print(repr(exc))

if df_aux is not None and not df_submission.index.equals(df_aux.index):
    print("ERROR!!! df_submission ids are not equal to 'sample_submission.csv' ids")

print("{} files read".format(file_counter))

100 files read


In [3]:
# Summarise df_submission: index range, per-column non-null counts and dtypes.
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93310 entries, 00007c55a9a7591b98a76d79216c9112 to ffff9979c9699b51cb7cda98e5bf84c2
Data columns (total 30 columns):
orderportalid             93310 non-null int64
orderdate_gmt             93310 non-null object
designer                  93310 non-null int64
style                     93310 non-null int64
shipper                   93310 non-null int64
shiptypeid                93310 non-null int64
userid                    93202 non-null float64
isvip                     93202 non-null object
country                   93310 non-null int64
region                    93310 non-null int64
ddprate                   93310 non-null float64
countrycode               93310 non-null int64
hasusedwishlist           93202 non-null object
isreseller                93202 non-null object
hasitemsonbag             93202 non-null object
tierafterorder            85809 non-null object
tierbeforeorder           57397 non-null object
isusingmultipledevices    93

In [17]:
# Inspect the type of `df_submission.dtypes` (the recorded output shows a pandas Series).
type(df_submission.dtypes)

pandas.core.series.Series

# Querying SQL database

In [201]:
# Connection settings used to build the f-string database URL further down.
host = 'data-wrangling-batch3.cl9uj9cucww7.eu-west-1.rds.amazonaws.com'
port = 5432
database = 'datawrangling'
user = 'ldsa_student'
# The password must not live in the notebook: it is read from the
# LDSA_DB_PASSWORD environment variable instead of being hard-coded.
password = os.environ.get('LDSA_DB_PASSWORD', '')
if not password:
    print("Environment variable LDSA_DB_PASSWORD is not set; the database password is empty.")

In [205]:
# Db settings - PostgreSQL
username = 'ldsa_student'
password = 'JGIYc6jD'  # the password is not XXX by the way
host_name = 'data-wrangling-batch3.cl9uj9cucww7.eu-west-1.rds.amazonaws.com'
port = 5432
db_name = 'datawrangling'
schema = 'hackathon_students'

conn_str = 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host_name, port, db_name)
conn_args = {'options': '-csearch_path={}'.format(schema)}

In [209]:
# Create the SQLAlchemy engine from conn_str; conn_args carries the
# '-csearch_path=<schema>' option as a connect argument.
engine = create_engine(conn_str, connect_args=conn_args)

In [210]:
# Load every row and column of hackathon_students.targets into a DataFrame.
query = 'SELECT * FROM hackathon_students.targets;'
targets = pd.read_sql_query(query, engine)

In [212]:
# Summarise the targets frame (the recorded run shows the columns `id` and `returned`).
targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543341 entries, 0 to 543340
Data columns (total 2 columns):
id          543341 non-null object
returned    543341 non-null int64
dtypes: int64(1), object(1)
memory usage: 8.3+ MB


In [203]:
# Build the database URL from the settings defined above and create the engine.
# The scheme must be "postgresql://": SQLAlchemy 1.4+ no longer recognises the
# "postgres://" alias, and this matches the URL used for `conn_str`.
db_string = f"postgresql://{user}:{password}@{host}:{port}/{database}"
db = create_engine(db_string)

In [204]:
# Open a connection from the `db` engine.
# NOTE(review): the recorded run of this cell ended in a connection-timeout
# OperationalError, and `conn` is not used in the visible part of the notebook.
conn = db.connect()

OperationalError: (psycopg2.OperationalError) could not connect to server: Connection timed out (0x0000274C/10060)
	Is the server running on host "data-wrangling-batch3.cl9uj9cucww7.eu-west-1.rds.amazonaws.com" (18.200.103.249) and accepting
	TCP/IP connections on port 5432?
 (Background on this error at: http://sqlalche.me/e/e3q8)

# Import data source 1

In [46]:
# Guess the text encoding of datasource1.csv from a sample of its leading bytes.
# NOTE(review): the recorded run reports 'windows-1251' with ~0.57 confidence.
file_path = os.path.join('data', 'datasource1.csv')
read_nbytes = 100000

with open(file_path, mode='rb') as f:
    raw_sample = f.read(read_nbytes)

encoding_dict = chardet.detect(raw_sample)
encoding_dict

{'encoding': 'windows-1251',
 'confidence': 0.5730969056434915,
 'language': 'Bulgarian'}

### Read the file

In [88]:
# Read datasource1 with the encoding detected by chardet, using the first column
# as index. `file_encod` is resolved here so the cell does not depend on a name
# that is only assigned in a later cell (which breaks Restart & Run All).
file_encod = encoding_dict['encoding']
df_aux = pd.read_csv(os.path.join('data', 'datasource1.csv'), encoding=file_encod, index_col=0)

In [89]:
# Summarise the raw datasource1 frame (the recorded run shows every column as object dtype).
df_aux.info()

<class 'pandas.core.frame.DataFrame'>
Index: 543341 entries, cfcd208495d565ef66e7dff9f98764da to a25519337908a5d92a979aae8cff43cb
Data columns (total 6 columns):
tierafterorder          511877 non-null object
orderportalid           543341 non-null object
size                    543341 non-null object
orderdate_gmt           543341 non-null object
hasusedwishlist         542659 non-null object
ldsa_team_wishes_you    543341 non-null object
dtypes: object(6)
memory usage: 29.0+ MB


In [90]:
# NOTE(review): file_encod is already consumed by the read_csv cell further up.
file_encod = encoding_dict['encoding']

# Reference dtypes, and the columns datasource1 shares with the submission set
df_submission_dtypes = df_submission.dtypes
cols_to_use = [col for col in df_aux.columns if col in df_submission.columns]
cols_to_convert = [
    col
    for col in df_submission.select_dtypes(include=np.number).columns
    if col in cols_to_use
]

# Work on a copy restricted to the columns that exist in df_submission
df_source1 = df_aux[cols_to_use].copy()

# Parse the numeric columns; entries that cannot be parsed become NaN
df_source1[cols_to_convert] = df_source1[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Discard rows with a missing value in any column, then force the dtypes to be
# exactly those of df_submission
df_source1 = df_source1.dropna().astype(df_submission_dtypes[cols_to_use])

# Optional check: both prints should list the same dtypes
# print(df_source1.dtypes)
# print(df_submission[cols_to_use].dtypes)



tierafterorder     object
orderportalid       int64
size                int64
orderdate_gmt      object
hasusedwishlist    object
dtype: object
tierafterorder     object
orderportalid       int64
size                int64
orderdate_gmt      object
hasusedwishlist    object
dtype: object


# Import API data

In [104]:
# Base URL of the API; the GET on the root is only a reachability check.
API_url = 'https://y29rdnycjd.execute-api.eu-west-1.amazonaws.com/dev/'
# requests has no default timeout, so an unresponsive server would block the
# notebook indefinitely; fail after 30 seconds instead.
request = requests.get(API_url, timeout=30)

In [106]:
# Check the `ok` flag of the response from the API root (recorded run: True).
request.ok

True

In [107]:
# URL template of the `missingdata` endpoint; the `{id}` placeholder is filled
# in later with str.format(id=...).
endpoint_url = API_url + 'missingdata/{id}'
endpoint_url

'https://y29rdnycjd.execute-api.eu-west-1.amazonaws.com/dev/missingdata/{id}'

### Performing a sample request

In [165]:
# A fixed id keeps the sample request reproducible. A random draw from
# df_source1 placed before this assignment would simply be discarded by it.
sample_id = '19ca14e7ea6328a42e0eb13d585e4c22'

In [166]:
# Request the record of the sample id. requests has no default timeout, so an
# unresponsive server would block the notebook; fail after 30 seconds instead.
request = requests.get(endpoint_url.format(id=sample_id), timeout=30)

In [167]:
# Show the URL of the request that produced this response.
request.request.url

'https://y29rdnycjd.execute-api.eu-west-1.amazonaws.com/dev/missingdata/19ca14e7ea6328a42e0eb13d585e4c22'

In [168]:
# Check the `ok` flag of the sample response (recorded run: True).
request.ok

True

In [172]:
# Decode the JSON body of the sample response; the recorded run returns one
# record as a dict, with `returned` set to None.
request.json()

{'orderportalid': 382388,
 'orderdate_gmt': '2018-01-01 00:15:06.020000+00:00',
 'designer': 4295,
 'style': 4299,
 'shipper': 2,
 'shiptypeid': 2,
 'userid': 257187.0,
 'isvip': 'Not VIP',
 'country': 1,
 'region': 1,
 'ddprate': 5.0083,
 'countrycode': 1,
 'hasusedwishlist': 'Yes',
 'isreseller': 'No',
 'hasitemsonbag': 'No',
 'tierafterorder': None,
 'tierbeforeorder': None,
 'isusingmultipledevices': 'Yes',
 'userfraudstatus': 3,
 'promocode': 1,
 'freereturn': 1,
 'issale': 'Yes',
 'productid': 4450,
 'brand': 337,
 'ddpsubcategory': 'Footwear with outer soles of rubber or plastics',
 'storeid': 5,
 'countryoforigin': 1,
 'size': 12,
 'category_1stlevel': 'Shoes',
 'platform': 'web',
 'returned': None}

In [197]:
# NOTE(review): this cell is disabled. It requests every id whose tierafterorder
# is 'API' from the missingdata endpoint, keeps the JSON body of each ok reply
# (other replies are skipped silently) and builds `df_train` from them.
# Later cells read `replies_list` and `df_train`; in the visible part of the
# notebook they are assigned only here, so those cells fail on a fresh kernel
# while this code stays commented out.
# ids_to_request_series = df_aux[df_aux.tierafterorder == 'API'].reset_index().id
# replies_list = []

# for id_to_request in ids_to_request_series:
    
#     request = requests.get(endpoint_url.format(id=id_to_request))
#     if request.ok:
#         replies_list.append(request.json())
#     else:
#         pass
    
# df_train = pd.DataFrame(replies_list)

In [200]:
# Number of distinct values in the `returned` column of df_train (recorded run: 0).
# NOTE(review): df_train is only assigned in the commented-out cell above.
df_train.returned.nunique()

0

In [193]:
# First 20 ids of the datasource1 rows whose tierafterorder is marked 'API'
is_api_row = df_aux['tierafterorder'] == 'API'
df_aux.loc[is_api_row].reset_index()['id'].iloc[:20]

pandas.core.series.Series

In [189]:
# Inspect the first collected API reply.
# NOTE(review): replies_list is only assigned in the commented-out cell above.
replies_list[0]

{'orderportalid': 382388,
 'orderdate_gmt': '2018-01-01 00:15:06.020000+00:00',
 'designer': 4295,
 'style': 4299,
 'shipper': 2,
 'shiptypeid': 2,
 'userid': 257187.0,
 'isvip': 'Not VIP',
 'country': 1,
 'region': 1,
 'ddprate': 5.0083,
 'countrycode': 1,
 'hasusedwishlist': 'Yes',
 'isreseller': 'No',
 'hasitemsonbag': 'No',
 'tierafterorder': None,
 'tierbeforeorder': None,
 'isusingmultipledevices': 'Yes',
 'userfraudstatus': 3,
 'promocode': 1,
 'freereturn': 1,
 'issale': 'Yes',
 'productid': 4450,
 'brand': 337,
 'ddpsubcategory': 'Footwear with outer soles of rubber or plastics',
 'storeid': 5,
 'countryoforigin': 1,
 'size': 12,
 'category_1stlevel': 'Shoes',
 'platform': 'web',
 'returned': None}

In [188]:
# Tabulate the collected API replies, one row per reply (the recorded run shows 10 rows).
pd.DataFrame(replies_list)

Unnamed: 0,brand,category_1stlevel,country,countrycode,countryoforigin,ddprate,ddpsubcategory,designer,freereturn,hasitemsonbag,...,returned,shipper,shiptypeid,size,storeid,style,tierafterorder,tierbeforeorder,userfraudstatus,userid
0,337,Shoes,1,1,1,5.0083,Footwear with outer soles of rubber or plastics,4295,1,No,...,,2,2,12,5,4299,,,3,257187.0
1,681,Clothing,1,1,8,0.0,Skirts,79959,1,No,...,,1,1,395,764,79978,,,3,270119.0
2,9,Bags,3,3,1,0.0,"Handbags, whether or not with shoulder strap, ...",13129,1,Yes,...,,2,2,17,174,13136,,,3,264764.0
3,92,Clothing,4,4,1,0.0,"Jerseys, pullovers, cardigans, waistcoats and ...",268,1,Yes,...,,2,9,35,156,268,VIP,VIP,6,195.0
4,1617,Teen Girl Clothing,1,1,1,5.0083,Dresses,148401,1,No,...,,4,2,10,163,148434,,,3,254415.0
5,50,Homeware,4,4,2,0.0,Articles of a kind normally carried in the poc...,428,1,No,...,,1,3,17,39,429,T4,,3,287.0
6,212,Shoes,1,1,1,5.0083,Other footwear,489,1,No,...,,2,2,12,32,491,T4,,3,333.0
7,67,Clothing,12,12,1,33.3833,"T-shirts, singlets and other vests",494,1,No,...,,2,2,14,11,496,T3,,3,334.0
8,216,Clothing,39,39,1,0.0,"Coats: overcoats, raincoat, cape, cloaks and s...",515,1,Yes,...,,1,3,7,106,517,VIP,VIP,6,342.0
9,141,Shoes,1,1,1,5.0083,Other footwear,150642,1,Yes,...,,2,2,12,598,150675,T4,T4,3,258716.0


In [175]:
# Peek at the first ten index values of df_aux.
df_aux.index.values[0:10]

array(['cfcd208495d565ef66e7dff9f98764da',
       'c4ca4238a0b923820dcc509a6f75849b',
       'c81e728d9d4c2f636f067f89cc14862c',
       'eccbc87e4b5ce2fe28308fd9f2a7baf3',
       'a87ff679a2f3e71d9181a67b7542122c',
       'e4da3b7fbbce2345d7772b0674a318d5',
       '1679091c5a880faf6fb5e6087eb1b2dc',
       '8f14e45fceea167a5a36dedd4bea2543',
       'c9f0f895fb98ab9159f51fd0297e236d',
       '45c48cce2e2d7fbdea1afc51c7c6ad26'], dtype=object)

In [153]:
# Frequency of each tierafterorder value in the cleaned frame df_source1.
df_source1.tierafterorder.value_counts()

T4     212069
T2      75849
T3      70034
VIP     65282
T1      38064
Name: tierafterorder, dtype: int64

In [158]:
# Frequency of each tierafterorder value in the raw frame df_aux; the recorded
# output also lists the values 'WEBSITE' and 'API', which are absent from df_source1.
df_aux.tierafterorder.value_counts()

T4         212474
T2          75885
T3          70078
VIP         65361
WEBSITE     40000
T1          38079
API         10000
Name: tierafterorder, dtype: int64

In [181]:
# Inspect the type of the index values of the 'API' rows (recorded run: numpy.ndarray).
type(df_aux[df_aux.tierafterorder == 'API'].index.values)

numpy.ndarray