In [7]:
# Operating System
import os

#Pandas and Numpy
import pandas as pd
import numpy as np

#ramdom generator
import random

import chardet

from sklearn.model_selection import train_test_split

from sklearn.pipeline import  Pipeline, make_pipeline, FeatureUnion, make_union
from sklearn.base import TransformerMixin, BaseEstimator

import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer, Binarizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.impute import SimpleImputer

from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, classification_report
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import confusion_matrix


#Plots
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
from scipy.stats import binom_test

# Strings
import re

# Import submission set

In [2]:
dir_path = os.path.join('data', 'transformed_data', 'data_test', 'test')
#print(dir_path)

df_submission = None
df_aux = None
file_counter = 0

for filename in os.listdir(dir_path):
    
    if filename.endswith(".xlsx"): 
        #print(os.path.join(dir_path, filename))
                
        df_list = []
        # Read two sheets in each file
        for sheet in ['sheet_1', 'sheet_2']:            
            try:
                df_aux = pd.read_excel(os.path.join(dir_path, filename), sheet_name=sheet, index_col=0).sort_index()
            except:
                print(os.path.join(dir_path, filename))
                print("{} reading error!".format(sheet))
                break
            
            df_list.append(df_aux.copy())
            del df_aux
            
        # Confirm both sheets had the same elements
        if df_list[0].index.equals(df_list[1].index) == False:
            print("df_list[0].index doesn't match df_list[1].index")
            break
        
        # Merge sheets
        df_aux_merged = pd.concat(df_list, axis='columns', sort=False)
        file_counter += 1
        del df_list
       
    
    
    if df_submission is None:        
        # Se for o 1º ficheiro a ser lido cria a dataframe        
        df_submission = df_aux_merged.copy()
        del df_aux_merged        
    else:
        
        # Confirmar se tem as mesmas colunas
        if df_submission.columns.equals(df_aux_merged.columns):
            df_submission = df_submission.append(df_aux_merged)
            del df_aux_merged
            
        else:
            print(os.path.join(dir_path, filename))
            print("File columns doesn't match with existing df_submission.")
            break


df_submission = df_submission.sort_index()
# Confirm that it is equal to the sample submission ids
try: 
    df_aux = pd.read_csv(os.path.join('data', 'sample_submission.csv'), index_col=0).sort_index()
except:
    print("Unable to read '{}' file".format(os.path.join('data', 'sample_submission.csv')))
                         
if df_submission.index.equals(df_aux.index) == False:
    print("ERROR!!! df_submission ids are not equal to 'sample_submission.csv' ids")
                        
print("{} files read".format(file_counter))

100 files read


In [3]:
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93310 entries, 00007c55a9a7591b98a76d79216c9112 to ffff9979c9699b51cb7cda98e5bf84c2
Data columns (total 30 columns):
orderportalid             93310 non-null int64
orderdate_gmt             93310 non-null object
designer                  93310 non-null int64
style                     93310 non-null int64
shipper                   93310 non-null int64
shiptypeid                93310 non-null int64
userid                    93202 non-null float64
isvip                     93202 non-null object
country                   93310 non-null int64
region                    93310 non-null int64
ddprate                   93310 non-null float64
countrycode               93310 non-null int64
hasusedwishlist           93202 non-null object
isreseller                93202 non-null object
hasitemsonbag             93202 non-null object
tierafterorder            85809 non-null object
tierbeforeorder           57397 non-null object
isusingmultipledevices    93

In [17]:
type(df_submission.dtypes)

pandas.core.series.Series

# Import data source 1

In [46]:
file_path = os.path.join('data', 'datasource1.csv')

with open(file_path, 'rb') as f:
    read_nbytes = 100000
    encoding_dict = chardet.detect(f.read(read_nbytes))

encoding_dict

{'encoding': 'windows-1251',
 'confidence': 0.5730969056434915,
 'language': 'Bulgarian'}

### Read the file

In [88]:
df_aux = pd.read_csv(os.path.join('data', 'datasource1.csv'), encoding=file_encod, index_col=0)

In [99]:
df_aux.iloc[:,5].head()

id
cfcd208495d565ef66e7dff9f98764da    Срећно! (Good luck!)
c4ca4238a0b923820dcc509a6f75849b    Срећно! (Good luck!)
c81e728d9d4c2f636f067f89cc14862c    Срећно! (Good luck!)
eccbc87e4b5ce2fe28308fd9f2a7baf3    Срећно! (Good luck!)
a87ff679a2f3e71d9181a67b7542122c    Срећно! (Good luck!)
Name: ldsa_team_wishes_you, dtype: object

In [89]:
df_aux.info()

<class 'pandas.core.frame.DataFrame'>
Index: 543341 entries, cfcd208495d565ef66e7dff9f98764da to a25519337908a5d92a979aae8cff43cb
Data columns (total 6 columns):
tierafterorder          511877 non-null object
orderportalid           543341 non-null object
size                    543341 non-null object
orderdate_gmt           543341 non-null object
hasusedwishlist         542659 non-null object
ldsa_team_wishes_you    543341 non-null object
dtypes: object(6)
memory usage: 29.0+ MB


In [90]:
file_encod = encoding_dict['encoding']

cols_to_use = [col for col in df_aux.columns if col in df_submission.columns]
cols_to_convert = [col for col in df_submission.select_dtypes(include=np.number).columns if col in cols_to_use]
df_submission_dtypes = df_submission.dtypes


df_source1 = df_aux.copy()

# keep only those columns that exist in df_submission
df_source1 = df_source1[cols_to_use]

# Convert numerical columns to numeric dtype
df_source1[cols_to_convert] = df_source1[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Drop observation if having na values in any columns
df_source1 = df_source1.dropna()

# Force types to be exactly the same as in df_submission_dtypes
df_source1 = df_source1.astype(df_submission_dtypes[cols_to_use])

# print(df_source1.dtypes)
# print(df_submission[cols_to_use].dtypes)



tierafterorder     object
orderportalid       int64
size                int64
orderdate_gmt      object
hasusedwishlist    object
dtype: object
tierafterorder     object
orderportalid       int64
size                int64
orderdate_gmt      object
hasusedwishlist    object
dtype: object


In [92]:
df_source1.head()

Unnamed: 0_level_0,tierafterorder,orderportalid,size,orderdate_gmt,hasusedwishlist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cfcd208495d565ef66e7dff9f98764da,T4,1,1,2018-01-01 00:00:19.733000+00:00,Yes
c4ca4238a0b923820dcc509a6f75849b,T3,2,2,2018-01-01 00:00:42.540000+00:00,Yes
c81e728d9d4c2f636f067f89cc14862c,T3,3,3,2018-01-01 00:01:15.893000+00:00,No
eccbc87e4b5ce2fe28308fd9f2a7baf3,T3,3,4,2018-01-01 00:01:15.893000+00:00,No
a87ff679a2f3e71d9181a67b7542122c,T4,4,5,2018-01-01 00:01:51.450000+00:00,No


In [100]:
df_source1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 461298 entries, cfcd208495d565ef66e7dff9f98764da to a25519337908a5d92a979aae8cff43cb
Data columns (total 5 columns):
tierafterorder     461298 non-null object
orderportalid      461298 non-null int64
size               461298 non-null int64
orderdate_gmt      461298 non-null object
hasusedwishlist    461298 non-null object
dtypes: int64(2), object(3)
memory usage: 21.1+ MB
