In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import cross_validate, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import chi2_contingency, pointbiserialr
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## 1. Importing data

In [3]:
def import_csv(filename, path = './data'):
    return pd.read_csv(os.path.join(path, filename))

In [4]:
train_activity = import_csv('act_train.csv')
people = import_csv('people.csv')

- Each row in the activity file represents a **unique activity** performed by a person on a certain date. Each activity has a unique activity_id.
- The activity file contains several different categories of activities. 
- **Type 1** activities are different from type 2-7 activities because there are more known chars associated with type 1 activities (**nine in total**) than type 2-7 activities (which have only one associated characteristic).
- The two files can be joined together using **person_id** as the common key. 
- All variables are categorical, with the exception of 'char_38' in the people file, which is a continuous numerical variable.

## 2. Data cleaning

In [5]:
def _process_date(dataframe, date_col):
    """
    A private function which preprocesses datetime information.
    Input:
    - dataframe
    - name of the date column (object)
    
    Output:
    - dataframe with new date columns: month, year, weekend_flg
    """
    df = dataframe.copy()
    df['date'] = pd.to_datetime(df[date_col])
    df['month']=df['date'].dt.month
    df['year']=df['date'].dt.year
    df['weekend_flg'] = (df['date'].dt.weekday >= 5).astype(int)
    df.drop(['date'], inplace=True, axis=1)
    return df

### 2.1. `People` dataframe

In [6]:
def clean_people(original_df):
    """
    The aim of this function is to prepare `people` df by unifying types of data.
    The function takes in a dataframe (specifically `people`) and returns a copy of the given dataframe, but with converted data types (all ints).
    """
    df = _process_date(original_df, 'date')
    
    for col in list(df.select_dtypes(include='object').columns):
        if col.startswith("char_") or col.startswith("group_"):
            try:
                df[col] = (df[col].apply(lambda x: x.split(" ")[1]).astype("float64")).astype('int64')
                print(f"{col} converted to int")
            except AttributeError:
                print(f"Can't convert {col} to int")

        elif col.startswith("people_"):
            try:
                df[col] = (df[col].apply(lambda x: x.split("_")[1]).astype("float64")).astype('int64')
                print(f'{col} converted to int')
            except AttributeError:
                print(f"Can't convert {col} to int")
                
    for col in list(df.select_dtypes(include=['bool', 'float64']).columns):
        try:
            df[col] = df[col].astype("int64")
            print(f"{col} converted to int")
        except AttributeError:
            print(f"Can't convert {col} to int")
    return df

In [7]:
people_df = clean_people(people)

people_id converted to int
char_1 converted to int
group_1 converted to int
char_2 converted to int
char_3 converted to int
char_4 converted to int
char_5 converted to int
char_6 converted to int
char_7 converted to int
char_8 converted to int
char_9 converted to int
char_10 converted to int
char_11 converted to int
char_12 converted to int
char_13 converted to int
char_14 converted to int
char_15 converted to int
char_16 converted to int
char_17 converted to int
char_18 converted to int
char_19 converted to int
char_20 converted to int
char_21 converted to int
char_22 converted to int
char_23 converted to int
char_24 converted to int
char_25 converted to int
char_26 converted to int
char_27 converted to int
char_28 converted to int
char_29 converted to int
char_30 converted to int
char_31 converted to int
char_32 converted to int
char_33 converted to int
char_34 converted to int
char_35 converted to int
char_36 converted to int
char_37 converted to int


In [8]:
del people

### 2.2. `Activity` dataframe

In [9]:
def clean_activity(original_df):
    """
    The aim of this function is to prepare `activity` df by unifying types of data.
    The function takes in a dataframe (specifically `activity`) and returns this dataframe, but with converted data types.
    """ 

    df = _process_date(original_df, 'date')
                               
    for col in list(df.select_dtypes(include='object').columns):
        if col.endswith("_id"):
            if col.startswith("activity"):
                try:
                    df[f"{col}_prefix"] = (df[col].apply(lambda x: x.split("_")[0][-1]).astype("float64")).astype("int64")
                    print(f"{col}_prefix created")
                except AttributeError:
                    print(f"Can't create {col}_prefix")
                try:
                    df[col] = (df[col].apply(lambda x: x.split("_")[1]).astype("float64")).astype("int64")
                    print(f"{col} converted to int")
                except AttributeError:
                    print(f"Can't convert {col} to int")              
            elif col.startswith("people"):
                try:
                    df[col] = (df[col].apply(lambda x: x.split("_")[1]).astype("float64")).astype("int64")
                    print(f"{col} converted to int")
                except AttributeError:
                    print(f"Can't convert {col} to int")
        else:
            df[col]= df[col].fillna('type -1')
            try:
                df[col] = (df[col].apply(lambda x: x.split(" ")[1]).astype("float64")).astype('int64')
                print(f"{col} converted to int")
            except AttributeError:
                print(f"Can't convert {col} to int")
                
    for col in list(df.select_dtypes(include=['bool', 'float64']).columns):
        try:
            df[col] = df[col].astype("int64")
            print(f"{col} converted to int")
        except AttributeError:
            print(f"Can't convert {col} to int")
    df.loc[:,'activity_index'] = df[['activity_id_prefix', 'activity_id']].apply(tuple, axis=1)
    return df

In [10]:
train_activity_df= clean_activity(train_activity)

people_id converted to int
activity_id_prefix created
activity_id converted to int
activity_category converted to int
char_1 converted to int
char_2 converted to int
char_3 converted to int
char_4 converted to int
char_5 converted to int
char_6 converted to int
char_7 converted to int
char_8 converted to int
char_9 converted to int
char_10 converted to int


In [11]:
del train_activity

### 2.3. Merging data 

Data is being merged at this point to enable a different level of data exploration.

In [12]:
# new pandas 1.0 feature - convert_dtypes(), to handle missing values
red_hat = pd.merge(people_df, train_activity_df, how = 'left', on = 'people_id', suffixes = ('_pep', '_act')).convert_dtypes()

Each row represents an acitivity of a specific person (merged by `people_id`).

In [13]:
#deleting records where outcome or activity_id are NaNs, because they are useless in case of modeling
red_hat = red_hat[(pd.isna(red_hat['activity_id'])== False) & (pd.isna(red_hat['outcome'])== False)]

In [14]:
train_set, test_set = train_test_split(red_hat, test_size = 0.22, random_state = 42, stratify = red_hat['outcome'])

In [15]:
train_set = train_set.set_index('activity_index')
test_set = test_set.set_index('activity_index')

The unique combination of activity features is set as index to enable further identification of predicted outcomes for specific activities.

In [16]:
print(train_set.shape)
print(test_set.shape)

(1713886, 60)
(483405, 60)


In [17]:
del people_df
del train_activity_df

## 3. Data preprocessing

### 3.1. Correlation between features and target

Correlation between categorical variables will be tested using a number of statistical measures, including **chi-square** and **Cramer's V**. 

In [19]:
index_cols = ['activity_id_prefix', 'activity_id']
cat_cols = ['char_10_act' , 'group_1']
target = ['outcome']
continuous_cols = ['char_38']

In [20]:
#categorical variables to check correlation between them and target
cols_corr = [x for x in list(train_set.columns) if x not in (target+continuous_cols+index_cols)]

In [21]:
cols_corr

['people_id',
 'char_1_pep',
 'group_1',
 'char_2_pep',
 'char_3_pep',
 'char_4_pep',
 'char_5_pep',
 'char_6_pep',
 'char_7_pep',
 'char_8_pep',
 'char_9_pep',
 'char_10_pep',
 'char_11',
 'char_12',
 'char_13',
 'char_14',
 'char_15',
 'char_16',
 'char_17',
 'char_18',
 'char_19',
 'char_20',
 'char_21',
 'char_22',
 'char_23',
 'char_24',
 'char_25',
 'char_26',
 'char_27',
 'char_28',
 'char_29',
 'char_30',
 'char_31',
 'char_32',
 'char_33',
 'char_34',
 'char_35',
 'char_36',
 'char_37',
 'month_pep',
 'year_pep',
 'weekend_flg_pep',
 'activity_category',
 'char_1_act',
 'char_2_act',
 'char_3_act',
 'char_4_act',
 'char_5_act',
 'char_6_act',
 'char_7_act',
 'char_8_act',
 'char_9_act',
 'char_10_act',
 'month_act',
 'year_act',
 'weekend_flg_act']

#### 3.1.1. Chi-square

In [22]:
def find_corr_chi2(df, cols_to_check, target, alpha=0.05):
    """ 
    The aim of the function is to find correlation between two categorical variables, using chi-square.
    
    The function takes in:
    - a dataframe with variables and target, 
    - a list of categorical columns, 
    - the name of the column with target (to check the correlation between categorical columns and target), 
    - an alpha parameter (with default=0.5).
    
    The output of the function is a list of variables which are highly correlated to the target.
    """
    
    
    cols_to_drop = []
    
    for col in cols_to_check:

        #chi-square value , p-value, degrees of freedom, expected frequencies
        stat, p, dof, expected = chi2_contingency(pd.crosstab(df[col],df[target]))
        print(f"{col}: significance={alpha}, p={p}")

        if p <= alpha:
            print(f'Target and {col} are associated')
            cols_to_drop.append(col)
        else:
            pass

        
    return cols_to_drop

In [23]:
chi2_cols = find_corr_chi2(train_set, cols_corr, 'outcome')

people_id: significance=0.05, p=0.0
Target and people_id are associated
char_1_pep: significance=0.05, p=0.0
Target and char_1_pep are associated
group_1: significance=0.05, p=0.0
Target and group_1 are associated
char_2_pep: significance=0.05, p=0.0
Target and char_2_pep are associated
char_3_pep: significance=0.05, p=0.0
Target and char_3_pep are associated
char_4_pep: significance=0.05, p=0.0
Target and char_4_pep are associated
char_5_pep: significance=0.05, p=0.0
Target and char_5_pep are associated
char_6_pep: significance=0.05, p=0.0
Target and char_6_pep are associated
char_7_pep: significance=0.05, p=0.0
Target and char_7_pep are associated
char_8_pep: significance=0.05, p=0.0
Target and char_8_pep are associated
char_9_pep: significance=0.05, p=0.0
Target and char_9_pep are associated
char_10_pep: significance=0.05, p=0.0
Target and char_10_pep are associated
char_11: significance=0.05, p=0.0
Target and char_11 are associated
char_12: significance=0.05, p=0.0
Target and char_

In [24]:
print(f"{len(chi2_cols)}/{len(cols_corr)} are associated to the target based on chi-squared")

55/56 are associated to the target based on chi-squared


Chi-square test turned out to classify **53/54 categorical variables as associated with the target**. But the small p-values might be associated with very large sample sizes, as Chi-square is sensitive to sample size.

Therefore, `chi2_cols` won't be dropped from the dataset and a different approach to the correlation between categorical variables will be tested.

#### 3.1.2. Cramer's V

In [25]:
def cramers_corrected_stat(cols_to_check, df, target, thresh):
    """ 
    The aim of the function is to calculate the corrected version of Cramer's V to find the level of association between categorical variables.
    
    The function takes in:
    - a dataframe with categorical variables, 
    - a list of categorical variables to check, 
    - the name of the column with target,
    - a threshold for Cramer's V values from which strong association will be assumed.
    
    The result of the function is a list of variables which are strongly associated with the target, according to the Cramer's V values.
    """
    
    cols_to_drop = []
    
    for col in cols_to_check:

        confusion_matrix = pd.crosstab(df[col],df[target])
        chi2 = chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        phi2 = chi2/n
        r,c = confusion_matrix.shape
        phi2corr = max(0, phi2 - ((c-1)*(r-1))/(n-1))    
        rcorr = r - ((r-1)**2)/(n-1)
        ccorr = c - ((c-1)**2)/(n-1)
        cramers_v = np.sqrt(phi2corr / min( (ccorr-1), (rcorr-1)))
        
        if cramers_v > thresh:
            print(f'Target and {col} are associated: {round(cramers_v,2)}')
            cols_to_drop.append(col)
        else:
            pass
    
    return cols_to_drop

In [26]:
%%latex
$$ \text{The association between variables and target will by analized according to the following guidelines:} $$
$$ V \in[0,0.3] \text{ - weak association} $$
$$ V \in(0.3,0.5] \text{ - medium association} $$
$$ V > 0.5 \text{ - strong association} $$

<IPython.core.display.Latex object>

In [27]:
cramer_cols = cramers_corrected_stat(cols_corr, train_set, 'outcome', 0.5)

Target and people_id are associated: 0.92
Target and group_1 are associated: 0.95
Target and char_2_pep are associated: 0.68


In [28]:
cramer_cols

['people_id', 'group_1', 'char_2_pep']

Columns mentioned in `cramer_cols` will be deleted from the dataset as they are stronly associated with the target.

#### 3.1.3. Pearson correlation

Pearson correlation coefficient can be calculated either for continuous variables or for a categorical variable which has a 0/1-coding for the categories. This correlation is called **point-biserial correlation coefficient**.

It will be calculated for the only continuous variable in the dataset - `char_38` and the target.

In [29]:
def point_biserial_correlation(df, contin_cols, target, thresh):
    """
    The aim of the function is to calculate a point biserial correlation coefficient and the associated p-value.
    
    The function takes in:
    - a dataframe with variables for which we want to test correlation
    - a list of continuous variable
    - a binary variable (target)
    - a threshold for correlation coeficient from which strong correlation will be assumed.
    
    The result of the function is a list of variables with high correlation.
    """
    
    cols_to_drop = []
    
    for col in contin_cols:
    
        corr = pointbiserialr(df[col],df[target])[0]
        p = pointbiserialr(df[col],df[target])[1]
    
        if (p<=0.5):
            if (abs(corr)>thresh):
                print(f"{col} is correlated with the target: {round(corr,2)}")
                cols_to_drop.append(col)
            else:
                print(f"{col} with low correlation")
        else:
            print(f"No statistically significant correlation")
    
    return cols_to_drop

In [30]:
%%latex
$$ \text{The point biserial correlation coefficient between variables and target will by analized according to the following guidelines:} $$
$$ \mid P\mid  \in[0,0.3] \text{ - weak association} $$
$$ \mid P\mid \in(0.3,0.5] \text{ - medium association} $$
$$ \mid P\mid > 0.5 \text{ - strong association} $$

<IPython.core.display.Latex object>

In [31]:
point_biserial_cols = point_biserial_correlation(train_set, continuous_cols, 'outcome', 0.5)

char_38 is correlated with the target: 0.68


In [32]:
point_biserial_cols

['char_38']

### 3.2. Preparing the preprocessing pipeline

#### 3.2.1. Splitting the data

The dataset will be split to avoid overfitting the pipeline. Additionally, data types must be converted into strings to enable using the encoding methods.

In [33]:
X_train = train_set.drop('outcome', axis=1)
y_train = train_set['outcome']

In [34]:
X_test = test_set.drop('outcome', axis=1)
y_test = test_set['outcome']

In [35]:
del train_set
del test_set

In [36]:
# converting data types to objects
X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [37]:
modelling_cols = [x for x in X_train.columns if x not in index_cols+cat_cols+cramer_cols+point_biserial_cols]

#### 3.2.2. Selecting columns for training

Data exploration indicated that two categorical features: `char_10_act` and `group_1` have too many categories to encode and therefore they will be dropped. Additionally, features connected to the index itself will be dropped as well, as they bring no value to the modelling process.

At this point variables associated with the target will be dropped as well.

In [38]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Extracts only a given list of columns and returns a filtered dataframe.
    """
    def __init__(self, feature_names):
        self._feature_names = feature_names 
    
    def fit(self, X, y = None):
        return self 
    
    def transform(self, X, y = None):
        return X[self._feature_names] 

#### 3.2.3. Filling missing values

The missing values will be filled with a constant, in this case ***-1***, as it was previously done in the cleaning data step. The custom ValueImputer class is being created because SimpleImputer outputs a numpy array, while we need a DataFrame.

In [39]:
class ValueImputer(BaseEstimator, TransformerMixin):
    """
    Fills missing values with a constant.
    """
    def __init__(self, impute_value):
        self.impute_value = impute_value
    
    def fit(self, X, y = None):
        return self
        
    def transform(self, X, y = None):   
        return X.fillna(self.impute_value)

    def fit_transform(self, X, y = None):
        return X.fillna(self.impute_value)

#### 3.2.4. Encoding categorical variables

At this point all features are categorical. However, they all have a different number of categories. 

In [40]:
categories ={}

In [41]:
#find the number of categories for each variable
for cat in modelling_cols:
    if cat != 'outcome':
        categories[cat]= len((list(X_train[cat].unique())))

In [42]:
{k: v for k, v in sorted(categories.items(), key=lambda item: item[1])}

{'char_1_pep': 2,
 'char_10_pep': 2,
 'char_11': 2,
 'char_12': 2,
 'char_13': 2,
 'char_14': 2,
 'char_15': 2,
 'char_16': 2,
 'char_17': 2,
 'char_18': 2,
 'char_19': 2,
 'char_20': 2,
 'char_21': 2,
 'char_22': 2,
 'char_23': 2,
 'char_24': 2,
 'char_25': 2,
 'char_26': 2,
 'char_27': 2,
 'char_28': 2,
 'char_29': 2,
 'char_30': 2,
 'char_31': 2,
 'char_32': 2,
 'char_33': 2,
 'char_34': 2,
 'char_35': 2,
 'char_36': 2,
 'char_37': 2,
 'weekend_flg_pep': 2,
 'year_act': 2,
 'weekend_flg_act': 2,
 'year_pep': 4,
 'char_6_act': 6,
 'char_6_pep': 7,
 'activity_category': 7,
 'char_8_pep': 8,
 'char_4_act': 8,
 'char_5_act': 8,
 'char_5_pep': 9,
 'char_9_pep': 9,
 'char_7_act': 9,
 'month_pep': 12,
 'char_3_act': 12,
 'month_act': 12,
 'char_8_act': 19,
 'char_9_act': 20,
 'char_4_pep': 25,
 'char_7_pep': 25,
 'char_2_act': 33,
 'char_3_pep': 43,
 'char_1_act': 51}

The majority of variables are binary, but the number of categories for other features oscillate **between 4 and 47**.

- Variables with **binary categories**: convert to 0-1 values
- Variables with **3-10 categories**: frequency encoding
- Variables with **10 categories and more**: mean encoding

In [43]:
def define_cols_to_encode(cat_dict, thresh):
    """
    Defining the lists of columns to encode depending on the number of categories per feature.
    
    The function takes in:
    - a dictionary with name of columns as keys and number of categories as values
    - (min. number -1) of categories to include in frequency encoding and at the same time max. number of categories to include in binary encoding 
    """                       
    binary_cat = list({k for k, v in cat_dict.items() if v == 2})
    little_cat = list({k for k, v in cat_dict.items() if v in range(3,10)})
    big_cat = list({k for k, v in cat_dict.items() if v >= 10})
    
                          
    return little_cat, big_cat, binary_cat                   

In [44]:
thresh = 10

In [45]:
little_cat, big_cat, binary_cat = define_cols_to_encode(categories, thresh)

In [46]:
print(f"Number of features with binary categories: {len(binary_cat)}")
print(f"Number of features with less than {thresh} number of categories: {len(little_cat)}")
print(f"Number of features with {thresh} and more number of categories: {len(big_cat)}")

Number of features with binary categories: 32
Number of features with less than 10 number of categories: 10
Number of features with 10 and more number of categories: 10


**Binary categories encoding**

In [51]:
class Encoder01(BaseEstimator, TransformerMixin):
    """
    Encodes categorical variables using their frequencies.
    """
    def __init__(self, binary_cols):
        """
        Freq_cols is a list of columns which will be encoded using the Encoder01
        """
        self.binary_cols = binary_cols
    
    def fit(self, X, y = None):
        """
        The fit method takes in a DataFrame with features (X) and a numpy array with the target variable (y).
        
        It creates a dictionary, where keys are names of features and values are dictionaries (zipped uniques&zero_ones).
        In the zipped dictionary keys are the names of categories represented by a specific feature and the values are the new binary values : 0 or 1.
        """
        self.maps ={}
        for col in self.binary_cols:
            self.maps[col] = []
            uniques = sorted(list(X[col].unique()))
            zero_ones = ['0','1']
            self.maps[col]  = dict(zip(uniques, zero_ones)) 
        return self
        
    def transform(self, X, y = None):
        """
        The transform method takes in a DataFrame with features (X) and a numpy array with the target variable (y).
        
        The transform method replaces the names of categories with zeros or ones (using values stored in `map` dictionary).
        If a given category is not in the dictionary, it is encoded with "-1".
        """
        
        for var in self.maps.keys():
            try:
                X[var] = X[var].apply(lambda x: self.maps[var][x])
            except KeyError:
                X[var] = X[var].apply(lambda x: -1)
        return X
        
        
    def fit_transform(self, X, y = None):
        """
        Combines the above mentioned fit and transform methods.
        """
        return self.fit(X, y).transform(X, y)

**Frequency encoding**

In [52]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes categorical variables using their frequencies.
    """
    def __init__(self, freq_cols):
        """
        Freq_cols is a list of columns which will be encoded using the FrequencyEncoder
        """
        self.freq_cols = freq_cols
    
    def fit(self, X, y = None):
        """
        The fit method takes in a DataFrame with features (X) and a numpy array with the target variable (y).
        
        It creates a dictionary, where keys are names of features and values are dictionaries (zipped uniques&frequencies).
        In the zipped dictionary keys are the names of categories represented by a specific feature and the values are their frequencies of occurance in the set.
        """
        self.maps ={}
        for col in self.freq_cols:
            self.maps[col] = {}
            uniques = list(X[col].unique())
            frequencies = list(X.groupby(col).size()/ len(X))
            self.maps[col]  = dict(zip(uniques, [round(x,3) for x in frequencies])) 
        return self
        
    def transform(self, X, y = None):
        """
        The transform method takes in a DataFrame with features (X) and a numpy array with the target variable (y).
        
        The transform method replaces the names of categories with the frequencies of those categories in the dataset (using values stored in `map` dictionary).
        If a given category is not in the dictionary, it is encoded with "-1".
        """
        for var in self.maps.keys():
            try:
                X[var] = X[var].apply(lambda x: self.maps[var][x])
            except KeyError:
                X[var] = X[var].apply(lambda x: -1)
        return X

    def fit_transform(self, X, y = None):
        """
        Combines the above mentioned fit and transform methods.
        """
        return self.fit(X, y).transform(X, y)

**Mean encoder**

Mean encoder will be applied without changes from category_encoders.

### 3.3. Transfroming the dataset using the pipeline

The preprocessing operations from the pipeline will be fit and transformed using the training data and transformed - using the test data.

In [53]:
cat_pipeline = Pipeline([
        ('column_selector', ColumnSelector(modelling_cols)),
        ('imputer', ValueImputer("-1")),
        ('binary_encoder', Encoder01(binary_cat)),
        ('frequency_encoder', FrequencyEncoder(little_cat)),
        ('target_encoder', TargetEncoder(cols = big_cat, smoothing = 0.8))
    ])

In [54]:
X_train_t = cat_pipeline.fit_transform(X_train, y_train)

In [55]:
X_train_t.head()

Unnamed: 0_level_0,char_1_pep,char_3_pep,char_4_pep,char_5_pep,char_6_pep,char_7_pep,char_8_pep,char_9_pep,char_10_pep,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,month_pep,year_pep,weekend_flg_pep,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,month_act,year_act,weekend_flg_act
activity_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
"(2, 3769922)",1,0.36,0.36,0.025,0.329,0.467,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.576,0.091,0,0.072,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.465,0,0
"(2, 2255070)",1,0.522,0.469,0.162,0.037,0.519,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.477,0.091,0,0.412,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.471,0,1
"(2, 1102570)",1,0.578,0.535,0.064,0.329,0.39,0.311,0.249,1,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,1,1,0,0.45,0.091,0,0.072,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.345,0,0
"(2, 4143394)",1,0.36,0.36,0.025,0.329,0.467,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.5,0.189,1,0.195,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.494,1,1
"(2, 3554173)",1,0.36,0.36,0.025,0.329,0.39,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.45,0.091,0,0.412,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.385,0,0


In [56]:
del X_train

In [57]:
X_test_t = cat_pipeline.transform(X_test)

In [58]:
X_test_t.head()

Unnamed: 0_level_0,char_1_pep,char_3_pep,char_4_pep,char_5_pep,char_6_pep,char_7_pep,char_8_pep,char_9_pep,char_10_pep,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,month_pep,year_pep,weekend_flg_pep,activity_category,char_1_act,char_2_act,char_3_act,char_4_act,char_5_act,char_6_act,char_7_act,char_8_act,char_9_act,month_act,year_act,weekend_flg_act
activity_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
"(2, 2791174)",1,0.552,0.386,0.069,0.037,0.898,0.311,0.249,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,0,1,0,1,1,0.346,0.091,1,0.412,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.345,0,0
"(2, 4219859)",1,0.36,0.36,0.025,0.037,0.519,0.311,0.249,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,1,0,1,0,1,1,0.5,0.166,1,0.072,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.434,1,0
"(2, 2522827)",1,0.537,0.386,0.115,0.329,0.467,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.576,0.189,0,0.195,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.494,1,0
"(2, 908393)",1,0.36,0.36,0.025,0.303,0.104,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.5,0.554,0,0.195,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.543,1,0
"(2, 4370791)",1,0.522,0.557,0.055,0.012,0.059,0.066,0.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.346,0.091,0,0.412,0.446,0.446,0.446,0.928,0.928,0.928,0.928,0.446,0.446,0.345,0,0


In [59]:
del X_test

### 4. Saving datasets to csv

In [65]:
X_train_t.to_csv('./csv_files/red_hat_train.csv')
X_test_t.to_csv('./csv_files/red_hat_test.csv')

In [63]:
# X_train = pd.read_csv('./csv_files/red_hat_train.csv', index_col = 'activity_index')
# X_test = pd.read_csv('./csv_files/red_hat_test.csv', index_col = 'activity_index')