# Working With Strings

The dataset used in this notebook contains users and their codes.

## Importing the Libraries

In [1]:
# General Libraries
import pandas as pd

In [2]:
# Yeast-specific classes
from yeast import Recipe
from yeast.steps import *
from yeast.transformers import *
from yeast.aggregations import *

## Getting the Data

In [3]:
# Load the raw user/code table; the path is resolved relative to the working directory
codes = pd.read_csv('string_codes.csv')
# Preview the first five rows
codes.head(n=5)

Unnamed: 0,user,code
0,0,NNNNN
1,1,
2,2,ANPNN
3,3,ANGNN
4,4,ANPNN


## Cleaning the Data
### Defining the processing Recipe

In [4]:
recipe = Recipe([
    # Gotcha: in the CSV the "code" column header is actually "  code"
    # (leading whitespace). Normalising the column names to snake case is
    # expected to take care of that.
    CleanColumnNamesStep('snake'),
    # Apply the business rules to the code and derive two helper columns.
    MutateStep(dict(
        # Clean up the "code" column
        code=[
            # Strip whitespace on both ends of the string
            StrTrim(),
            # A code is always 5 characters long; missing positions are
            # filled on the right with 'N' (= no information)
            StrPad(5, pad='N', side='right', column="code"),
            # Any remaining whitespace is also encoded as 'N'
            StrReplaceAll(' ', 'N'),
            # TODO: rows whose code is NA are still NA after this step (see
            # the baked output below); replace NA with "NNNNN", e.g. with
            # ReplaceNAStep()
        ],
        # First letter of the code: the Account
        code_account=StrSlice(0, 1, column='code'),
        # Third letter of the code: the Account Type. The slice is applied to
        # every row; it is only meant to be read when the Account is 'A'.
        code_type=StrSlice(2, 3, column='code'),
    ))
])

In [5]:
# Prepare the recipe against the raw codes before baking it.
# NOTE(review): this rebinds `recipe` to whatever prepare() returns, presumably
# the same Recipe instance; confirm against the Yeast documentation.
recipe = recipe.prepare(codes)

In [9]:
# Run the prepared recipe over the raw codes to obtain the cleaned table
clean_codes = recipe.bake(codes)
# Preview the first five cleaned rows
clean_codes.head(n=5)

Unnamed: 0,user,code,code_account,code_type
0,0,NNNNN,N,N
1,1,,,
2,2,ANPNN,A,P
3,3,ANGNN,A,G
4,4,ANPNN,A,P


### How many types of accounts do we have?

In [7]:
group_recipe = Recipe([
    # Only rows whose account letter is "A" carry an account type
    FilterStep('code_account == "A"'),
    # One group per account type
    GroupByStep('code_type'),
    # Number of rows in each account type
    SummarizeStep(dict(account_type_count=AggCount('code_type'))),
    # Most frequent account type first
    SortStep('account_type_count', ascending=False),
])

In [8]:
# Apply the grouping recipe to the cleaned codes
group_codes = group_recipe.bake(clean_codes)
# Show at most 15 account types together with their counts
group_codes.head(15)

Unnamed: 0,code_type,account_type_count
0,G,6
1,P,4
2,B,3
