### 1. Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Folder holding this challenge's files (absolute path on the author's machine).
# NOTE(review): the read_csv / to_csv cells visible in this notebook join only a bare
# filename and never reference `path`, so files resolve against the working directory —
# confirm whether that is intended.
path = r'G:\My Drive\Python Challenges\Alteryx Challenges in Python\402'

In [3]:
# Load the raw tag list; index_col=False tells pandas not to use the first CSV column as the index.
tags = pd.read_csv(
    filepath_or_buffer=os.path.join('tags.csv'),
    index_col=False,
)

### 2. Data Validation

In [4]:
# Preview the first rows of the loaded tag table.
tags.head()

Unnamed: 0,tag,Service,WorkingTemp,WorkingPressure
0,P101A/B/C,Oil,35,2000
1,P203A-D,Gas,41,3000
2,P401A/G,N2,10,2100
3,T101B,FW,40,1500
4,B301,CW,23,2400


In [5]:
# (rows, columns) of the loaded tag table.
tags.shape

(5, 4)

### 3. Data Transformation

In the Engineering, Procurement, and Construction (EPC) department of a chemical plant, equipment tags play a crucial role in managing equipment, including labeling pumps as distinct entities, such as P-101A and P-101B. The upstream Design department still prefers the old-fashioned way of creating tags and they usually concatenate tags into forms like P-101A/B or P-101A-B. This approach creates a headache for the downstream department's data processing.

Your task this week is to parse these concatenated equipment tags, making life smoother for everyone involved. Ensure the new tags run sequentially from the first letter in the range to the last letter.

Create an Alteryx workflow to sequentially generate new equipment tags based on the provided dataset. For instance, if you have P-101A/E, the output should be P-101A, P-101B, P-101C, P-101D, and P-101E.

In [6]:
# Letter scaffold spanning the lowest through highest letters seen in the data.
# The data set is tiny, so the letters are simply spelled out by hand.
alphabet_scaffold = pd.DataFrame({'letter': list('ABCDEFG')})

In [7]:
# Show up to the first 10 scaffold rows (just the 'letter' column at this point).
alphabet_scaffold.head(10)

Unnamed: 0,letter
0,A
1,B
2,C
3,D
4,E
5,F
6,G


In [8]:
# Creating numeric index column to assign to each letter.
# The frame is rebuilt from the 'letter' column alone, so re-running this cell always yields
# exactly ['index', 'letter']. An in-place reset_index would instead add a 'level_0' column
# on a second run and raise on a third.
alphabet_scaffold = alphabet_scaffold[['letter']].reset_index(drop=True).reset_index()

In [9]:
# Show the scaffold again, now carrying the numeric 'index' column next to each letter.
alphabet_scaffold.head(10)

Unnamed: 0,index,letter
0,0,A
1,1,B
2,2,C
3,3,D
4,4,E
5,5,F
6,6,G


In [10]:
# Column dtypes as loaded from the CSV.
tags.dtypes

tag                object
Service            object
WorkingTemp         int64
WorkingPressure     int64
dtype: object

In [11]:
# Swap generic object columns for pandas' dedicated string dtype, which is nicer to work with.
# The cast is applied to every column, so the numeric WorkingTemp / WorkingPressure
# columns become strings as well.
tags = tags.astype(pd.StringDtype())

In [12]:
# Column dtypes after the string cast.
tags.dtypes

tag                string[python]
Service            string[python]
WorkingTemp        string[python]
WorkingPressure    string[python]
dtype: object

In [13]:
# Record each tag's final character; for a ranged tag this is the letter where the scaffold must stop.
tags['last_letter'] = tags['tag'].str.get(-1)

In [14]:
# Creating flag column telling us whether or not the tag actually needs letters in between.
# A tag is expanded only when a '/' or '-' sits directly between two letters (e.g. 'A/B', 'A-D').
# A separator anywhere else (e.g. the hyphen in a 'P-101' style prefix) must not trigger
# expansion, otherwise such tags are routed into the scaffold path and get dropped or mangled.
# Rows that are not flagged keep a missing value in 'needs_scaffold'.
has_letter_range = tags['tag'].str.contains(r'[A-Z][/-][A-Z]', regex=True)
tags.loc[has_letter_range, 'needs_scaffold'] = 'yes'

In [15]:
# Independent copy of only the rows flagged for scaffolding.
tags_split = tags[tags['needs_scaffold'].eq('yes')].copy()

In [16]:
# Rows with no flag need no expansion, but they still belong in the final output, so set them aside.
tags_na = tags[tags['needs_scaffold'].isna()].copy()

In [17]:
# Inspect the rows that were flagged for expansion.
tags_split

Unnamed: 0,tag,Service,WorkingTemp,WorkingPressure,last_letter,needs_scaffold
0,P101A/B/C,Oil,35,2000,C,yes
1,P203A-D,Gas,41,3000,D,yes
2,P401A/G,N2,10,2100,G,yes


In [18]:
# Pair every tag that needs scaffolding with every scaffold letter (cross join).
tags_merged = pd.merge(tags_split, alphabet_scaffold, how='cross')

In [19]:
# Second pass over the alphabet scaffold: a left join keyed on the tag's last letter, which
# attaches that letter's index number to every row. The cross-join columns come out with the
# _x suffix and these new ones with _y.
tags_merged = pd.merge(
    left=tags_merged,
    right=alphabet_scaffold,
    how='left',
    left_on='last_letter',
    right_on='letter',
)

In [20]:
# Inspect the joined frame: index_x / letter_x come from the cross join,
# index_y / letter_y from the last-letter join.
tags_merged

Unnamed: 0,tag,Service,WorkingTemp,WorkingPressure,last_letter,needs_scaffold,index_x,letter_x,index_y,letter_y
0,P101A/B/C,Oil,35,2000,C,yes,0,A,2,C
1,P101A/B/C,Oil,35,2000,C,yes,1,B,2,C
2,P101A/B/C,Oil,35,2000,C,yes,2,C,2,C
3,P101A/B/C,Oil,35,2000,C,yes,3,D,2,C
4,P101A/B/C,Oil,35,2000,C,yes,4,E,2,C
5,P101A/B/C,Oil,35,2000,C,yes,5,F,2,C
6,P101A/B/C,Oil,35,2000,C,yes,6,G,2,C
7,P203A-D,Gas,41,3000,D,yes,0,A,3,D
8,P203A-D,Gas,41,3000,D,yes,1,B,3,D
9,P203A-D,Gas,41,3000,D,yes,2,C,3,D


In [21]:
# Indexes come in clutch here: keep only the scaffold letters that fall inside each tag's own
# range, i.e. from the tag's first range letter up to (and including) its last letter.
# Previously only the upper bound was applied, so every tag was expanded starting at 'A'
# even when its range began later (e.g. 'P101C/E' must give C, D, E — not A..E).
letter_index = alphabet_scaffold.set_index('letter')['index']
# The first range letter is the letter that opens the trailing letter chain of the tag
# (the 'A' in 'P101A/B/C', the 'C' in 'P101C/E').
first_letter = tags_merged['tag'].str.extract(r'([A-Z])(?:[/-][A-Z])*$', expand=False)
# If the first letter cannot be resolved against the scaffold, fall back to index 0,
# which reproduces the old "start at the first scaffold letter" behaviour.
first_index = first_letter.map(letter_index).fillna(0)
tags_merged = tags_merged.loc[
    (tags_merged['index_x'] >= first_index)
    & (tags_merged['index_x'] <= tags_merged['index_y'])
]

In [22]:
# Drop the helper columns; only the original fields plus the scaffold letter are needed from here on.
kept_columns = ['tag', 'Service', 'WorkingTemp', 'WorkingPressure', 'letter_x']
tags_merged = tags_merged[kept_columns]

In [23]:
# Inspect the filtered rows: one row per letter to generate for each ranged tag.
tags_merged

Unnamed: 0,tag,Service,WorkingTemp,WorkingPressure,letter_x
0,P101A/B/C,Oil,35,2000,A
1,P101A/B/C,Oil,35,2000,B
2,P101A/B/C,Oil,35,2000,C
7,P203A-D,Gas,41,3000,A
8,P203A-D,Gas,41,3000,B
9,P203A-D,Gas,41,3000,C
10,P203A-D,Gas,41,3000,D
14,P401A/G,N2,10,2100,A
15,P401A/G,N2,10,2100,B
16,P401A/G,N2,10,2100,C


In [24]:
# Rebuild each expanded tag as <prefix><letter>.
# The prefix is whatever precedes the trailing letter range ('P101' in 'P101A/B/C'), found by
# stripping that range with a regex rather than assuming the prefix is exactly 4 characters —
# a fixed slice mangles longer or hyphenated prefixes (e.g. 'P-101A/B' -> 'P-10A').
# assign() returns a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning). The pattern also strips a lone trailing letter, so running this
# cell again leaves already-rebuilt tags unchanged.
tag_prefix = tags_merged['tag'].str.replace(r'[A-Z](?:[/-][A-Z])*$', '', regex=True)
tags_merged = tags_merged.assign(tag=tag_prefix + tags_merged['letter_x'])

In [31]:
# Inspect the rebuilt tags.
# NOTE(review): this cell's execution count (31) is out of sequence with its neighbours
# (24 and 26) — re-run the notebook top to bottom to confirm the shown output is current.
tags_merged

Unnamed: 0,tag,Service,WorkingTemp,WorkingPressure,letter_x
0,P101A,Oil,35,2000,A
1,P101B,Oil,35,2000,B
2,P101C,Oil,35,2000,C
7,P203A,Gas,41,3000,A
8,P203B,Gas,41,3000,B
9,P203C,Gas,41,3000,C
10,P203D,Gas,41,3000,D
14,P401A,N2,10,2100,A
15,P401B,N2,10,2100,B
16,P401C,N2,10,2100,C


In [26]:
# Stack the expanded tags on top of the tags that never needed scaffolding.
solution = pd.concat(objs=[tags_merged, tags_na], axis=0)

In [27]:
# Trim to the columns that belong in the deliverable.
final_columns = ['tag', 'Service', 'WorkingTemp', 'WorkingPressure']
solution = solution[final_columns]

In [28]:
# Renumber rows 0..n-1 and discard the index labels left over from the merge and concat steps.
solution = solution.reset_index(drop=True)

In [29]:
# Write the final table to disk, leaving out the row index.
solution.to_csv(path_or_buf=os.path.join('solution.csv'), index=False)

In [30]:
# Display the final expanded tag list.
solution

Unnamed: 0,tag,Service,WorkingTemp,WorkingPressure
0,P101A,Oil,35,2000
1,P101B,Oil,35,2000
2,P101C,Oil,35,2000
3,P203A,Gas,41,3000
4,P203B,Gas,41,3000
5,P203C,Gas,41,3000
6,P203D,Gas,41,3000
7,P401A,N2,10,2100
8,P401B,N2,10,2100
9,P401C,N2,10,2100
