## Read in data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the CSV into `oo`. The path is relative, so it resolves against the
# kernel's current working directory, not against this notebook file.
oo = pd.read_csv('../data/online_retail_small.csv')

In [3]:
d1 = oo.sample(n=4)

In [4]:
d2 = oo.sample(n=4)

## Testing column splitter

In [8]:
# Fixed seed so the four rows used to exercise column_splitter are the same
# on every Restart & Run All; change the seed to try a different sample.
d2 = oo.sample(n=4, random_state=0)

In [9]:
d2

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
3207,536609,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2/12/2010 9:41,4.25,17850.0,United Kingdom
1648,536544,22728,ALARM CLOCK BAKELIKE PINK,1,1/12/2010 14:32,7.62,,United Kingdom
4164,536750,15056BL,EDWARDIAN PARASOL BLACK,6,2/12/2010 14:04,4.95,17850.0,United Kingdom
4482,536784,22752,SET 7 BABUSHKA NESTING BOXES,72,2/12/2010 15:20,7.65,15061.0,United Kingdom


In [10]:
d2.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [64]:
def column_splitter(df, col_name, regex_string="(?=-)"):
    """Split one column at the first pattern match and shift the row right.

    For every row whose ``col_name`` value matches ``regex_string``, the text
    before the first match stays in ``col_name`` and the remainder is pushed
    into the next column; each following value in that row moves one column
    to the right, and the value pushed out of the last column lands in a new
    trailing column named ``''``. Rows that do not match (or are missing) are
    left as they were and get NaN in the ``''`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    col_name : label
        Name of the column to split. Must occur exactly once in ``df.columns``
        and must hold strings (it is accessed through ``.str``).
    regex_string : str, default "(?=-)"
        Pattern handed unchanged to ``Series.str.split`` with ``n=1``. The
        default is a zero-width lookahead, so the hyphen stays attached to
        the part that is pushed into the next column.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns in their original order followed
        by the ``''`` column, and the same row index as ``df``.

    Raises
    ------
    ValueError
        If ``col_name`` is absent from, or duplicated in, ``df.columns``.
    """
    columns = df.columns
    positions = np.flatnonzero(columns == col_name)
    if positions.size != 1:
        raise ValueError(
            f"expected exactly one column named {col_name!r}, "
            f"found {positions.size}"
        )
    split_id = int(positions[0])

    # Split at most once. ``expand=True`` returns a single column when no row
    # contains the separator, so force both result columns to exist.
    parts = (
        df[col_name]
        .str.split(regex_string, n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    head, overflow = parts[0], parts[1]
    # True where nothing was split off, i.e. the row must not be shifted.
    keep_original = overflow.isna()

    # Columns to the right of the split column, followed by an all-NaN pad
    # column that receives whatever is pushed out of the last real column.
    tail = [df.iloc[:, pos] for pos in range(split_id + 1, df.shape[1])]
    tail.append(pd.Series(np.nan, index=df.index, name=""))

    pieces = [df.iloc[:, :split_id]] if split_id else []
    pieces.append(head.rename(col_name))

    # Walk left to right: in shifted rows each column takes the value of its
    # left neighbour (the first one takes the split-off text). ``where``
    # upcasts the column when the incoming values need a wider dtype, so no
    # item-by-item assignment into a numeric Series is required.
    incoming = overflow
    for current in tail:
        pieces.append(current.where(keep_original, incoming))
        incoming = current

    # All pieces share df's index, so a single concat preserves row order and
    # yields the columns already in their final order.
    return pd.concat(pieces, axis=1)

In [65]:
column_splitter(d2, 'Description')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Unnamed: 9
3207,536609,21730,GLASS STAR FROSTED T,-LIGHT HOLDER,6.0,2/12/2010 9:41,4.25,17850.0,United Kingdom
1648,536544,22728,ALARM CLOCK BAKELIKE PINK,1,1/12/2010 14:32,7.62,,United Kingdom,
4164,536750,15056BL,EDWARDIAN PARASOL BLACK,6,2/12/2010 14:04,4.95,17850.0,United Kingdom,
4482,536784,22752,SET 7 BABUSHKA NESTING BOXES,72,2/12/2010 15:20,7.65,15061.0,United Kingdom,


In [63]:
tmp

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Unnamed: 9
3207,536609,21730,GLASS STAR FROSTED T,-LIGHT HOLDER,6.0,2/12/2010 9:41,4.25,17850.0,United Kingdom
1648,536544,22728,ALARM CLOCK BAKELIKE PINK,1,1/12/2010 14:32,7.62,,United Kingdom,
4164,536750,15056BL,EDWARDIAN PARASOL BLACK,6,2/12/2010 14:04,4.95,17850.0,United Kingdom,
4482,536784,22752,SET 7 BABUSHKA NESTING BOXES,72,2/12/2010 15:20,7.65,15061.0,United Kingdom,


In [46]:
tmp[['Description', 'InvoiceNo']]

Unnamed: 0,Description,InvoiceNo
3207,GLASS STAR FROSTED T,536609
1648,ALARM CLOCK BAKELIKE PINK,536544
4164,EDWARDIAN PARASOL BLACK,536750
4482,SET 7 BABUSHKA NESTING BOXES,536784


In [16]:
tmp

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Unnamed: 9
4719,536798,21899,KEY FOB,", GARAGE DESIGN",2/12/2010 15:55,0.65,17838.0,United Kingdom,
747,536446,20777,CHRYSANTHEMUM NOTEBOOK,,2.0,1/12/2010 12:15,1.65,15983.0,United Kingdom
2132,536561,22274,FELTCRAFT DOLL EMILY,,6.0,1/12/2010 15:06,2.95,12921.0,United Kingdom
3027,536592,90166,PINK & WHITE ROSEBUD RING,,1.0,1/12/2010 17:06,4.24,,United Kingdom


In [None]:
# NOTE(review): interactive debugging leftover. `pdb` is not imported in any
# cell shown in this notebook, so on a fresh kernel this raises NameError
# unless it is imported elsewhere; with the import it would block Run All at
# a debugger prompt.
pdb.runcall(column_splitter, d2, 'Description')

In [77]:
tmp.iloc[:, 5]

1097    NaN
4718    NaN
3193    NaN
4921    NaN
Name: , dtype: object

In [10]:
pd.DataFrame.combine_first?

[0;31mSignature:[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m.[0m[0mcombine_first[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mother[0m[0;34m:[0m [0;34m'DataFrame'[0m[0;34m)[0m [0;34m->[0m [0;34m'DataFrame'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Update null elements with value in the same location in `other`.

Combine two DataFrame objects by filling null values in one DataFrame
with non-null values from other DataFrame. The row and column indexes
of the resulting DataFrame will be the union of the two.

Parameters
----------
other : DataFrame
    Provided DataFrame to use to fill null values.

Returns
-------
DataFrame

See Also
--------
DataFrame.combine : Perform series-wise operation on two DataFrames
    using a given function.

Examples
--------
>>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
>>> df1.combine_first(df2)
     A    B
0  1.0  3.0
1  0.0  4.0

Null values still persist if the lo

In [30]:
id = pd.Series([True, False, False, True])

In [31]:
id.index = qty_series.index

In [32]:
qty_series[id] = d2.Description[id]

In [33]:
qty_series

3159    FULL ENGLISH BREAKFAST PLATE
3348                            12.0
4596                             2.0
4405     ZINC METAL HEART DECORATION
Name: Quantity, dtype: object

In [24]:
d2a = d2.Description.str.split('-',n=1, expand=True)

In [25]:
d2a[[1]]

Unnamed: 0,1
4608,
1675,
4451,
1762,LIGHT HLDR


In [26]:
d2[["Quantity"]]

Unnamed: 0,Quantity
4608,1
1675,1
4451,108
1762,4


In [28]:
d2.combine_first?

[0;31mSignature:[0m [0md2[0m[0;34m.[0m[0mcombine_first[0m[0;34m([0m[0mother[0m[0;34m:[0m [0;34m'DataFrame'[0m[0;34m)[0m [0;34m->[0m [0;34m'DataFrame'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Update null elements with value in the same location in `other`.

Combine two DataFrame objects by filling null values in one DataFrame
with non-null values from other DataFrame. The row and column indexes
of the resulting DataFrame will be the union of the two.

Parameters
----------
other : DataFrame
    Provided DataFrame to use to fill null values.

Returns
-------
DataFrame

See Also
--------
DataFrame.combine : Perform series-wise operation on two DataFrames
    using a given function.

Examples
--------
>>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
>>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
>>> df1.combine_first(df2)
     A    B
0  1.0  3.0
1  0.0  4.0

Null values still persist if the location of that null value
does not exist in `other`

>

In [27]:
pd.concat([d2a[[1]],d2[["Quantity"]]], axis=1)

Unnamed: 0,1,Quantity
4608,,1
1675,,1
4451,,108
1762,LIGHT HLDR,4


In [65]:
# Count rows whose Description is missing: str.contains yields NaN for them.
# Raw string so that \w and \s reach the regex engine without Python treating
# them as (invalid) string escapes.
oo.Description.str.contains(r"[^\w\s]", regex=True).isna().sum()

12

In [68]:
# Keep only the rows that have a Description value.
oo2 = oo.loc[oo.Description.notna()]

In [73]:
# Descriptions containing at least one character that is neither a word
# character nor whitespace. Raw string avoids the invalid "\w" / "\s" string
# escapes of the plain literal.
tmp = oo2[oo2.Description.str.contains(r"[^\w\s]", regex=True)].Description

In [77]:
tmp.str.split("(?=-)",n=1, expand=True)

Unnamed: 0,0,1
0,WHITE HANGING HEART T,-LIGHT HOLDER
4,RED WOOLLY HOTTIE WHITE HEART.,
6,GLASS STAR FROSTED T,-LIGHT HOLDER
10,POPPY'S PLAYHOUSE BEDROOM,
11,POPPY'S PLAYHOUSE KITCHEN,
...,...,...
4969,SET OF 6 T,-LIGHTS SNOWMEN
4982,DINOSAUR PARTY BAG + STICKER SET,
4986,EUCALYPTUS & PINECONE WREATH,
4988,"SWISS ROLL TOWEL, CHOCOLATE SPOTS",
