In [1]:
import numpy as np
import pandas as pd

from datetime import datetime

In [18]:
Holidays = pd.read_excel('Holidays.xlsx')
Shipment = pd.read_csv('Shipment_upto35.csv')

In [46]:
Shipment.shape

(351457, 6)

In [36]:
Holidays.head(4)

Unnamed: 0,Date,Day_week,Holiday,isHoliday,isObserved,Area
0,2010-01-01,Friday,New Year's Day,1.0,,
1,2010-01-02,Saturday,Ordinary,,,
2,2010-01-03,Sunday,Ordinary,,,
3,2010-01-04,Monday,2nd January,1.0,,Scotland


In [14]:
Shipment.head(3)

Unnamed: 0,APG,APG:FU,Actuals,DP Lag4,FU,Week
0,UGB001,UGB001##3100:IGB0007,1170.0,1572.5,IGB0007,201601
1,UGB001,UGB001##3100:IGB0007,1950.0,2212.9,IGB0007,201602
2,UGB001,UGB001##3100:IGB0007,2145.0,1459.5,IGB0007,201603


In [19]:
Shipment['Week'] = Shipment['Week'].astype(str)

In [8]:
Holidays.dtypes

Date           object
Day_week       object
Holiday        object
isHoliday     float64
isObserved    float64
Area           object
dtype: object

In [20]:
Holidays['Week'] = pd.to_datetime(Holidays['Date']).dt.strftime('%G%V')

In [67]:
def year_week(column):
    week_number = column.map(lambda x: ((x - datetime.datetime(x.year, 1, 1)).days // 7) + 1).astype(str)
    week_number = week_number.map(lambda x: x if len(x) > 1 else '0' + x)
    return column.map(lambda x: x.year).astype(str) + week_number   

In [93]:
Holidays['Week'] = year_week(Holidays.Date)

In [72]:
Holidays.head(3)

Unnamed: 0,Date,Day_week,Holiday,isHoliday,isObserved,Area,Week
0,2010-01-01,Friday,New Year's Day,1.0,,,201001
1,2010-01-02,Saturday,Ordinary,,,,201001
2,2010-01-03,Sunday,Ordinary,,,,201001


In [238]:
Holidays.columns

Index(['Date', 'Day_week', 'Holiday', 'isHoliday', 'isObserved', 'Area',
       'Week'],
      dtype='object')

## Add aggregate columns about holidays to table Shipment

#### Week_qty

In [57]:
Holidays.head(6)

Unnamed: 0,Date,Day_week,Holiday,isHoliday,isObserved,Area
0,2010-01-01,Friday,New Year's Day,1.0,,
1,2010-01-02,Saturday,Ordinary,,,
2,2010-01-03,Sunday,Ordinary,,,
3,2010-01-04,Monday,2nd January,1.0,,Scotland
4,2010-01-05,Tuesday,Ordinary,,,
5,2010-01-06,Wednesday,Ordinary,,,


#### add column 'ALL_holidays' (ALL_holidays in one week)

In [5]:
def add_f(x):
     return pd.Series(dict(ALL_holidays = ', '.join(x['Holiday'])))

ALL_holidays = Holidays[Holidays.Holiday != 'Ordinary'].groupby('Week').apply(add_f)

In [21]:
Holidays2 = Holidays[Holidays.Holiday != 'Ordinary'].groupby(
                   ['Week']
                ).agg( Week_qty = pd.NamedAgg(column = 'isHoliday', aggfunc= sum),
                         All_holiday = pd.NamedAgg(column = 'Holiday', aggfunc= list)
                ).reset_index()

In [22]:
Shipment = Shipment.merge(Holidays2[['Week', 'All_holiday', 'Week_qty']], on = 'Week', how = 'left')

In [23]:
Shipment.drop('DP Lag4', axis = 1, inplace = True)

In [24]:
Shipment['All_holiday'] = Shipment['All_holiday'].fillna(value = 'Ordinal')
Shipment['Week_qty'] = Shipment['Week_qty'].fillna(value = 0)

In [280]:
Shipment.head(3)

Unnamed: 0,APG,APG:FU,Actuals,FU,Week,All_holiday,Week_qty
0,UGB001,UGB001##3100:IGB0007,1170.0,IGB0007,201601,[2nd January],1.0
1,UGB001,UGB001##3100:IGB0007,1950.0,IGB0007,201602,Ordinal,0.0
2,UGB001,UGB001##3100:IGB0007,2145.0,IGB0007,201603,Ordinal,0.0


## Label encoding

### Multioutput classification:

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=Holidays.Holiday.unique())

In [16]:
Shipment = Shipment.join(pd.DataFrame(mlb.fit_transform(Shipment.All_holiday),
                          columns = mlb.classes_,
                          index = Shipment.index)).drop('All_holiday', axis = 1)

In [17]:
Shipment.head(5)

Unnamed: 0,APG,APG:FU,Actuals,FU,Week,Week_qty,New Year's Day,Ordinary,2nd January,St. David's Day,...,New Year's Day' observed,New Year's Day Holiday,Royal Wedding Bank Holiday,St Patrick's Day' observed,The Queen's Diamond Jubilee,St Andrew's Day' observed,Battle of the Boyne' observed,Victory in Europe Day,"Boxing Day, Bank Holiday",Wimbledon
0,UGB001,UGB001##3100:IGB0007,1170.0,IGB0007,201601,1.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,UGB001,UGB001##3100:IGB0007,1950.0,IGB0007,201602,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,UGB001,UGB001##3100:IGB0007,2145.0,IGB0007,201603,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,UGB001,UGB001##3100:IGB0007,1170.0,IGB0007,201604,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,UGB001,UGB001##3100:IGB0007,2340.0,IGB0007,201605,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
Shipment['All_holiday'].unique()

TypeError: unhashable type: 'list'

In [26]:
Shipment.assign(**pd.get_dummies(Shipment.All_holiday.apply(lambda x:pd.Series(x))\.stack().reset_index(level=1,drop=True)).sum(level=0))

Unnamed: 0,APG,APG:FU,Actuals,FU,Week,All_holiday,Week_qty,2nd January,Bank Holiday,Battle of the Boyne,...,Ordinal,Remembrance Sunday,Spring Bank Holiday,St Andrew's Day,St Patrick's Day,St Patrick's Day' observed,St. David's Day,St. George's Day,Summer Bank Holiday,Wimbledon
0,UGB001,UGB001##3100:IGB0007,1170.0,IGB0007,201601,[2nd January],1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,UGB001,UGB001##3100:IGB0007,1950.0,IGB0007,201602,Ordinal,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,UGB001,UGB001##3100:IGB0007,2145.0,IGB0007,201603,Ordinal,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,UGB001,UGB001##3100:IGB0007,1170.0,IGB0007,201604,Ordinal,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,UGB001,UGB001##3100:IGB0007,2340.0,IGB0007,201605,Ordinal,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351452,UGB999,UGB999##3100:IGB5023,0.0,IGB5023,201835,[Summer Bank Holiday],1.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
351453,UGB999,UGB999##3100:IGB5035,0.0,IGB5035,201831,Ordinal,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
351454,UGB999,UGB999##3100:IGB5035,0.0,IGB5035,201832,[Summer Bank Holiday],1.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
351455,UGB999,UGB999##3100:IGB5035,0.0,IGB5035,201833,Ordinal,0.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

for i in range(0,X.shape[1]):
    if X.dtypes[i]=='object':
        X[X.columns[i]] = le.fit_transform(X[X.columns[i]])

In [262]:
Holidays2.columns = Holidays2.columns.map('.'.join)

In [265]:
Holidays2.columns.to_flat_index()

Index(['i.s.H.o.l.i.d.a.y', 'H.o.l.i.d.a.y'], dtype='object')

In [243]:
Holidays_agg = Holidays_agg.merge(ALL_holidays, left_on='Week', right_index = True, how = 'left')
Shipment = Shipment.merge(ALL_holidays, left_on='Week', right_index = True, how = 'left')

In [None]:
#### find the %, Week_qty / Year

In [157]:
Holidays_agg['Year'] = Holidays_agg.Week.map(lambda x: x[0:4])
Shipment['Year'] = Shipment.Week.map(lambda x: x[0:4])
Holidays['Year'] = Holidays.Week.map(lambda x: x[0:4])

In [158]:
Holidays_year = Holidays.groupby('Year').Week_qty.sum().to_frame()

In [159]:
Holidays_year = Holidays_year.rename(columns={"Week_qty": "Year_qty"}).reset_index()

In [160]:
Holidays_agg = Holidays_agg.merge(Holidays_year, on='Year', how = 'left')
Shipment = Shipment.merge(Holidays_year, on='Year', how = 'left')
Holidays = Holidays.merge(Holidays_year, on='Year', how = 'left')

### Merge dataframes:

In [None]:
left_merge = pd.merge(Shipment, Holidays, on ='Week', how = 'left')
right_merge = pd.merge(Shipment, Holidays, on ='Week', how = 'right')
inner_merge = pd.merge(Shipment, Holidays, on ='Week', how = 'inner')
outer_merge = pd.merge(Shipment, Holidays, on ='Week', how = 'outer')

### Diff column to merge:

In [175]:
Holidays['Week2'] = Holidays.Week

In [None]:
left_merge2 = pd.merge(Shipment, Holidays[['Date', 'Day_week', 'Holiday']], left_on ='Week', right_on = 'Week2', how = 'left')
right_merge2 = pd.merge(Shipment, Holidays[['Date', 'Day_week', 'Holiday']], left_on ='Week', right_on = 'Week2', how = 'right')
inner_merge2 = pd.merge(Shipment, Holidays[['Date', 'Day_week', 'Holiday']], left_on ='Week', right_on = 'Week2', how = 'inner')
outer_merge2 = pd.merge(Shipment, Holidays[['Date', 'Day_week', 'Holiday']], left_on ='Week', right_on = 'Week2', how = 'outer')

## Merge with 2 column:

In [174]:
Shipment['Week2'] = Shipment.Week

In [176]:
left_merge = pd.merge(Shipment, Holidays, on = ['Week', 'Week2'], how = 'left')
right_merge = pd.merge(Shipment, Holidays, on = ['Week', 'Week2'], how = 'right')
inner_merge = pd.merge(Shipment, Holidays, on = ['Week', 'Week2'], how = 'inner')
outer_merge = pd.merge(Shipment, Holidays, on = ['Week', 'Week2'], how = 'outer')

## Merge with 2 indexes:

In [108]:
# create indexes for example:
Shipment.set_index(['Week', Shipment.Week2.map(lambda x: x[0:4])], inplace = True)
Holidays.set_index(['Week', Holidays.Week2.map(lambda x: x[0:4])], inplace = True)

In [113]:
Shipment.index.names = ['Week', 'Year']
Holidays.index.names = ['Week', 'Year']

In [123]:
Shipment.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,APG,APG:FU,Actuals,DP Lag4,FU,Week2
Week,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201601,2016,UGB001,UGB001##3100:IGB0007,1170.0,1572.5,IGB0007,201601
201602,2016,UGB001,UGB001##3100:IGB0007,1950.0,2212.9,IGB0007,201602
201603,2016,UGB001,UGB001##3100:IGB0007,2145.0,1459.5,IGB0007,201603
201604,2016,UGB001,UGB001##3100:IGB0007,1170.0,2162.3,IGB0007,201604
201605,2016,UGB001,UGB001##3100:IGB0007,2340.0,8097.5,IGB0007,201605


In [125]:
index_merge = pd.merge(Shipment, Holidays, left_index=True, right_index=True)

### Join the two dataframes along rows:

In [79]:
df_new = pd.concat([Holidays[Holidays.Day_week == 'Friday'], Holidays[Holidays.Day_week == 'Sunday']])

### 3 dataframes:

In [81]:
# create third table
Holidays2 = Holidays[Holidays.Day_week == 'Friday']

In [82]:
three_df = Shipment.merge(Holidays, on='Week').merge(Holidays2, on='Week')

In [12]:
Holidays.head(3)

Unnamed: 0,Date,Day_week,Holiday,isHoliday,isObserved,Area,Week
0,2010-01-01,Friday,New Year's Day,1.0,,,200953
1,2010-01-02,Saturday,Ordinary,,,,200953
2,2010-01-03,Sunday,Ordinary,,,,200953


In [74]:
Holidays_new.head(3)

Unnamed: 0,Week,Holiday
0,200953,New Year's Day
1,200953,Ordinary
2,200953,Ordinary


In [76]:
Holidays.head(6)

Unnamed: 0,Date,Day_week,Holiday,isHoliday,isObserved,Area,Week
0,2010-01-01,Friday,,1.0,,,200953
1,2010-01-02,Saturday,,,,,200953
2,2010-01-03,Sunday,,,,,200953
3,2010-01-04,Monday,,1.0,,Scotland,201001
4,2010-01-05,Tuesday,,,,,201001
5,2010-01-06,Wednesday,,,,,201001


In [75]:
def add_f(x):
     return pd.Series(dict(ALL_holidays = ', '.join(x['Holiday'])))

Holidays.groupby('Week').apply(add_f)

TypeError: sequence item 0: expected str instance, float found

In [42]:
def FUNC(df, col1, col2, col3, col4):
    df[col1] = (df[col2].mean() - df[col3].mean())*df[col4]
    return df

In [46]:
FUNC(people, 'f', 'a', 'b', 'c')

Unnamed: 0,a,b,c,d,e,f
Joe,0.377656,0.115714,-0.080537,1.308866,0.745784,-0.068717
Steve,1.7903,0.810126,-0.300158,-0.525133,0.254108,-0.256107
Wes,-0.105722,0.277302,-1.871671,-1.609772,-0.208512,-1.596986
Jim,1.306913,-2.40365,-1.272184,-0.046708,-1.407395,-1.085479
Travis,1.022819,1.326271,-0.638691,0.481529,-1.286312,-0.544957


In [None]:
http://qaru.site/questions/65686/label-encoding-across-multiple-columns-in-scikit-learn
https://stackoverflow.com/questions/45570632/split-list-into-column/45571082#45571082
http://qaru.site/questions/458791/how-to-one-hot-encode-from-a-pandas-column-containing-a-list    