# Import necessary libraries

In [22]:
import pandas as pd
from datetime import date
import holidays
from datetime import datetime,timedelta
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category is silenced for the rest of the session -- including any
# deprecation warnings the libraries used below might emit.
warnings.filterwarnings("ignore") 

# Load data

In [23]:
# Read the raw data file and the item master file. Both are decoded as
# Latin-1 and both paths are relative to the current working directory.
RAW_ENCODING = 'latin1'
data = pd.read_csv('raw/data_raw.csv', encoding=RAW_ENCODING)
item_master_data = pd.read_csv('raw/item_master.csv', encoding=RAW_ENCODING)

# Drop unnecessary columns


In [24]:
# Drop the raw columns that the later cells shown in this notebook do not
# reference. The trailing spaces inside several names are part of the raw
# headers and must be kept exactly as written.
#
# `errors='ignore'` skips names that are already gone, so the cell can be
# re-run safely and no longer depends on the frame having exactly 17 columns.
UNUSED_COLUMNS = ['Branch', 'Schm.Disc ', 'Bill Disc ', 'Doctor Name ', 'Patient Name ', 'Scheduler ', 'Batch No ']
data = data.drop(columns=UNUSED_COLUMNS, errors='ignore')

# Rename necessary common column

In [25]:
# Map raw headers onto the names used by the following cells. 'Product Code'
# is the key for the merge with the item master; the raw header that is a
# single space is the one that becomes 'Customer ID'.
column_renames = {
    'Prod Code': 'Product Code',
    'Prod Name': 'Product Name',
    ' ': 'Customer ID',
}
data.rename(columns=column_renames, inplace=True)

# Merge Dataframes to get subcat name

In [26]:
# Attach sub-category / category / category-type to every transaction row.
# The inner join keeps only rows whose 'Product Code' exists in both frames.
item_columns = item_master_data[['Product Code', 'SUB CAT NAME', 'CATEGORY NAME', 'CATEGORY TYPE']]
merged_df = (
    data
    .merge(item_columns, on='Product Code', how='inner')
    .rename(columns={
        'SUB CAT NAME': 'Sub Category Name',
        'CATEGORY NAME': 'Category Name',
        'CATEGORY TYPE': 'Category Type',
    })
)

# Rename Category Types

In [27]:
# Expand the abbreviated category-type codes into readable labels. Values
# that have no entry in the mapping are left unchanged by `replace`.
replacement_dict = {
    'BVG': 'Beverage',
    'F&V': 'Fruit and Vegetable',
    'GM': 'General Merchandise',
    'HCP': 'Home Care Products',
    'PCP': 'Personal Care Products',
    'PKGFD': 'Packaged Food',
    'RTE': 'Ready To Eat',
    'STAPLES': 'Staples',
}
category_type = merged_df['Category Type']
merged_df['Category Type'] = category_type.replace(replacement_dict)

# Remove NA customers and 1 Customers and convert column data types.

In [29]:
# Rows without a customer cannot be attributed to anyone, so they are removed
# first; this also makes the integer cast of 'Customer ID' below possible.
merged_df.dropna(subset=['Customer ID'], inplace=True)

# Parse dates, cast the customer key to int and round the cost to 2 decimals.
merged_df = merged_df.assign(**{
    'Date': pd.to_datetime(merged_df['Date']),
    'Customer ID': merged_df['Customer ID'].astype(int),
    'Cost Rate': merged_df['Cost Rate'].round(2),
})

# Add necessary/required columns

In [23]:
# Add the derived columns at fixed positions.
#
# Each column is inserted only when it is not already present. That keeps the
# cell safe to re-run (DataFrame.insert refuses duplicate names) without a
# blanket `except Exception: pass`, which would also hide genuine problems
# such as a missing 'Sale Rate' or 'Cost Rate' column.
derived_columns = [
    (1, 'Day', lambda df: df['Date'].dt.day_name()),
    (2, 'Month', lambda df: df['Date'].dt.strftime('%B')),
    (3, 'Season', lambda df: ' '),      # placeholder, overwritten by the seasons cell
    (4, 'Festival', lambda df: ' '),    # placeholder, overwritten by the festival cell
    (12, 'Margin', lambda df: (df['Sale Rate'] - df['Cost Rate']).round(2)),
]
for position, column_name, build_values in derived_columns:
    if column_name not in merged_df.columns:
        # Clamp so a frame with fewer columns cannot cause an out-of-bounds insert.
        position = min(position, len(merged_df.columns))
        merged_df.insert(position, column_name, build_values(merged_df))

# Seasons transformation

In [24]:
# Derive the season from the calendar month with a vectorised lookup instead
# of a Python-level loop over every row.
season_by_month = {
    12: 'Winter', 1: 'Winter',
    2: 'Spring', 3: 'Spring',
    4: 'Summer', 5: 'Summer', 6: 'Summer',
    7: 'Monsoon', 8: 'Monsoon', 9: 'Monsoon',
    10: 'Autumn', 11: 'Autumn',
}
merged_df['Season'] = (
    merged_df['Date'].dt.month
    .map(season_by_month)
    # Rows without a usable month (missing dates) keep the former catch-all value.
    .fillna('Winter')
)

# Festival/Holiday Transformation

In [25]:
# Build an ISO-date -> holiday-name lookup for every year present in the data.
unique_years = merged_df['Date'].dt.year.unique().tolist()

# The loop variable is deliberately not called `date`: that name is imported
# from `datetime` at the top of the notebook and would be rebound here.
holidays_dict = {
    str(holiday_date): holiday_name
    for holiday_date, holiday_name in holidays.IN(years=tuple(unique_years)).items()
}

# Vectorised lookup: format each date as YYYY-MM-DD, map it through the
# calendar, and label everything that is not a listed holiday 'Normal Day'.
merged_df['Festival'] = (
    merged_df['Date'].dt.strftime('%Y-%m-%d')
    .map(holidays_dict)
    .fillna('Normal Day')
)

# Customer Segmentation

In [26]:
# Build one Recency / Frequency / Monetary row per customer and score each
# dimension by quartile.
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

# Reference point: the day after the most recent transaction in the data.
snapshot_day = merged_df['Date'].max() + timedelta(days=1)

# A single grouped aggregation yields all three measures, indexed by customer.
# NOTE(review): 'count' tallies rows, so a bill with several products counts
# several times; switch to 'nunique' if Frequency should mean distinct bills.
RFM = merged_df.groupby('Customer ID').agg(
    Recency=('Date', lambda dates: snapshot_day - dates.max()),
    Frequency=('Bill No', 'count'),
    Monetary=('Amount', 'sum'),
)

# Quartile scores. Recency labels run 4 -> 1 because the smallest recency
# (most recent purchase) should receive the best score.
recency_labels = range(4, 0, -1)
Frequency_labels = range(1, 5)
Monetary_labels = range(1, 5)

RFM['R_score'] = pd.qcut(RFM['Recency'], q=4, labels=recency_labels)
RFM['F_score'] = pd.qcut(RFM['Frequency'], q=4, labels=Frequency_labels)
RFM['M_score'] = pd.qcut(RFM['Monetary'], q=4, labels=Monetary_labels)

# Combined score (sum of the three) and a dotted "R.F.M" segment code.
RFM['rfm_total_score'] = (
    RFM['R_score'].astype(int) + RFM['F_score'].astype(int) + RFM['M_score'].astype(int)
)
RFM_Segments = (
    RFM['R_score'].astype(str) + '.' + RFM['F_score'].astype(str) + '.' + RFM['M_score'].astype(str)
)
RFM['Segments'] = RFM_Segments

def group_function(merged_df):
    """Return the customer-group label for one row of RFM scores.

    Despite its name, `merged_df` is a single record (anything indexable by
    'rfm_total_score' and 'F_score', such as a DataFrame row).

    Rules, checked in order:
      * total score above 9                      -> "Elite Customers"
      * total score above 5 with F_score of 4    -> 'Frequent Customers'
      * total score of 5 or less, F_score of 1   -> 'Rare Customer'
      * total score of 5 or less otherwise       -> "Cost-conscious Customers"
      * anything else                            -> 'Moderate Value Customers'
    """
    total_score = merged_df['rfm_total_score']

    if total_score > 9:
        return "Elite Customers"

    if total_score > 5 and merged_df['F_score'] == 4:
        return 'Frequent Customers'

    if total_score <= 5:
        # Low scorers are split by how often they buy.
        return 'Rare Customer' if merged_df['F_score'] == 1 else "Cost-conscious Customers"

    return 'Moderate Value Customers'
# Label every customer row by row, then turn the 'Customer ID' index back
# into an ordinary column so it can be used as a merge key.
customer_groups = RFM.apply(group_function, axis=1)
RFM = RFM.assign(Group=customer_groups).reset_index()

In [27]:
# Attach each customer's group label to the transaction rows.
#
# Any 'Group' column left over from a previous run is dropped first, so
# re-running the cell does not produce 'Group_x' / 'Group_y' columns.
# `validate='many_to_one'` makes pandas raise if RFM ever holds more than one
# row per customer, which would silently duplicate transactions.
merged_df = (
    merged_df
    .drop(columns='Group', errors='ignore')
    .merge(RFM[['Customer ID', 'Group']], on='Customer ID', how='inner', validate='many_to_one')
)

# CSV conversion

In [28]:
# Write the transformed data without the row index. The keyword for that is
# `index`; `index_col` belongs to `read_csv` and is not accepted by `to_csv`.
merged_df.to_csv('Transformed_data.csv', index=False)
merged_df

Unnamed: 0,Date,Day,Month,Season,Festival,Bill No,Product Code,Product Name,Qty,Cost Rate,Sale Rate,MRP,Margin,Amount,Customer ID,Sub Category Name,Category Name,Category Type,Group
0,2018-03-01,Thursday,March,Spring,Normal Day,136210,7402,AMUL FRESH MILK GOLD PP 500ML,2.0,25.20,26.0,26.0,0.80,52.0,1996,DAIRY,DAIRY & FROZEN,Packaged Food,Elite Customers
1,2018-02-09,Friday,February,Spring,Normal Day,128196,7402,AMUL FRESH MILK GOLD PP 500ML,2.0,25.20,26.0,26.0,0.80,52.0,1996,DAIRY,DAIRY & FROZEN,Packaged Food,Elite Customers
2,2017-11-08,Wednesday,November,Autumn,Normal Day,90188,7402,AMUL FRESH MILK GOLD PP 500ML,2.0,25.20,26.0,26.0,0.80,52.0,1996,DAIRY,DAIRY & FROZEN,Packaged Food,Elite Customers
3,2017-11-15,Wednesday,November,Autumn,Normal Day,93083,7402,AMUL FRESH MILK GOLD PP 500ML,2.0,25.20,26.0,26.0,0.80,52.0,1996,DAIRY,DAIRY & FROZEN,Packaged Food,Elite Customers
4,2017-11-20,Monday,November,Autumn,Normal Day,95224,7402,AMUL FRESH MILK GOLD PP 500ML,2.0,25.20,26.0,26.0,0.80,52.0,1996,DAIRY,DAIRY & FROZEN,Packaged Food,Elite Customers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551196,2017-09-13,Wednesday,September,Monsoon,Normal Day,67626,4646,KALYAN BHEL 350 GM,1.0,112.00,140.0,140.0,28.00,140.0,16,NAMKEENS,NAMKEEN/CHIPS/SWEETS,Ready To Eat,Rare Customer
551197,2017-04-02,Sunday,April,Summer,Normal Day,241,3921,YRDLEY DEO GENTLEMAN 150ML,1.0,142.87,199.0,199.0,56.13,199.0,203,DEO & ANTIPERSPIRANTS,COSMETIC & BEAUTY,Personal Care Products,Rare Customer
551198,2017-04-02,Sunday,April,Summer,Normal Day,241,13274,YRDLEY DEO CITRS WOD MN 150ML,1.0,142.87,199.0,199.0,56.13,199.0,203,DEO & ANTIPERSPIRANTS,COSMETIC & BEAUTY,Personal Care Products,Rare Customer
551199,2017-05-16,Tuesday,May,Summer,Normal Day,19297,537,KINLEY WATER 500ML,1.0,0.00,10.0,10.0,10.00,10.0,7,CARBONATED,CARBONATED/ STILL DRINK,Beverage,Rare Customer
