In [27]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [28]:
# Left-hand table of the merges below; joined to the other tables on 'date'.
df1 = pd.read_csv(filepath_or_buffer='clean_set.csv')

In [29]:
# Weather table; inner-joined to df1 on 'date' in a later cell.
df2 = pd.read_csv(filepath_or_buffer='weather.csv')

In [30]:
# Holiday table; left-joined on 'date' in a later cell.
df3 = pd.read_csv(filepath_or_buffer='holidays.csv')

In [31]:
# Inner join on 'date': only rows whose date appears in both df1 and df2 survive.
df_inner_1 = df1.merge(df2, how='inner', on='date')

In [32]:
# Left join on 'date': every row of df_inner_1 is kept, and dates that are
# absent from df3 get NaN in the columns contributed by df3.
# (Despite the variable name, this step is a left join, not an inner join.)
df_inner_2 = df_inner_1.merge(df3, how='left', on='date')

In [33]:
# Inspect column names, dtypes and non-null counts of the merged frame before cleaning.
df_inner_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48742 entries, 0 to 48741
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0_x     48742 non-null  int64  
 1   date             48742 non-null  object 
 2   Gross Sales      48742 non-null  int64  
 3   Discounts        48742 non-null  int64  
 4   Net Sales        48742 non-null  int64  
 5   Tax              48742 non-null  int64  
 6   Total Collected  48742 non-null  int64  
 7   Cash             48742 non-null  int64  
 8   Card             48742 non-null  int64  
 9   PayPay           48742 non-null  int64  
 10  Unnamed: 0_y     48742 non-null  int64  
 11  city             48742 non-null  object 
 12  weather1         48742 non-null  object 
 13  max_temp         48742 non-null  object 
 14  min_temp         48742 non-null  object 
 15  rain             48742 non-null  object 
 16  sunrise          48742 non-null  object 
 17  sunset      

In [34]:
#To clean the data 
def clean_data(df):
    """Return a cleaned, date-sorted copy of the merged sales/weather/holiday frame.

    The input frame is not modified; all work is done on a copy.

    Steps, in order:
      1. Drop 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0' and 'city'
         (the 'Unnamed' columns are presumably saved indexes from the source
         CSVs -- confirm against the input files).
      2. Fill missing 'holidayjp' with '平日' and missing 'holiday' with 'workday'.
      3. Relabel the 'weather1' value 'unknown' as 'stable'.
      4. Parse 'date' into datetime values.
      5. Rename the columns to their display names ('Date', 'Weather Flow',
         'Max Temp', 'Min Temp', 'Rain', 'Sunrise', 'Sunset', 'Holiday JP',
         'Holiday EN').
      6. Sort by 'Date' and reset the index to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame containing at least the columns named in steps 1-4.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns dropped/renamed as described above.

    Raises
    ------
    KeyError
        If a column that is dropped, filled, replaced or parsed is missing.
    """
    df = df.copy()
    df = df.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0', 'city'])

    # Rows without a matching holiday get an explicit "ordinary day" label.
    df["holidayjp"] = df["holidayjp"].fillna('平日')
    df["holiday"] = df["holiday"].fillna('workday')

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form is a chained assignment, which warns
    # on pandas 2.2 and silently does nothing under Copy-on-Write.
    df["weather1"] = df["weather1"].replace({"unknown": "stable"})

    df["date"] = pd.to_datetime(df["date"])
    df = df.rename(columns={"date": "Date", "weather1": "Weather Flow",
                            "max_temp": "Max Temp", "min_temp": "Min Temp",
                            "rain": "Rain", "sunrise": "Sunrise", "sunset": "Sunset",
                            "holidayjp": "Holiday JP", "holiday": "Holiday EN"})

    df = df.sort_values(by="Date")
    df = df.reset_index(drop=True)

    return df

In [35]:
# Second name for the merged frame (an alias, not a copy); it is the input passed to clean_data.
df = df_inner_2

In [36]:
# Build the cleaned frame used by every summary below.
cleaned_df = clean_data(df)

In [37]:
# Spot-check a positional slice of the cleaned rows (the end position is exclusive).
cleaned_df.iloc[2300:4350]

Unnamed: 0,Date,Gross Sales,Discounts,Net Sales,Tax,Total Collected,Cash,Card,PayPay,Weather Flow,Max Temp,Min Temp,Rain,Sunrise,Sunset,Holiday JP,Holiday EN
2300,2019-01-12,519,0,519,41,560,560,0,0,stable,4.6℃,-5.0℃,0.0mm,06:59,16:53,平日,workday
2301,2019-01-12,1130,0,1130,90,1220,1220,0,0,stable,4.6℃,-5.0℃,0.0mm,06:59,16:53,平日,workday
2302,2019-01-12,417,0,417,33,450,450,0,0,stable,4.6℃,-5.0℃,0.0mm,06:59,16:53,平日,workday
2303,2019-01-12,2510,0,2510,200,2710,2710,0,0,stable,4.6℃,-5.0℃,0.0mm,06:59,16:53,平日,workday
2304,2019-01-12,334,0,334,26,360,360,0,0,stable,4.6℃,-5.0℃,0.0mm,06:59,16:53,平日,workday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4345,2019-02-01,908,0,908,72,980,980,0,0,stable,2.0℃,-6.8℃,0.0mm,06:50,17:14,平日,workday
4346,2019-02-01,908,0,908,72,980,980,0,0,stable,2.0℃,-6.8℃,0.0mm,06:50,17:14,平日,workday
4347,2019-02-01,908,0,908,72,980,980,0,0,stable,2.0℃,-6.8℃,0.0mm,06:50,17:14,平日,workday
4348,2019-02-01,-908,0,-908,-72,-980,-980,0,0,stable,2.0℃,-6.8℃,0.0mm,06:50,17:14,平日,workday


In [38]:
# Check dtypes and non-null counts after cleaning (Date is converted with pd.to_datetime in clean_data).
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48742 entries, 0 to 48741
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             48742 non-null  datetime64[ns]
 1   Gross Sales      48742 non-null  int64         
 2   Discounts        48742 non-null  int64         
 3   Net Sales        48742 non-null  int64         
 4   Tax              48742 non-null  int64         
 5   Total Collected  48742 non-null  int64         
 6   Cash             48742 non-null  int64         
 7   Card             48742 non-null  int64         
 8   PayPay           48742 non-null  int64         
 9   Weather Flow     48742 non-null  object        
 10  Max Temp         48742 non-null  object        
 11  Min Temp         48742 non-null  object        
 12  Rain             48742 non-null  object        
 13  Sunrise          48742 non-null  object        
 14  Sunset           48742 non-null  objec

In [67]:
# Top transaction by day: the largest 'Total Collected' value for each Date.
(
    cleaned_df
    .groupby('Date')[['Total Collected']]
    .max()
)

Unnamed: 0_level_0,Total Collected
Date,Unnamed: 1_level_1
2019-01-01,5400
2019-01-02,81930
2019-01-03,3970
2019-01-04,5800
2019-01-05,4300
...,...
2019-12-26,3279
2019-12-27,4251
2019-12-28,3920
2019-12-29,4900


In [71]:
# Total sales by day, overall and split by payment type.
(
    cleaned_df
    .groupby('Date')[['Total Collected', 'Cash', 'Card', 'PayPay']]
    .sum()
)

Unnamed: 0_level_0,Total Collected,Cash,Card,PayPay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,264412,252372,12040,0
2019-01-02,409942,401942,8000,0
2019-01-03,282040,271140,10900,0
2019-01-04,231773,216083,15690,0
2019-01-05,211258,201968,9290,0
...,...,...,...,...
2019-12-26,119938,101303,10957,7678
2019-12-27,151058,105405,34739,10914
2019-12-28,202009,150090,34490,17429
2019-12-29,182614,128165,39456,14993


In [69]:
# Average sale by day: mean 'Total Collected' over each Date's rows.
(
    cleaned_df
    .groupby('Date')[['Total Collected']]
    .mean()
)

Unnamed: 0_level_0,Total Collected
Date,Unnamed: 1_level_1
2019-01-01,1061.895582
2019-01-02,1246.024316
2019-01-03,956.067797
2019-01-04,969.761506
2019-01-05,964.648402
...,...
2019-12-26,862.863309
2019-12-27,868.149425
2019-12-28,966.550239
2019-12-29,1055.572254


In [78]:
# Transactions per day: number of non-null 'Gross Sales' entries for each Date.
(
    cleaned_df
    .groupby('Date')[['Gross Sales']]
    .count()
)

Unnamed: 0_level_0,Gross Sales
Date,Unnamed: 1_level_1
2019-01-01,249
2019-01-02,329
2019-01-03,295
2019-01-04,239
2019-01-05,219
...,...
2019-12-26,139
2019-12-27,174
2019-12-28,209
2019-12-29,173


In [84]:
# Total sales by national holiday (English label).
# Rows with no holiday carry the 'workday' label filled in by clean_data.
(
    cleaned_df
    .groupby('Holiday EN')[['Total Collected']]
    .sum()
)

Unnamed: 0_level_0,Total Collected
Holiday EN,Unnamed: 1_level_1
Children's Day,94324
Coming of Age Day,89267
Constitution Memorial Day,264990
Culture Day,116619
Foundation Day,159148
Greenery Day,228989
Health and Sports Day,125159
Labour Thanksgiving Day,183192
Marine Day,123517
Mountain Day,219729


In [81]:
# For Oscar :p -- total sales by national holiday, keyed by the Japanese label.
(
    cleaned_df
    .groupby('Holiday JP')[['Total Collected']]
    .sum()
)


Unnamed: 0_level_0,Total Collected
Holiday JP,Unnamed: 1_level_1
(敬老の日,112195
こどもの日,94324
みどりの日,228989
体育の日,125159
元日,264412
勤労感謝の日,183192
天皇誕生日,74746
山の日,219729
平日,41164870
建国記念の日,159148


In [82]:
# Total sales for each 'Weather Flow' label.
(
    cleaned_df
    .groupby('Weather Flow')[['Total Collected']]
    .sum()
)

Unnamed: 0_level_0,Total Collected
Weather Flow,Unnamed: 1_level_1
rain,3086267
sleet,137764
snow,514265
snow and then rain,86124
stable,39675238


In [88]:
# Persist the cleaned frame for later use.
# NOTE(review): to_csv writes the row index by default, so re-reading this file
# yields an extra 'Unnamed: 0' column; consider index=False if no consumer of
# final_set.csv relies on that column.
cleaned_df.to_csv(path_or_buf='final_set.csv')