In this notebook I create a SQLite database from the [Recruit Restaurant Visitor Forecasting](https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/data) data and work through several data-wrangling tasks:


In [1]:
# import environment
import seaborn as sns

# Create a new SQLite database

SQLite automatically creates the database file if it does not already exist. I create a database file named ___visitor_forecasting.db___:

In [2]:
import sqlite3

# Connecting creates the database file on disk when it is not there yet.
db_file = 'visitor_forecasting.db'
con = sqlite3.connect(db_file)  # connection handle used by the cells below
cur = con.cursor()  # cursor bound to that connection, for raw SQL statements

Load the CSV data into SQLite tables using __pandas__:

In [3]:
import pandas as pd

# Directory holding the competition CSV files.
DATA_DIR = '../input/recruit-restaurant-visitor-forecasting-data'


def load_csv_into_sqlite(table_name, connection, data_dir=DATA_DIR):
    """Read ``<data_dir>/<table_name>.csv`` and mirror it into a SQLite table.

    Parameters
    ----------
    table_name : str
        Base name of the CSV file; also used as the SQLite table name.
    connection : sqlite3.Connection
        Open connection to the target database.
    data_dir : str
        Directory that contains the CSV file.

    Returns
    -------
    pandas.DataFrame
        The frame that was read from the CSV file (not re-read from SQLite).
    """
    frame = pd.read_csv(f'{data_dir}/{table_name}.csv')
    # if_exists='replace' drops any previous table of the same name, so re-running
    # the cell does not append the same rows twice; index=False keeps the pandas
    # row index out of the table.
    frame.to_sql(table_name, connection, if_exists='replace', index=False)
    return frame


air_store_info = load_csv_into_sqlite('air_store_info', con)
air_reserve = load_csv_into_sqlite('air_reserve', con)
hpg_reserve = load_csv_into_sqlite('hpg_reserve', con)

In [4]:
# Read the whole table back from SQLite to confirm the load round-trips.
store_info_query = 'SELECT * FROM air_store_info'
pd.read_sql(store_info_query, con)

In [5]:
# Read the whole table back from SQLite to confirm the load round-trips.
air_reserve_query = 'SELECT * FROM air_reserve'
pd.read_sql(air_reserve_query, con)

In [6]:
# Read the whole table back from SQLite to confirm the load round-trips.
hpg_reserve_query = 'SELECT * FROM hpg_reserve'
pd.read_sql(hpg_reserve_query, con)

# Handling missing data

In [7]:
# Tally the missing entries in each column of air_store_info.
null_flags = air_store_info.isna()  # True wherever a cell is missing
missing_values_count = null_flags.sum()
print(
    'The number of missing values per column in air_store_info:\n',
    missing_values_count,
)

In [8]:
# Tally the missing entries in each column of air_reserve.
null_flags = air_reserve.isna()  # True wherever a cell is missing
missing_values_count = null_flags.sum()
print(
    'The number of missing values per column in air_reserve:\n',
    missing_values_count,
)

In [9]:
# Tally the missing entries in each column of hpg_reserve.
null_flags = hpg_reserve.isna()  # True wherever a cell is missing
missing_values_count = null_flags.sum()
print(
    'The number of missing values per column in hpg_reserve:\n',
    missing_values_count,
)

# Checking data type for each column

In [10]:
# Preview the first rows of air_reserve (head() shows five by default).
air_reserve.head()

In [11]:
# Column dtypes of air_reserve as loaded by pd.read_csv (no dates parsed yet).
air_reserve.dtypes

In [12]:
# Preview the first rows of hpg_reserve (head() shows five by default).
hpg_reserve.head()

In [13]:
# Column dtypes of hpg_reserve as loaded by pd.read_csv (no dates parsed yet).
hpg_reserve.dtypes

In [14]:
# Preview the first rows of air_store_info (head() shows five by default).
air_store_info.head()

In [15]:
# Column dtypes of air_store_info as loaded by pd.read_csv.
air_store_info.dtypes

# Parsing dates

The datetime columns in air_reserve and hpg_reserve do not have a datetime dtype (their [dtype kind](http://docs.scipy.org/doc/numpy-1.12.0/reference/generated/numpy.dtype.kind.html#numpy.dtype.kind) is not `M`), so they have to be parsed before they can be used as dates. The timestamps follow the format '%Y-%m-%d %H:%M:%S'. I create new columns with the parsed datetimes:

In [16]:
# Parse the reservation and visit timestamps of air_reserve into two new
# datetime columns, leaving the original text columns untouched.
# Passing an explicit format keeps the parsing strict rather than inferred.
air_reserve['reserve_date_parsed'] = pd.to_datetime(air_reserve['reserve_datetime'], format="%Y-%m-%d %H:%M:%S")
air_reserve['visit_date_parsed'] = pd.to_datetime(air_reserve['visit_datetime'], format="%Y-%m-%d %H:%M:%S")

# Show both dtypes in one expression: a cell only displays its last expression,
# so two separate `.dtypes` lines would silently drop the first result.
air_reserve[['reserve_date_parsed', 'visit_date_parsed']].dtypes

In [17]:
# Parse hpg_reserve's own visit timestamps into a new datetime column.
# The source column must come from hpg_reserve, not air_reserve: a Series taken
# from another frame is aligned on the row index when assigned, so hpg_reserve
# would receive air_reserve's visit times on matching index labels and NaT on
# every other row.
# NOTE(review): assumes hpg_reserve.csv has a `visit_datetime` column in the same
# text format as air_reserve, per the competition data description - confirm.
hpg_reserve['date_parsed'] = pd.to_datetime(hpg_reserve['visit_datetime'], format="%Y-%m-%d %H:%M:%S")
hpg_reserve['date_parsed'].dtypes

In [18]:
# Sanity-check the parsing: day-of-month values must fall in 1..31, so a
# histogram with 31 bins makes a mis-parsed field easy to spot.
# dropna() discards any rows whose timestamp is missing (NaT).
day_reserve = air_reserve['reserve_date_parsed'].dt.day.dropna()

# Label the figure so it can be read on its own; the trailing `;` suppresses
# the textual repr of the value returned by ax.set().
ax = sns.histplot(day_reserve, bins=31)
ax.set(xlabel='Day of month', title='air_reserve: day of month of reserve_datetime');

In [19]:
# Sanity-check the parsing: day-of-month values must fall in 1..31, so a
# histogram with 31 bins makes a mis-parsed field easy to spot.
# dropna() discards any rows whose timestamp is missing (NaT).
day_visit = air_reserve['visit_date_parsed'].dt.day.dropna()

# Label the figure so it can be read on its own; the trailing `;` suppresses
# the textual repr of the value returned by ax.set().
ax = sns.histplot(day_visit, bins=31)
ax.set(xlabel='Day of month', title='air_reserve: day of month of visit_datetime');

In [20]:
# Sanity-check the parsing: day-of-month values must fall in 1..31, so a
# histogram with 31 bins makes a mis-parsed field easy to spot.
# dropna() discards any rows whose timestamp is missing (NaT).
day_hpg = hpg_reserve['date_parsed'].dt.day.dropna()

# Label the figure so it can be read on its own; the trailing `;` suppresses
# the textual repr of the value returned by ax.set().
ax = sns.histplot(day_hpg, bins=31)
ax.set(xlabel='Day of month', title='hpg_reserve: day of month of date_parsed');

# Inconsistent data entry

In [21]:
# Collect the distinct labels in 'air_genre_name' so that variants of the
# same genre (spelling, casing, stray spaces) can be spotted by eye.
genre_column = air_store_info['air_genre_name']
genre_name = genre_column.unique()
genre_name

# Removing duplicates

In [22]:
# Number of rows in air_store_info that exactly repeat an earlier row
# (duplicated() leaves the first occurrence unflagged by default).
air_store_info.duplicated().sum()

In [23]:
# Number of rows in hpg_reserve that exactly repeat an earlier row
# (duplicated() leaves the first occurrence unflagged by default).
hpg_reserve.duplicated().sum()

`.loc` accepts a boolean mask and keeps the rows where the mask is `True`. The first argument, `hpg_reserve.duplicated()`, is that mask: it is `True` for every row flagged as a duplicate. The second argument, `:`, selects all columns.

In [24]:
# Pull out the rows flagged as duplicates. duplicated() uses keep='first' by
# default, so the first occurrence of each repeated row is not flagged.
repeat_mask = hpg_reserve.duplicated()
hpg_reserve.loc[repeat_mask, :]

In [25]:
# Same extraction with keep='last': the last occurrence of each repeated row is
# left unflagged and every earlier occurrence is marked as the duplicate.
hpg_reserve.loc[hpg_reserve.duplicated(keep='last'), :]

In [26]:
# Remove repeated rows from hpg_reserve itself: inplace=True modifies the frame
# instead of returning a new one. With the default keep='first', the first
# occurrence of each repeated row is retained.
# NOTE(review): the hpg_reserve table written to SQLite earlier is not rewritten
# by this cell, so it still holds the duplicates unless a later cell re-exports it.
hpg_reserve.drop_duplicates(inplace=True)