# Preprocess the gasoline data

### Read the landing data

In [1]:
import os
import pandas as pd

In [2]:
# Load the "Data 1" worksheet of the landing workbook. The first two rows are
# skipped, and the next row is used as the header.
landing_file = "../data/landing/gasoline_data/gasoline_data.xls"
df = pd.read_excel(landing_file, sheet_name="Data 1", skiprows=2, header=0)

# Lower-case every header so columns can be referenced by lowercase names
lowercase_headers = df.columns.str.lower()
df.columns = lowercase_headers

#### Select the features/columns of interest (the date and New York's price data)

In [3]:
# Lower-cased header of the New York price series (note the double space
# before the opening parenthesis; it must match the worksheet header exactly)
column_name = "weekly new york regular conventional retail gasoline prices  (dollars per gallon)"

# Keep only the date and that series, giving the series a short name
df = df[['date', column_name]].rename(columns={column_name: 'gasoline_price'})

# Preview the first rows of the resulting two-column frame
df.head()


Unnamed: 0,date,gasoline_price
0,1990-08-20,
1,1990-08-27,
2,1990-09-03,
3,1990-09-10,
4,1990-09-17,


In [4]:
# Print a summary of the frame: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1722 entries, 0 to 1721
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            1722 non-null   datetime64[ns]
 1   gasoline_price  1211 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 27.0 KB


#### Filter the data so it contains only records from 2018

In [5]:
# Boolean mask: True where the row's date falls in calendar year 2018
in_2018 = df['date'].dt.year.eq(2018)
df = df.loc[in_2018]

# Preview the first rows of the 2018 subset
df.head()

Unnamed: 0,date,gasoline_price
1428,2018-01-01,2.545
1429,2018-01-08,2.6
1430,2018-01-15,2.628
1431,2018-01-22,2.632
1432,2018-01-29,2.683


In [6]:
# Number of rows remaining after the 2018 filter
len(df)

53

Save the 2018 subset to the raw data folder.

In [7]:
# Persist the 2018 subset to the raw data layer
output_relative_dir = '../data/raw/'

# Create one sub-directory per dataset written by this cell.
# os.makedirs creates any missing parent directories (including
# `output_relative_dir`), and exist_ok=True makes the call a no-op when the
# directory is already present, so no separate os.path.exists() checks are needed.
for target_dir in ['gasoline_data']:
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

# Output directory is `../data/raw/gasoline_data`
output_dir = output_relative_dir + 'gasoline_data'

# Save the filtered DataFrame as CSV in that directory, without the index column
output_file_path = f"{output_dir}/gasoline_data_2018.csv"
df.to_csv(output_file_path, index=False)

#### Data filtering/cleaning

In [8]:
# Reload the raw 2018 extract from disk. No date parsing is requested here;
# the `date` column is converted with pd.to_datetime in a later cell.
df = pd.read_csv("../data/raw/gasoline_data/gasoline_data_2018.csv")

In [9]:
# Row count before removing incomplete records
print(df.shape[0])

# Drop every row that has a missing value in any column
df_cleaned = df.dropna(axis=0, how="any")

# Row count after the drop, for comparison with the figure above
print(df_cleaned.shape[0])


53
53


In [10]:
# Exclude the 2018-12-31 record; `date` is compared against a string literal.
# NOTE(review): 2018-12-31 falls in ISO week 1 of 2019, so this row is presumably
# removed to avoid a second "week 1" once week numbers are derived. Confirm.
is_last_day_of_2018 = df_cleaned['date'] == '2018-12-31'
df_cleaned = df_cleaned[~is_last_day_of_2018]

There are no empty/null values: `dropna()` left the row count unchanged at 53.

In [11]:
# Display the cleaned frame (columns: date, gasoline_price)
df_cleaned

Unnamed: 0,date,gasoline_price
0,2018-01-01,2.545
1,2018-01-08,2.6
2,2018-01-15,2.628
3,2018-01-22,2.632
4,2018-01-29,2.683
5,2018-02-05,2.696
6,2018-02-12,2.688
7,2018-02-19,2.662
8,2018-02-26,2.638
9,2018-03-05,2.627


### Create a new feature representing the week number of the year (2018)

In [12]:
# Convert the 'Date' column to datetime format
df_cleaned['date'] = pd.to_datetime(df_cleaned['date'])
# Extract ISO year, ISO week number, and ISO weekday
iso_calendar = df_cleaned['date'].dt.isocalendar()
# Create new columns 'iso_year', 'iso_week', and 'iso_weekday'
df_cleaned['week_number'] = iso_calendar['week']


In [13]:
# Keep only the week number and the price, in that column order
curated_columns = ['week_number', 'gasoline_price']
df_cleaned = df_cleaned.loc[:, curated_columns]

In [14]:
# Display the curated frame (columns: week_number, gasoline_price)
df_cleaned

Unnamed: 0,week_number,gasoline_price
0,1,2.545
1,2,2.6
2,3,2.628
3,4,2.632
4,5,2.683
5,6,2.696
6,7,2.688
7,8,2.662
8,9,2.638
9,10,2.627


Save the result to the curated data folder.

In [15]:
# Persist the week-number/price table to the curated data layer
output_relative_dir = '../data/curated/'

# Create one sub-directory per dataset written by this cell.
# os.makedirs creates any missing parent directories (including
# `output_relative_dir`), and exist_ok=True makes the call a no-op when the
# directory is already present, so no separate os.path.exists() checks are needed.
for target_dir in ['gasoline_data']:
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

# Output directory is `../data/curated/gasoline_data`
output_dir = output_relative_dir + 'gasoline_data'

# Save the cleaned DataFrame as CSV in that directory, without the index column
output_file_path = f"{output_dir}/gasoline_data_2018.csv"
df_cleaned.to_csv(output_file_path, index=False)