## Loading and Merging Data from CSV files

In [241]:
import pandas as pd

# CSV exports to load: one file per month, January through June 2024
file_names = [
    'Los_Angeles_January_2024.csv','Los_Angeles_February_2024.csv','Los_Angeles_March_2024.csv',
    'Los_Angeles_April_2024.csv','Los_Angeles_May_2024.csv','Los_Angeles_June_2024.csv'
]
# Holds one DataFrame per file, in the same order as file_names
dataframes = []

# Read each monthly CSV into its own DataFrame.
# pd.read_csv raises if a file cannot be read, so the cell stops here rather
# than continuing with a partial list.
for file_name in file_names:
    df = pd.read_csv(file_name)
    dataframes.append(df)

# Stack the monthly frames into a single DataFrame with a fresh 0..n-1 index.
# This is done unconditionally: the earlier `if dataframes:` guard would have
# left merged_LA_df undefined on an empty list, and the failure would only show
# up in a later cell as a NameError. pd.concat reports the empty case directly.
merged_LA_df = pd.concat(dataframes, ignore_index=True)

## Data Inspection

In [242]:
# Preview the top five rows of the merged data as a quick sanity check
print(merged_LA_df.head(n=5))

   location_id         location_name parameter  value unit  \
0         7936  Los Angeles - N. Mai        co    0.3  ppm   
1         7936  Los Angeles - N. Mai        co    0.3  ppm   
2         7936  Los Angeles - N. Mai        co    0.5  ppm   
3         7936  Los Angeles - N. Mai        co    0.6  ppm   
4         7936  Los Angeles - N. Mai        co    0.8  ppm   

                 datetimeUtc              datetimeLocal             timezone  \
0  2024-01-01T01:00:00+00:00  2023-12-31T17:00:00-08:00  America/Los_Angeles   
1  2024-01-01T02:00:00+00:00  2023-12-31T18:00:00-08:00  America/Los_Angeles   
2  2024-01-01T03:00:00+00:00  2023-12-31T19:00:00-08:00  America/Los_Angeles   
3  2024-01-01T04:00:00+00:00  2023-12-31T20:00:00-08:00  America/Los_Angeles   
4  2024-01-01T05:00:00+00:00  2023-12-31T21:00:00-08:00  America/Los_Angeles   

    latitude   longitude  country_iso  isMobile  isMonitor  \
0  34.066429 -118.226755          NaN       NaN        NaN   
1  34.066429 -118.2267

In [243]:
# Print a structural summary of the merged DataFrame: the index range, each
# column's non-null count and dtype, and the approximate memory usage
merged_LA_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20067 entries, 0 to 20066
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   location_id    20067 non-null  int64  
 1   location_name  20067 non-null  object 
 2   parameter      20067 non-null  object 
 3   value          20067 non-null  float64
 4   unit           20067 non-null  object 
 5   datetimeUtc    20067 non-null  object 
 6   datetimeLocal  20067 non-null  object 
 7   timezone       20067 non-null  object 
 8   latitude       20067 non-null  float64
 9   longitude      20067 non-null  float64
 10  country_iso    0 non-null      float64
 11  isMobile       0 non-null      float64
 12  isMonitor      0 non-null      float64
 13  owner_name     20067 non-null  object 
 14  provider       20067 non-null  object 
dtypes: float64(6), int64(1), object(8)
memory usage: 2.3+ MB


## Data Cleaning

#Handling missing values

In [244]:
# Tally the missing (NaN) entries in every column
merged_LA_df.isna().sum()


location_id          0
location_name        0
parameter            0
value                0
unit                 0
datetimeUtc          0
datetimeLocal        0
timezone             0
latitude             0
longitude            0
country_iso      20067
isMobile         20067
isMonitor        20067
owner_name           0
provider             0
dtype: int64

In [245]:
# Columns in which every value is missing, so they carry no information
columns_to_drop=['country_iso','isMobile','isMonitor']
# Remove them by rebinding the name instead of mutating in place, and ignore
# columns that are already gone, so re-running this cell does not raise KeyError
merged_LA_df = merged_LA_df.drop(columns=columns_to_drop, errors='ignore')
# Preview the first rows to confirm the columns were removed
print(merged_LA_df.head())

   location_id         location_name parameter  value unit  \
0         7936  Los Angeles - N. Mai        co    0.3  ppm   
1         7936  Los Angeles - N. Mai        co    0.3  ppm   
2         7936  Los Angeles - N. Mai        co    0.5  ppm   
3         7936  Los Angeles - N. Mai        co    0.6  ppm   
4         7936  Los Angeles - N. Mai        co    0.8  ppm   

                 datetimeUtc              datetimeLocal             timezone  \
0  2024-01-01T01:00:00+00:00  2023-12-31T17:00:00-08:00  America/Los_Angeles   
1  2024-01-01T02:00:00+00:00  2023-12-31T18:00:00-08:00  America/Los_Angeles   
2  2024-01-01T03:00:00+00:00  2023-12-31T19:00:00-08:00  America/Los_Angeles   
3  2024-01-01T04:00:00+00:00  2023-12-31T20:00:00-08:00  America/Los_Angeles   
4  2024-01-01T05:00:00+00:00  2023-12-31T21:00:00-08:00  America/Los_Angeles   

    latitude   longitude                         owner_name provider  
0  34.066429 -118.226755  Unknown Governmental Organization   AirNow  
1  3

#Checking for Duplicate Values

In [246]:
# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_row_mask = merged_LA_df.duplicated()
merged_LA_df_dup = duplicate_row_mask.sum()
# Show the duplicate count as the cell output
merged_LA_df_dup

0

#Converting data types for further Analysis

In [247]:
# Parse the 'datetimeUtc' strings into pandas datetime values and store them
# back in the same column so the .dt accessor can be used on it later
utc_timestamps = pd.to_datetime(merged_LA_df['datetimeUtc'])
merged_LA_df['datetimeUtc'] = utc_timestamps


## Data Manipulation

#Pivot the DataFrame to Have the Pollutants as Columns

In [248]:
# Column layout of the wide table: identifiers first, then one column per
# pollutant, then the measurement unit
pivot_column_order = ['location_name', 'datetimeUtc','datetimeLocal','co','no2','o3','pm10','pm25', 'unit']

# Reshape from long format (one row per reading) to wide format (one column per
# pollutant). Readings that share the same timestamp, location and unit are
# averaged. The steps are chained so that no intermediate frame is modified in
# place; filling a column-selected frame in place can trigger pandas'
# SettingWithCopyWarning.
#
# NOTE(review): 'unit' is part of the pivot index, so pollutants reported in
# different units land in separate rows for the same timestamp (ppm rows and
# µg/m³ rows). The pollutant columns that do not belong to a row's unit are NaN
# after the pivot and are replaced with 0 below; such a 0 means "not reported in
# this row", not a measured concentration of zero. Keep this in mind before
# averaging these columns.
merged_LA_df_pivot = (
    pd.pivot_table(merged_LA_df,
                   index=['datetimeUtc','datetimeLocal','location_name', 'unit'],
                   columns='parameter',
                   values='value',
                   aggfunc='mean')
    # Turn the multi-level index back into ordinary columns
    .reset_index()
    # Select and order the columns as listed above
    .loc[:, pivot_column_order]
    # Replace the NaN cells produced by the pivot with 0
    .fillna(0)
)

#Create Separate DataFrames for Each Month

In [249]:
import pandas as pd
# Month number of each row's UTC timestamp, computed once and reused for every
# monthly filter below. Rows are grouped by their UTC month, not by the month of
# 'datetimeLocal' (e.g. 2023-12-31 17:00 local time counts as January).
utc_month = merged_LA_df_pivot['datetimeUtc'].dt.month

# One DataFrame per calendar month, January through June
January_LA_df = merged_LA_df_pivot[utc_month == 1]
February_LA_df = merged_LA_df_pivot[utc_month == 2]
March_LA_df = merged_LA_df_pivot[utc_month == 3]
April_LA_df = merged_LA_df_pivot[utc_month == 4]
May_LA_df = merged_LA_df_pivot[utc_month == 5]
June_LA_df = merged_LA_df_pivot[utc_month == 6]

# Show the first rows of the January frame as a spot check
print(January_LA_df.head())

parameter         location_name               datetimeUtc  \
0          Los Angeles - N. Mai 2024-01-01 01:00:00+00:00   
1          Los Angeles - N. Mai 2024-01-01 01:00:00+00:00   
2          Los Angeles - N. Mai 2024-01-01 02:00:00+00:00   
3          Los Angeles - N. Mai 2024-01-01 02:00:00+00:00   
4          Los Angeles - N. Mai 2024-01-01 03:00:00+00:00   

parameter              datetimeLocal   co    no2     o3  pm10  pm25   unit  
0          2023-12-31T17:00:00-08:00  0.3  0.007  0.030   0.0   0.0    ppm  
1          2023-12-31T17:00:00-08:00  0.0  0.000  0.000   7.0   3.0  µg/m³  
2          2023-12-31T18:00:00-08:00  0.3  0.016  0.022   0.0   0.0    ppm  
3          2023-12-31T18:00:00-08:00  0.0  0.000  0.000  11.0   8.5  µg/m³  
4          2023-12-31T19:00:00-08:00  0.5  0.021  0.013   0.0   0.0    ppm  
