# Spatio-Temporal Data
Performed detailed data prepping and processing on image data

In [1]:
!pip install h2o


Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=1af16fdcb90c3ce8bbee40282409f256ab5eee8055ea634b55dd94989aa1fbff
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


## Step 1: EDA

In [4]:
import pandas as pd

# Load the new CSV dataset
pollution_data_path = '/content/pollution_data.csv'
pollution_df = pd.read_csv(pollution_data_path)

# Show the first few rows and summary statistics of the dataset
pollution_df.head(), pollution_df.describe(), pollution_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35596 entries, 0 to 35595
Data columns (total 71 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Date                            35596 non-null  object 
 1   City                            35596 non-null  object 
 2   County                          35596 non-null  object 
 3   State                           35596 non-null  object 
 4   Population Staying at Home      35596 non-null  object 
 5   Population Not Staying at Home  35596 non-null  object 
 6   mil_miles                       35596 non-null  float64
 7   past_week_avg_miles             35596 non-null  float64
 8   latitude                        35596 non-null  float64
 9   longitude                       35596 non-null  float64
 10  o3_min                          33950 non-null  float64
 11  o3_max                          33950 non-null  float64
 12  o3_median                       

(         Date         City      County State Population Staying at Home  \
 0  2019-01-01  albuquerque  bernalillo    NM                   1,77,171   
 1  2019-01-01      atlanta      fulton    GA                   2,83,093   
 2  2019-01-01       austin      travis    TX                   2,68,732   
 3  2019-01-01    baltimore     baltimo    MD                   1,91,486   
 4  2019-01-01        boise         ada    ID                   1,05,237   
 
   Population Not Staying at Home  mil_miles  past_week_avg_miles   latitude  \
 0                       4,99,456  19.038463                  0.0  35.107209   
 1                       7,63,811  29.295609                  0.0  33.760109   
 2                       9,76,194  27.286079                  0.0  30.283333   
 3                       4,09,167  12.961132                  0.0  39.305833   
 4                       3,63,293  12.640954                  0.0  43.604545   
 
     longitude  ...  pm10_max  pm10_median  pm10_variance  p

# Step 2: Data Processing

In [5]:
# Convert the 'Date' column to datetime format
pollution_df['Date'] = pd.to_datetime(pollution_df['Date'])

# Investigate missing values
missing_values_count = pollution_df.isnull().sum()

# Check data types of all columns
data_types = pollution_df.dtypes

missing_values_count, data_types


(Date                             0
 City                             0
 County                           0
 State                            0
 Population Staying at Home       0
                               ... 
 wind-gust_max                 1379
 wind-gust_median              1379
 wind-gust_variance            1379
 wind-gust_count               1379
 pp_feat                          0
 Length: 71, dtype: int64,
 Date                          datetime64[ns]
 City                                  object
 County                                object
 State                                 object
 Population Staying at Home            object
                                    ...      
 wind-gust_max                        float64
 wind-gust_median                     float64
 wind-gust_variance                   float64
 wind-gust_count                      float64
 pp_feat                              float64
 Length: 71, dtype: object)

In [6]:
from sklearn.impute import SimpleImputer

# Explore unique values in the 'Population Staying at Home' column
unique_population_home = pollution_df['Population Staying at Home'].unique()

# For numeric columns with missing values, use median imputation
numeric_cols = pollution_df.select_dtypes(include=['float64']).columns
imputer = SimpleImputer(strategy='median')
pollution_df[numeric_cols] = imputer.fit_transform(pollution_df[numeric_cols])

# Check if missing values are handled
missing_values_after_impute = pollution_df.isnull().sum()

unique_population_home, missing_values_after_impute


(array(['1,77,171', '2,83,093', '2,68,732', ..., '1,59,253', '76,894',
        '3,17,121'], dtype=object),
 Date                          0
 City                          0
 County                        0
 State                         0
 Population Staying at Home    0
                              ..
 wind-gust_max                 0
 wind-gust_median              0
 wind-gust_variance            0
 wind-gust_count               0
 pp_feat                       0
 Length: 71, dtype: int64)

In [7]:
# Convert 'Population Staying at Home' and 'Population Not Staying at Home' to integers
pollution_df['Population Staying at Home'] = pollution_df['Population Staying at Home'].str.replace(',', '').astype(int)
pollution_df['Population Not Staying at Home'] = pollution_df['Population Not Staying at Home'].str.replace(',', '').astype(int)

# Verify the data types again
pollution_df.dtypes


Date                          datetime64[ns]
City                                  object
County                                object
State                                 object
Population Staying at Home             int64
                                   ...      
wind-gust_max                        float64
wind-gust_median                     float64
wind-gust_variance                   float64
wind-gust_count                      float64
pp_feat                              float64
Length: 71, dtype: object

## Step 3: Feature Extraction

In [None]:
# Create a graph from the edge list
G = nx.from_pandas_edgelist(edgelist_df, 'repo_id', 'dev_id', ['isForked', 'isTopContributor'])

# Calculate graph-based features
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Create a DataFrame for these features
features_df = pd.DataFrame({
    'Node': list(degree_centrality.keys()),
    'DegreeCentrality': list(degree_centrality.values()),
    'BetweennessCentrality': list(betweenness_centrality.values()),
    'ClosenessCentrality': list(closeness_centrality.values())
})

# Show the first few rows of the features DataFrame
features_df.head()


Unnamed: 0,Node,DegreeCentrality,BetweennessCentrality,ClosenessCentrality
0,0,0.001693,0.104013,0.155527
1,1,0.000339,0.0,0.1346
2,2,0.001016,0.103095,0.140592
3,3,0.00237,0.358064,0.174993
4,4,0.001355,0.12955,0.130249


## Step 4: Clustering & Anomaly detection

In [9]:
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

from sklearn.preprocessing import StandardScaler

# Select a subset of numerical columns for clustering
numerical_cols = pollution_df.select_dtypes(include=['float64', 'int64']).columns
clustering_data = pollution_df[numerical_cols]

# Standardize the data
scaler = StandardScaler()
clustering_data_scaled = scaler.fit_transform(clustering_data)

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(clustering_data_scaled)

# Add cluster labels to the original DataFrame
pollution_df['Cluster'] = clusters

# Show the first few rows with cluster labels
pollution_df.head()




Unnamed: 0,Date,City,County,State,Population Staying at Home,Population Not Staying at Home,mil_miles,past_week_avg_miles,latitude,longitude,...,pm10_median,pm10_variance,pm10_count,wind-gust_min,wind-gust_max,wind-gust_median,wind-gust_variance,wind-gust_count,pp_feat,Cluster
0,2019-01-01,albuquerque,bernalillo,NM,177171,499456,19.038463,0.0,35.107209,-106.617209,...,18.0,5303.29,48.0,0.3,27.3,3.8,839.84,54.0,0.319492,0
1,2019-01-01,atlanta,fulton,GA,283093,763811,29.295609,0.0,33.760109,-84.402826,...,14.0,280.83,23.0,0.1,4.5,1.7,12.11,44.0,23.304216,0
2,2019-01-01,austin,travis,TX,268732,976194,27.286079,0.0,30.283333,-97.750641,...,15.0,422.12,48.0,0.2,14.0,6.8,162.34,67.0,14.333042,0
3,2019-01-01,baltimore,baltimo,MD,191486,409167,12.961132,0.0,39.305833,-76.610417,...,6.0,74.76,24.0,0.1,24.0,8.5,355.46,70.0,6.40324,0
4,2019-01-01,boise,ada,ID,105237,363293,12.640954,0.0,43.604545,-116.202424,...,5.0,170.86,47.0,0.1,11.3,2.2,46.19,92.0,0.0,0


## Model Building & Auto ML

In [11]:
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O cluster
h2o.init()

# Convert DataFrame to H2O Frame
pollution_h2o = h2o.H2OFrame(pollution_df)

# Split data into training and test sets
train, test = pollution_h2o.split_frame([0.8])

# Identify predictors and target
predictors = numerical_cols.tolist()
target = 'TargetColumn'  # Replace with your actual target column

# Run AutoML
aml = H2OAutoML(max_runtime_secs=3600, seed=42)

# View the AutoML Leaderboard
lb = aml.leaderboard
print(lb)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,36 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_unknownUser_nq6c9r
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.158 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
H2OFrame is empty.
