In [4]:

import os
import zipfile
import pandas as pd
import geopandas as gpd
import re
from scipy.spatial.distance import pdist
import numpy as np
import plotly.express as px


In [5]:

# Set directory path and check for .zip files in the directory
data_dir = 'data'
zip_files = [f for f in os.listdir(data_dir) if f.endswith('.zip')]

# Unzip all found .zip files
for zip_file in zip_files:
    with zipfile.ZipFile(os.path.join(data_dir, zip_file), 'r') as zip_ref:
        zip_ref.extractall(data_dir)

# Look for .kml files after unzipping
kml_files = [f for f in os.listdir(data_dir) if f.endswith('.kml')]

# Initialize an empty list to store data
data = []

# Iterate over .kml files, reading each into GeoDataFrame and extracting necessary information
for kml_file in kml_files:
    kml_path = os.path.join(data_dir, kml_file)
    try:
        # Read the KML file with geopandas
        gdf = gpd.read_file(kml_path)
        
        # Append each row with the file name to data
        for _, row in gdf.iterrows():
            data.append({'file_name': kml_file, 'geometry': row['geometry']})
    except Exception as e:
        print(f"Could not read {kml_file}: {e}")

# Convert data list to DataFrame
df_kml = pd.DataFrame(data)


In [6]:

# Extract ID and remove unnecessary columns
df_kml['file_id'] = df_kml['file_name'].apply(lambda x: re.search(r'_(\d+)_', x).group(1) if re.search(r'_(\d+)_', x) else None)
df_kml = df_kml.drop(columns=['file_name'])


In [7]:

# Ensure lat and lon columns by calculating centroids
df_kml['lon'] = df_kml['geometry'].apply(lambda geom: geom.centroid.x if geom else None)
df_kml['lat'] = df_kml['geometry'].apply(lambda geom: geom.centroid.y if geom else None)

# Function to calculate average distance within each student's points
def average_distance_within_group(group):
    if len(group) < 2:  # If only one point, average distance is zero
        return 0
    coords = group[['lat', 'lon']].values
    return np.mean(pdist(coords))  # Calculate pairwise distances and take the mean

# Calculate avg_distance for each student
student_avg_dist = df_kml.groupby('Name').apply(average_distance_within_group).reset_index(name='avg_distance')
student_avg_dist['avg_distance'] = pd.to_numeric(student_avg_dist['avg_distance'], errors='coerce').fillna(0).astype(float)

# Sort by avg_distance and divide students into 4 equal groups
student_avg_dist['group'] = pd.qcut(student_avg_dist['avg_distance'], 4, labels=False) + 1
df_merged = df_kml.merge(student_avg_dist[['Name', 'group']], on='Name', how='left')

# Format 'Group' for display with first names and last initials
df_merged['FirstNameFormatted'] = df_merged['Name'].apply(lambda x: f"{x.split(', ')[1]} {x.split(', ')[0][0]}." if ', ' in x else x)
grouped_names = df_merged.groupby('group')['FirstNameFormatted'].unique().apply(lambda names: ', '.join(names)).reset_index()
grouped_names['Group'] = grouped_names['group'].apply(lambda x: f"Group {x}: ") + grouped_names['FirstNameFormatted']
df_merged = df_merged.merge(grouped_names[['group', 'Group']], on='group', how='left')

# Visualize on a Mapbox map
fig = px.scatter_mapbox(
    df_merged, lat="lat", lon="lon", color="group", title="Student Groups Based on Clustering",
    hover_name="Name", hover_data={'Group': True}, mapbox_style="carto-positron", zoom=10
)

# Center on Harrisonburg, VA
fig.update_layout(mapbox=dict(center=dict(lat=38.4496, lon=-78.8689), zoom=10), margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


KeyError: 'Name'