In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Figure 4

In [None]:
# Load the review table; the figure cells in this notebook all read from `df2`
# (directly, or through the GeoDataFrame built from it).
df2 = pd.read_csv("machine_readable_review.csv")

In [None]:
num_bins = 50
fig, ax = plt.subplots(1, 2, figsize=(6, 2.75), dpi=200)

# One (classification column, y-axis label) pair per panel, left to right.
panels = [
    ('Sarah Koppen', "Köppen climate classification"),
    ('Sarah CAVM veg community', "CAVM classification"),
]

# Both panels are 2-D histograms of latitude against a classification column;
# only the y column and its axis label differ.
for panel_ax, (class_column, y_label) in zip(ax, panels):
    sns.histplot(
        data=df2,
        x='Latitude (N)',
        y=class_column,
        bins=num_bins,
        cmap='cool',
        cbar=True,
        ax=panel_ax,
    )
    panel_ax.set_xlim(30, 90)
    panel_ax.set_ylabel(y_label)

fig.tight_layout()

# Figure 1

In [None]:
# Turn the longitude/latitude columns into point geometries (EPSG:4326) so the
# review sites can be reprojected and spatially joined in the cells below.
site_points = gpd.points_from_xy(x=df2['Longitude (E)'], y=df2['Latitude (N)'])
gdf = gpd.GeoDataFrame(df2, geometry=site_points, crs='epsg:4326')

# Background layers drawn underneath the sites in the two map panels.
arctic = gpd.read_file('Major_Ocean_Currents.geojson')
antarctica = gpd.read_file('geoBoundaries-ATA-ADM0_simplified.geojson')


In [None]:
def _buffered_sites(sites, projected_crs, radius=100000):
    """Return `sites` with each geometry replaced by a buffer around it.

    The buffer is computed in `projected_crs` with a distance of `radius`
    CRS units (metres for the polar stereographic projections used here),
    then converted back to EPSG:4326.
    """
    buffers = sites.to_crs(projected_crs).buffer(radius).to_crs("EPSG:4326")
    return gpd.GeoDataFrame(sites, geometry=buffers)


# Northern-hemisphere sites are buffered in EPSG:3411, southern ones in EPSG:3031.
northern_sites = gdf.loc[gdf['Latitude (N)'] > 0].dropna(subset='Latitude (N)')
southern_sites = gdf.loc[gdf['Latitude (N)'] < 0].dropna(subset='Latitude (N)')

arctic_buffers = _buffered_sites(northern_sites, "EPSG:3411")
antarctic_buffers = _buffered_sites(southern_sites, "EPSG:3031")

In [None]:
def _sites_with_neighbour_counts(sites, buffers):
    """Attach to every buffer row the number of sites that fall inside it.

    `sites` is spatially joined against `buffers`; the matches are counted per
    buffer (the `index_right` column produced by the join) and stored in a
    `points` column. The returned frame carries the site point geometry
    instead of the buffer polygons.
    """
    matches = gpd.sjoin(sites, buffers)
    sites_per_buffer = matches.groupby("index_right").size().rename("points")
    counted = buffers.join(sites_per_buffer, how="left")
    return gpd.GeoDataFrame(counted, geometry=sites.geometry)


arctic_overlaps = _sites_with_neighbour_counts(
    gdf.loc[gdf['Latitude (N)'] > 0].dropna(subset='Latitude (N)'),
    arctic_buffers,
)

antarctic_overlaps = _sites_with_neighbour_counts(
    gdf.loc[gdf['Latitude (N)'] < 0].dropna(subset='Latitude (N)'),
    antarctic_buffers,
)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(7, 7), dpi=200)
arctic_ax, antarctic_ax = ax

# Outline shared by the site markers in both panels.
marker_outline = dict(edgecolor='k', linewidth=0.5)

# Right panel: Antarctic landmass in gray underneath the sites, all in EPSG:3031.
antarctica.to_crs('EPSG:3031').plot(ax=antarctic_ax, color='gray', zorder=0)
antarctic_sites = antarctic_overlaps.to_crs('EPSG:3031')
antarctic_sites.plot(
    ax=antarctic_ax,
    markersize=antarctic_overlaps['points'] * 3,  # marker size scales with the `points` count
    **marker_outline,
)

# Left panel: the rows with COMBO == 'o' are drawn in white on a gray axes
# background (presumably ocean polygons, leaving land gray -- confirm against
# the layer's schema), all in EPSG:3411.
white_layer = arctic.loc[arctic['COMBO'] == 'o']
white_layer.to_crs('EPSG:3411').plot(ax=arctic_ax, color='w', zorder=0)
arctic_ax.set_facecolor('gray')

arctic_sites = arctic_overlaps.to_crs('EPSG:3411')
arctic_sites.plot(
    ax=arctic_ax,
    markersize=arctic_overlaps['points'] * 3,
    **marker_outline,
)

# Crop the Arctic panel; limits are in EPSG:3411 coordinates.
arctic_ax.set_ylim(-2e6, 5.5e6)
arctic_ax.set_xlim(-5e6, 5e6)

# Figure 2

In [None]:
# Stringify and lower-case every definition so word matching is case-insensitive.
# Non-string entries go through str() as well (a missing value becomes 'nan').
df2['defs'] = df2['defs'].map(lambda definition: str(definition).lower())

# Number of rows in the review table.
len(df2)

In [None]:
# Vocabulary used to filter the per-region word counts below: the lower-cased
# 'Text' column of the word-cloud table, with 'water' excluded.
words = pd.read_csv("word_cloud_counts.csv")

# Filter instead of list.remove(): remove() drops only the first match (so a
# second 'water' produced by lower-casing, e.g. from 'Water', would survive)
# and raises ValueError when the word is not present at all.
word_list = [
    lowered
    for lowered in (text.lower() for text in words['Text'].tolist())
    if lowered != 'water'
]

In [None]:
# pandas and matplotlib.pyplot are already imported in the first cell; only
# Counter is new here (the next cell relies on it as well).
from collections import Counter

# For each region: concatenate all of its definitions, split the text on
# whitespace into one list of tokens, then tally the tokens with a Counter.
subregion_counts = (
    df2.groupby('region_code')['defs']
    .apply(lambda region_defs: ' '.join(region_defs).split())
    .apply(Counter)
)
subregion_counts

In [None]:
# Tally every whitespace-separated token of 'defs' within each region.
region_word_counts = {
    subregion: Counter(' '.join(group['defs']).split())
    for subregion, group in df2.groupby('region_code')
}

# Build the word x region table in one step (rows = words, columns = regions)
# instead of joining one frame per region inside a loop. Building it at once
# also keeps a column for a region whose definitions contain no tokens; the
# previous `count_df.empty` check discarded such a region. Words absent from a
# region count as 0, and rows are sorted by word for a deterministic order.
count_df = pd.DataFrame(region_word_counts).fillna(0).sort_index()

# Keep only the words from the word-cloud vocabulary. `.copy()` makes this an
# independent frame so that adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
filtered_counts = count_df.loc[count_df.index.intersection(word_list)].copy()

In [None]:
# Row totals across the region columns. Any 'sum' column left over from a
# previous run of this cell is dropped first, so re-running the cell does not
# add the old total into the new one; assign() returns a new frame rather than
# writing into a slice.
region_only = filtered_counts.drop(columns='sum', errors='ignore')
filtered_counts = region_only.assign(sum=region_only.sum(axis=1))

# The 19 words with the highest total count.
filtered_counts['sum'].sort_values(ascending=False).head(19)

In [None]:
# Words whose total count exceeds 4, ordered from most to least frequent; the
# helper 'sum' column is dropped so only the region columns are stacked.
frequent_words = filtered_counts.loc[filtered_counts['sum'] > 4]
stack_table = frequent_words.sort_values('sum', ascending=False).drop('sum', axis=1)

word_ax = stack_table.plot(kind='bar', stacked=True)
word_ax.set_xlabel('Word in definition')
word_ax.set_ylabel('Frequency')
word_ax.set_title('Frequency of word in definition by region')
word_ax.legend(title='Region')