In [None]:
# Load CSV
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Read the city table from the homework folder (path is relative to the kernel's working directory).
df = pd.read_csv('./HW3/EuCitiesTemperatures.csv')

# Impute missing coordinates: within each country, a missing latitude/longitude
# is replaced by that country's mean for the column, rounded to 2 decimals.
for coord in ('latitude', 'longitude'):
    coord_by_country = df.groupby('country')[coord]
    df[coord] = coord_by_country.transform(
        lambda values: values.fillna(round(values.mean(), 2))
    )

# Keep only the cities inside the box latitude [40, 60] x longitude [15, 30];
# `between` is inclusive on both ends, matching the >= / <= comparisons it replaces.
in_lat_band = df['latitude'].between(40, 60)
in_lon_band = df['longitude'].between(15, 30)
filtered_df = df[in_lat_band & in_lon_band]

# Count cities per country, then keep every country tied for the highest count.
country_counts = filtered_df['country'].value_counts()
top_count = country_counts.max()
max_cities = country_counts[country_counts.eq(top_count)]
print("Countries with most cities in lat[40~60] & lon[15~30]:")
print(max_cities)

# Create region_type and fill missing temperature values
# region_type is the string concatenation "<EU>_<coastline>" of the two columns.
df['region_type'] = df['EU'] + '_' + df['coastline']

# Mean temperature of each region type (missing temperatures are skipped by mean()).
region_avg_temp = df.groupby('region_type')['temperature'].mean()

# Vectorised fill: look up each row's region mean and use it only where the
# temperature is missing. Unlike a row-wise apply with region_avg_temp[...],
# this does not raise KeyError when region_type itself is NaN (the row simply
# stays NaN), and it avoids a Python-level loop over every row.
df['temperature'] = df['temperature'].fillna(df['region_type'].map(region_avg_temp))

# [Visualization 1] Bar chart - number of cities by region type
# value_counts() yields one bar per region type, ordered from most to fewest cities.
region_counts = df['region_type'].value_counts()

bar_fig = plt.figure(figsize=(8,5))
bar_ax = bar_fig.gca()
bar_ax.bar(region_counts.index, region_counts.values)
bar_ax.set_xlabel('Region Type')
bar_ax.set_ylabel('Number of Cities')
bar_ax.set_title('Number of Cities by Region Type')
# Tilt the category labels so the region names do not overlap.
plt.setp(bar_ax.get_xticklabels(), rotation=45)
bar_fig.tight_layout()
plt.show()

# [Visualization 2] Scatter plot of lat vs lon, colored by country
# One distinct colour per country, sampled evenly across the rainbow colormap.
countries = df['country'].unique()
colors = cm.rainbow(np.linspace(0, 1, len(countries)))
color_map = dict(zip(countries, colors))

# Slightly wider than tall so the legend placed outside the axes has room.
plt.figure(figsize=(10, 6))

# A single groupby pass replaces re-filtering the whole frame once per country;
# sort=False keeps the groups in first-appearance order, the same order as `countries`.
for country, subset in df.groupby('country', sort=False):
    plt.scatter(subset['longitude'], subset['latitude'], label=country, color=color_map[country], s=10)

plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("City Locations by Country")

# The per-country labels are only visible if a legend is drawn; without it the
# colours cannot be mapped back to countries. Anchor it outside the plotting
# area so it does not cover any points.
plt.legend(title='Country', bbox_to_anchor=(1.02, 1), loc='upper left',
           borderaxespad=0, ncol=2, fontsize='small')
plt.tight_layout()
plt.show()

# [Visualization 3] Histogram - population group (5 bins) vs countries
# Split the observed population range into 5 equal-width intervals.
bins = pd.cut(df['population'], bins=5)

# Count the distinct countries that have at least one city in each interval.
# `bins` is categorical, so state observed=False explicitly: it keeps intervals
# that contain no city on the x-axis and avoids the FutureWarning newer pandas
# emits when the argument is omitted for categorical groupers.
bin_counts = df.groupby(bins, observed=False)['country'].nunique()

bin_counts.plot(kind='bar', figsize=(8,5))
plt.xlabel('Population Group')
plt.ylabel('Number of Countries')
plt.title('Countries by Population Group (5 Bins)')
plt.tight_layout()
plt.show()

# [Visualization 4] Subplots by region type, color by temperature
def temp_color(t):
    """Map a temperature to a marker colour: above 10 -> 'red', below 6 -> 'blue', otherwise 'orange'."""
    return 'red' if t > 10 else 'blue' if t < 6 else 'orange'


# Rows without a region type cannot be placed in a panel, so drop NaN up front.
region_types = df['region_type'].dropna().unique()

# Size the grid from the number of region types (2 columns) instead of assuming
# exactly four: a fixed 2x2 grid raises IndexError with more types and leaves
# blank panels with fewer. Four types still yield the 2x2, 14x10 figure.
n_cols = 2
n_rows = max(1, (len(region_types) + n_cols - 1) // n_cols)
# squeeze=False guarantees a 2-D axes array even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, region in zip(panels, region_types):
    # Re-index from 0 so the x-axis is the city's position within its region.
    subset = df[df['region_type'] == region].reset_index(drop=True)
    colors = subset['temperature'].apply(temp_color)
    ax.scatter(subset.index, subset['latitude'], c=colors)
    ax.set_title(f'Region: {region}')
    ax.set_xlabel('City Index')
    ax.set_ylabel('Latitude')
    # One tick per city in this region.
    ax.set_xticks(range(len(subset)))
    ax.grid(True)

# Hide any trailing panel that has no region assigned (odd number of region types).
for unused_ax in panels[len(region_types):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [2]:
# Scratch cell: show the entries of the kernel's current working directory.
import os
os.listdir(os.curdir)

['.npm',
 '.config',
 '.cache',
 '.virtual_documents',
 'HW3',
 '.ipython',
 '.jupyter',
 '.local',
 '.ipynb_checkpoints',
 'shared']

In [3]:
# Scratch cell: report the directory the kernel is running in; relative paths
# passed to file-reading calls are resolved against it.
import os
working_dir = os.getcwd()
print(working_dir)

/home/jk2065


In [5]:
# Scratch cell: list the entries of the HW3 folder (path is relative to the
# kernel's working directory). Relies on `os` imported in an earlier cell.
hw_dir = 'HW3'
os.listdir(hw_dir)

['eucities_temps.ipynb',
 'GermanCredit.csv',
 'EuCitiesTemperatures.csv',
 'google_apps.ipynb',
 '.ipynb_checkpoints',
 'GooglePlaystore.xlsx',
 'german_credit.ipynb']