In [1]:
import pandas as pd
import numpy as np

In [2]:
# Record the installed pandas version so the outputs below can be tied to it.
print(pd.__version__)

2.2.3


In [3]:
# Load the city temperature dataset; the relative path is resolved against the working directory.
df = pd.read_csv("EuCitiesTemperatures.csv")

In [4]:
# Preview the first rows to check the columns and spot missing entries.
df.head()

Unnamed: 0,city,country,population,EU,coastline,latitude,longitude,temperature
0,Elbasan,Albania,2.9,no,yes,41.12,20.08,15.18
1,Andorra,Andorra,0.07,no,no,42.5,1.52,
2,Innsbruck,Austria,8.57,yes,no,,11.41,4.54
3,Graz,Austria,8.57,yes,no,47.08,,6.91
4,Linz,Austria,8.57,yes,no,48.32,14.29,6.79


In [None]:
# Independent copy of the loaded frame, so changes made to `df` do not affect it.
# (The previous `df.co` was an unfinished expression that raises AttributeError.)
df_copy = df.copy()

In [23]:
# Extra spellings of "missing" that should be parsed as NaN (extend as needed).
na_markers = ["", "NA", "null", "Null","NaN"]

# Parse the CSV, mapping every marker listed above to NaN.
df = pd.read_csv('EuCitiesTemperatures.csv', na_values=na_markers)
print("Initial DataFrame shape:", df.shape)
print("Missing values per column before processing:")
print(df.isna().sum())

# Force the coordinate and temperature columns to a numeric dtype;
# errors='coerce' turns anything unparseable into NaN instead of raising.
for numeric_col in ('latitude', 'longitude', 'temperature'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Show the resulting dtypes so the conversion can be checked by eye.
print("\nData types after conversion:")
print(df.dtypes)

### Task 1: Fill Missing Latitude and Longitude Values
# Each gap is replaced by the mean of that column over the city's country,
# rounded to two decimals. Both coordinates get the same treatment, so loop.
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = df.groupby('country')[coord_col].transform(
        lambda values: values.fillna(round(values.mean(), 2))
    )

# Report how many gaps remain in each coordinate column.
print("\nAfter filling missing coordinates:")
print("Missing latitude values:", df['latitude'].isna().sum())
print("Missing longitude values:", df['longitude'].isna().sum())

### Task 2: Filter Cities in the Geographical Band
# Band limits, both ends inclusive: latitude 40..60 and longitude 15..30.
in_lat_band = df['latitude'].between(40, 60)
in_lon_band = df['longitude'].between(15, 30)
subset_df = df[in_lat_band & in_lon_band]
print("\nNumber of cities in the specified geographical band:", len(subset_df))

# Tally the band's cities per country.
city_counts = subset_df['country'].value_counts()
print("\nCity counts per country in the geographical band:")
print(city_counts)

# Report every country tied for the highest count, or say that the band is empty.
if city_counts.empty:
    print("No cities found in the specified geographical band.")
else:
    max_count = city_counts.max()
    countries_with_max = city_counts.loc[city_counts.eq(max_count)].index.tolist()
    print("\nCountry/ies with the maximum number of cities in this band:")
    print(countries_with_max, "with", max_count, "cities.")

### Task 3: Fill Missing Temperature Values by Region Type
# A region type is one ('EU', 'coastline') combination. A missing temperature
# takes the mean temperature of its region type, rounded to two decimals.
region_keys = ['EU', 'coastline']

# Print the per-region means first so the filled-in values can be cross-checked.
group_temp_means = df.groupby(region_keys)['temperature'].mean()
print("\nGroup temperature means (before filling missing values):")
print(group_temp_means)

df['temperature'] = df.groupby(region_keys)['temperature'].transform(
    lambda temps: temps.fillna(round(temps.mean(), 2))
)

# Report how many temperature gaps remain.
print("\nAfter filling missing temperature values:")
print("Missing temperature values:", df['temperature'].isna().sum())

# Save the processed DataFrame to a new CSV file.
# index=False keeps the row index out of the file, so only the data columns are written.
output_file = 'EuCitiesTemperatures_processed.csv'
df.to_csv(output_file, index=False)
print("\nProcessed data saved to:", output_file)

Initial DataFrame shape: (213, 8)
Missing values per column before processing:
city            0
country         0
population      0
EU              0
coastline       0
latitude       62
longitude      62
temperature    64
dtype: int64

Data types after conversion:
city            object
country         object
population     float64
EU              object
coastline       object
latitude       float64
longitude      float64
temperature    float64
dtype: object

After filling missing coordinates:
Missing latitude values: 0
Missing longitude values: 0

Number of cities in the specified geographical band: 63

City counts per country in the geographical band:
country
Romania           9
Poland            9
Belarus           6
Bulgaria          5
Hungary           4
Lithuania         3
Turkey            3
Serbia            3
Moldova           2
Sweden            2
Slovakia          2
Ukraine           2
Latvia            2
Italy             2
Estonia           2
Czech Republic    2
Macedonia

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# ------------------------------------------------------------------------------
# 1. Bar Chart: Number of cities by region type (EU, coastline)
# ------------------------------------------------------------------------------

# Row count for every ('EU', 'coastline') combination present in df.
region_counts = df.groupby(['EU', 'coastline']).size()

# Readable tick text built from each (EU, coastline) index pair.
bar_labels = [f"EU:{e}, Coast:{c}" for e, c in region_counts.index]

# Draw on an explicit Axes rather than relying on pyplot's implicit current axes.
bar_fig, bar_ax = plt.subplots()
bar_ax.bar(bar_labels, region_counts.values, color='skyblue')
bar_ax.set_xlabel('Region Type')
bar_ax.set_ylabel('Number of Cities')
bar_ax.set_title('Number of Cities by Region Type')
# Tilt the category labels so the four region names do not overlap.
bar_ax.tick_params(axis='x', labelrotation=45)
bar_fig.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# 2. Scatter Plot: Latitude vs Longitude with points colored by country
# ------------------------------------------------------------------------------

plt.figure()

# One color per country, in order of first appearance in the data.
unique_countries = df['country'].unique()
n_countries = len(unique_countries)

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap(name, lut) is the supported call and returns the map resampled
# to `lut` entries, so cmap(idx) with an integer idx picks entry number idx.
# tab20 holds only 20 distinct colors: with more countries than that, switch to
# a continuous map sampled at evenly spaced points so no two countries share a color.
cmap_name = 'tab20' if n_countries <= 20 else 'nipy_spectral'
cmap = plt.get_cmap(cmap_name, max(n_countries, 1))  # a lookup table needs at least one entry

# Plot each country as its own series so it gets its own color and legend entry.
for idx, country in enumerate(unique_countries):
    subset = df[df['country'] == country]
    plt.scatter(subset['longitude'], subset['latitude'],
                color=cmap(idx), label=country, alpha=0.7)

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Map-like Scatter Plot of Cities\n(Longitude vs. Latitude, colored by Country)')
# Anchor the legend outside the axes (to the right) so it does not cover the points.
plt.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# 3. Histogram: Distribution of countries by population groups (5 bins)
# ------------------------------------------------------------------------------

# Collapse the city rows to distinct (country, population) pairs.
# NOTE(review): this counts each country once only if every row of a country
# carries the same population value — confirm against the data.
pop_by_country = df[['country', 'population']].drop_duplicates()

plt.figure()
# Plot the histogram with 5 equal-width bins and keep the bin edges for the tick labels.
# The patches are bound to `_patches` rather than `_`, because assigning `_` at
# notebook top level shadows IPython's "last output" variable.
counts, bin_edges, _patches = plt.hist(pop_by_country['population'], bins=5, edgecolor='black')
# Place one tick at the middle of each bin.
bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
# Label each bin with its range. One decimal is kept because int() truncation
# would shift every non-integer edge down and misstate the bin boundaries.
bin_labels = [f"{lo:.1f}-{hi:.1f}" for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
plt.xlabel('Population Group')
plt.ylabel('Number of Countries')
plt.title('Histogram of Countries by Population Group')
plt.xticks(bin_centers, bin_labels)
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------------------
# 4. Subplots (2x2): Scatter plots of Latitude vs. City Index for each region type
# ------------------------------------------------------------------------------

# The four (EU, coastline) combinations, one per panel.
region_types = [('yes', 'yes'), ('yes', 'no'), ('no', 'yes'), ('no', 'no')]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Flatten the 2x2 grid so panels can be addressed with a single index.
axes = axes.flatten()

for i, (eu_val, coast_val) in enumerate(region_types):
    ax = axes[i]
    # Rows belonging to the current region type.
    region_df = df[df['EU'].eq(eu_val) & df['coastline'].eq(coast_val)]

    # Position of each city within the region (0 to n-1) serves as the x value.
    n = len(region_df)
    x = np.arange(n)

    # Color rule: above 10 -> 'red', below 6 -> 'blue', everything else -> 'orange'.
    temps = region_df['temperature']
    color_series = pd.Series(
        np.select([temps > 10, temps < 6], ['red', 'blue'], default='orange'),
        index=temps.index,
    )

    ax.scatter(x, region_df['latitude'], c=color_series)
    # One tick per city index; with many cities the tick count may need thinning.
    ax.set(
        title=f"Region: EU = {eu_val}, Coastline = {coast_val}",
        xlabel="City Index",
        ylabel="Latitude",
        xticks=x,
    )

fig.suptitle("Latitude vs. City Index for Each Region Type (colored by Temperature)", fontsize=16)
# The rect leaves headroom at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
