In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import functions as f
import pandas as pd

# Paths of the four raw CSV inputs, relative to this notebook.
url_1, url_2, url_3, url_4 = (
    '../data/raw/SYB67_176_202411_Tourist-Visitors Arrival and Expenditure.csv',
    '../data/raw/SYB67_285_202411_Research and Development Expenditure and Staff.csv',
    '../data/raw/SYB67_328_202411_Intentional homicides and other crimes.csv',
    '../data/raw/SYB67_329_202411_Labour Force and Unemployment.csv',
)

# Read all four files first (same order as the paths above) ...
df_1, df_2, df_3, df_4 = map(pd.read_csv, (url_1, url_2, url_3, url_4))

# ... then pass each frame, together with its source path, through the
# project's row/column cleaning helper.
df_1, df_2, df_3, df_4 = map(
    f.clean_row_column,
    (df_1, df_2, df_3, df_4),
    (url_1, url_2, url_3, url_4),
)

# Drop the footnote column from the tourism frame before combining.
# errors='ignore' keeps the cell re-runnable if the column is already gone.
df_1 = df_1.drop(columns='Tourism arrivals series type footnote', errors='ignore')

# Tag each frame with a "Type" label naming the dataset it came from
# (done by the project helper f.insert_column).
df_1 = f.insert_column(df_1,"Type","Tourism")
df_2 = f.insert_column(df_2,"Type","R&D")
df_3 = f.insert_column(df_3,"Type","Crime")
df_4 = f.insert_column(df_4,"Type","Labour & Unemployment")

# Stack the four frames row-wise; a column absent from a frame becomes NaN there.
df = pd.concat([df_1,df_2,df_3,df_4], axis=0)

# Replace every missing cell (in all columns) with the literal string "N/A".
df = df.fillna("N/A")

# Normalise column names: strip trailing whitespace, spaces -> underscores, lower-case.
# Assignment instead of inplace=True so the statement reads as a plain transformation.
df = df.rename(columns={col: col.rstrip().replace(" ", "_").lower() for col in df.columns})

# Drop columns that are not needed for the analysis.
# The footnote column was already removed from df_1 above, so it is normally
# absent from the combined frame; without errors='ignore' this drop raises
# KeyError on a fresh Restart & Run All.
df = df.drop(columns=['source','tourism_arrivals_series_type_footnote'], errors='ignore')

# Tourism rows without a series type carry the "N/A" placeholder; label them "TE".
is_tourism = df['type'] == 'Tourism'
df.loc[is_tourism, 'tourism_arrivals_series_type'] = (
    df.loc[is_tourism, 'tourism_arrivals_series_type'].replace("N/A", "TE")
)

# Convert the value column to float.
# 1. Cast to str and strip thousands separators (literal comma, not a regex).
# 2. Turn the "N/A" placeholder back into NaN first: float("N/A") would
#    raise ValueError. Any other non-numeric text still raises, so bad data
#    is not silently hidden.
value_text = df['value'].astype(str).str.replace(',', '', regex=False)
df['value'] = value_text.mask(value_text == "N/A").astype(float)

# Derive the 'continent' column from the region/country/area name
# using the project helper f.get_continent.
df['continent'] = df['region/country/area'].apply(f.get_continent)

# Display the DataFrame with the new column
df



#### Tourism Analysis

In [None]:
# Keep only the rows labelled "Tourism" and show the resulting frame
tourism_mask = df["type"] == "Tourism"
df_tourism = df[tourism_mask]
display(df_tourism)



In [None]:
# Tourism rows whose series type is "TE"
is_tourism_row = df["type"] == "Tourism"
is_te_series = df["tourism_arrivals_series_type"] == 'TE'
df_tourism_Exp = df[is_tourism_row & is_te_series]
display(df_tourism_Exp)


### International Tourist Arrivals Series Types

- **TF**: International tourist arrivals at frontiers (excluding same-day visitors)
- **VF**: International visitor arrivals at frontiers (tourists and same-day visitors)
- **THS**: International tourist arrivals at hotels and similar establishments
- **TCE**: International tourist arrivals at collective tourism establishments
- **NHS**: Nights of international tourists in hotels and similar establishments
- **NCE**: Nights of international tourists in collective tourism establishments

Rows without a series type are labelled **TE** in this notebook; they hold the tourism expenditure figures.

In [None]:
# Total of "value" for each tourism series type
series_type_groups = df_tourism.groupby("tourism_arrivals_series_type")
df_tour = series_type_groups["value"].sum()
display(df_tour)

# Report the expenditure (TE) and frontier-arrival (TF) totals
te_total = df_tour['TE']
print(f"The value for Tourism Expenditure in all continents is {te_total}(USD Million)" )
tf_total = df_tour['TF']
print(f"The value for Tourists/Visitors Arrival in all continents is {tf_total}( thousands)" )


In [None]:
#create bar chart for same
import matplotlib.pyplot as plt

# Bar chart of the summed value per tourism series type.
# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
df_tour.plot(kind='bar', color='purple', rot=45, ax=ax)  # rot=45: readable x tick labels

# Title and axis labels
ax.set_title('tourism_arrivals_series_type')
ax.set_xlabel('Tourism Series Type')
# df_tour holds groupby sums, not row counts, so the axis is labelled as a sum.
ax.set_ylabel('Sum of Values')

# Adjust layout to avoid overlapping labels, then render
fig.tight_layout()
plt.show()

In [None]:
# Total value for every (continent, series type) pair, largest first
continent_series_keys = ['continent', 'tourism_arrivals_series_type']
df_grouped = (
    df_tourism
    .groupby(continent_series_keys)['value']
    .sum()
    .reset_index()
)
df_grouped_sorted = df_grouped.sort_values('value', ascending=False)

print(df_grouped_sorted)

In [None]:
# Reshape to a continent x series-type matrix for the heatmap.
# Combinations that do not occur in df_grouped become NaN cells.
df_pivot = df_grouped.pivot(index='continent', columns='tourism_arrivals_series_type', values='value')

# Draw the heatmap on an explicit figure/axes.
# fmt is omitted: it only formats annotations, and annot is off.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_pivot, annot=False, cmap='Spectral', linewidths=0.3, ax=ax)

# Title and axis labels; rows are continents (the pivot index), not countries.
ax.set_title('Heatmap of Sum of Values for Tourism Series by continent')
ax.set_xlabel('Tourism Series Type')
ax.set_ylabel('Continent')

# Show the plot
fig.tight_layout()
plt.show()

In [None]:
# Tourism rows whose continent is Europe
europe_mask = df_tourism["continent"] == "Europe"
df_tourism_europe = df_tourism[europe_mask]
display(df_tourism_europe)

In [None]:
# Total value per (year, series type) for the European tourism rows
europe_year_series = df_tourism_europe.groupby(["year", "tourism_arrivals_series_type"])
df_time_series = europe_year_series["value"].sum().reset_index()


In [None]:
# One line per series type showing the yearly totals for Europe.
# NOTE(review): df_time_series is built from every tourism series type, so the
# "TE" line is drawn on the same axis as the arrival series although the
# y-label mentions arrivals only - confirm this is intended.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    data=df_time_series,
    x="year",
    y="value",
    hue="tourism_arrivals_series_type",
    marker="o",
    ax=ax,
)

ax.set(
    xlabel="Year",
    ylabel="Total Tourist Arrivals",
    title="Tourism Trends in Europe Over Time",
)
ax.legend(title="Tourism Series Type")
plt.show()

In [None]:
# Total value per (region/country/area, series type) within Europe
country_series_keys = ['region/country/area', 'tourism_arrivals_series_type']
df_tourism_europe_country = (
    df_tourism_europe
    .groupby(country_series_keys)['value']
    .sum()
    .reset_index()
)

# Keep only the TF series and rank from largest to smallest total
is_tf_series = df_tourism_europe_country["tourism_arrivals_series_type"] == "TF"
df_tourism_europe_country_TF = df_tourism_europe_country[is_tf_series]
df_grouped_sorted = df_tourism_europe_country_TF.sort_values('value', ascending=False)

print(df_grouped_sorted)

In [None]:
# Ten largest TF totals among the European rows.
# Sorted here from df_tourism_europe_country_TF instead of reading
# df_grouped_sorted: that name is also assigned by the continent-level
# grouping cell, so relying on it makes this chart depend on which of the two
# cells ran last.
df_pie = df_tourism_europe_country_TF.sort_values(by='value', ascending=False).head(10)

# Explicit figure/axes for the pie chart
fig, ax = plt.subplots(figsize=(10, 10))

# Percentages are shares of the ten plotted rows, not of all European rows.
ax.pie(
    df_pie["value"],
    labels=df_pie["region/country/area"],
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Paired.colors,
    wedgeprops={'edgecolor': 'black'},
)

# Set title
ax.set_title("Tourism Arrivals in European Countries (TF)")

# Show the pie chart
plt.show()