In [19]:
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input

In [20]:
data=pd.read_csv("/main_data.csv")

# Visualizations

In [21]:
data.head()

Unnamed: 0.1,Unnamed: 0,Geographic area,Sex,Year,Current age,Adolescent population (10-19),Adolescent population as proportion of total population (%),Child labour thresholds (economic activity),Labour force unemployment rate,Population annual growth rate,...,Incident Date,Number of Dead,Total Number of Dead and Missing,Country of Origin,Region of Origin,Cause of Death,Migration Route,Location of Incident,Coordinates.1,UNSD Geographical Grouping
0,0,Afghanistan,Female,1980,Total,1380.635,22.497432,11.726087,7.358385,1.01766,...,0,0.0,0.0,0,0,0,0,0,0,0
1,1,Afghanistan,Female,1981,Total,1241.08,22.605534,11.726087,7.358385,1.01766,...,0,0.0,0.0,0,0,0,0,0,0,0
2,2,Afghanistan,Female,1982,Total,1130.494,22.72821,11.726087,7.358385,1.01766,...,0,0.0,0.0,0,0,0,0,0,0,0
3,3,Afghanistan,Female,1983,Total,1123.143,22.845053,11.726087,7.358385,1.01766,...,0,0.0,0.0,0,0,0,0,0,0,0
4,4,Afghanistan,Female,1984,Total,1166.022,22.977602,11.726087,7.358385,1.01766,...,0,0.0,0.0,0,0,0,0,0,0,0


In [22]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47868 entries, 0 to 47867
Data columns (total 38 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Unnamed: 0                                                   47868 non-null  int64  
 1   Geographic area                                              47868 non-null  object 
 2   Sex                                                          47868 non-null  object 
 3   Year                                                         47868 non-null  int64  
 4   Current age                                                  47868 non-null  object 
 5   Adolescent population (10-19)                                47868 non-null  float64
 6   Adolescent population as proportion of total population (%)  47868 non-null  float64
 7   Child labour thresholds (economic activity)                  47868 non-null 

In [23]:
data['Year'].value_counts()

Year
2019    6896
2023    4928
2022    4020
2021    3216
2018    3030
2017    2550
2016    2436
2020    2310
2015    1854
2014     720
2012     486
2006     482
2011     476
2010     476
2005     472
2008     468
2004     466
2013     466
2007     466
2009     466
1981     466
2003     466
1980     466
2001     466
1990     466
1982     466
1983     466
1984     466
1985     466
1986     466
1987     466
1988     466
1989     466
1991     466
2000     466
1992     466
1993     466
1994     466
1995     466
1996     466
1997     466
1998     466
1999     466
2002     466
Name: count, dtype: int64

In [24]:
# Renaming columns for better understanding (if necessary)
data.rename(columns={
    'Geographic area': 'Geographic_Area',
    'Current age': 'Current_Age',
    'Adolescent population (10-19)': 'Adolescent_Population',
    'Adolescent population as proportion of total population (%)': 'Adolescent_Population_Proportion',
    'Child labour thresholds (economic activity)': 'Child_Labour_Thresholds',
    'Labour force unemployment rate': 'Unemployment_Rate',
    'Population annual growth rate': 'Annual_Growth_Rate',
    'Total population': 'Total_Population',
    'Youth population from 15 to 24': 'Youth_Population',
    # Add more renaming if needed
}, inplace=True)

In [25]:
# Drop the 'Unnamed: 0' column
if 'Unnamed: 0' in data.columns:
    data.drop(columns=['Unnamed: 0'], inplace=True)

In [26]:
#-Related Deaths and Missing Persons by Region

# Filter out rows where 'Region of Origin' is 0 and 'Total Number of Dead and Missing' has NaN values
filtered_data = data[(data['Region of Origin'] != '0') & (data['Total Number of Dead and Missing'].notna() & (data['Region of Origin'] != 'Unknown'))]

# Create the scatter chart
fig = px.scatter(filtered_data, x='Region of Origin', y='Total Number of Dead and Missing', color='Year',
                 size='Total Number of Dead and Missing', title='Migrant-Related Deaths and Missing Persons by Region',
                 labels={'Region of Origin': 'Region', 'Total Number of Dead and Missing': 'Deaths and Missing Persons'},
                 size_max=15)

# Update layout for better readability
fig.update_layout(xaxis={'categoryorder':'total descending'}, xaxis_tickangle=-45)

# Show the chart
fig.show()


In [27]:
# Filter out rows where 'Region of Origin' is '0' or 'Unknown', and where 'Total Number of Dead and Missing' is null or below the threshold
threshold = 90
filtered_data = data[(data['Region of Origin'] != '0') &
                     (data['Region of Origin'] != 'Unknown') &
                     (data['Total Number of Dead and Missing'].notna()) &
                     (data['Total Number of Dead and Missing'] >= threshold)]

# Create the heatmap
heatmap = px.density_heatmap(filtered_data, x='Region of Origin', y='Year', z='Total Number of Dead and Missing',
                             color_continuous_scale='Viridis',
                             title='Heatmap of Migrant-Related Deaths and Missing Persons by Region',
                             labels={'Region of Origin': 'Region', 'Total Number of Dead and Missing': 'Deaths and Missing Persons'})

# Update layout for better readability
heatmap.update_layout(xaxis_tickangle=-45)

# Show the heatmap
heatmap.show()

In [28]:

import subprocess
import sys
# Remove the '0' values from 'Migration Route'
filtered_data = data[data['Migration Route'] != '0']

# Calculate the counts for each migration route
migration_route_counts = filtered_data['Migration Route'].value_counts().reset_index()
migration_route_counts.columns = ['Migration Route', 'Count']

# Set a threshold to remove minimal value routes
threshold = 20
filtered_data = migration_route_counts[migration_route_counts['Count'] >= threshold]

# Create a bar chart for migration routes
fig = px.bar(filtered_data, x='Count', y='Migration Route',
             title='Counts of Different Migration Routes',
             labels={'Migration Route': 'Migration Route', 'Count': 'Migrants'},
             color='Count', color_continuous_scale='Viridis',
             orientation='h')  # Use horizontal orientation

# Update layout for better readability
fig.update_layout(xaxis={'categoryorder': 'total descending'}, height=600, width=800)

fig.show()

In [29]:
# Count the values in the 'Migration Route' column
migration_route_counts = data['Migration Route'].value_counts()

# Remove the 'Unknown/Other' category and '0' from the migration route counts
migration_route_counts = migration_route_counts[(migration_route_counts.index != 'Unknown/Other') & (migration_route_counts.index != '0')]

# Create a DataFrame for migration route counts
migration_route_counts_df = migration_route_counts.reset_index()
migration_route_counts_df.columns = ['Migration Route', 'Count']

# Calculate the proportion of each migration route
migration_route_counts_df['Proportion'] = (migration_route_counts_df['Count'] / migration_route_counts_df['Count'].sum()) * 100

# Identify the top 8 migration routes by proportion
top_routes = migration_route_counts_df.nlargest(8, 'Proportion')

# Sum the counts of the other routes
other_count = migration_route_counts_df.loc[~migration_route_counts_df.index.isin(top_routes.index), 'Count'].sum()

# Create a new dataframe for "Others"
other_df = pd.DataFrame([['Others', other_count, (other_count / migration_route_counts_df['Count'].sum()) * 100]], columns=['Migration Route', 'Count', 'Proportion'])

# Concatenate the top routes and the "Others" category
final_routes = pd.concat([top_routes, other_df], ignore_index=True)

# Reverse the Viridis color scale
reversed_viridis = px.colors.sequential.Viridis[::-1]

# Create a pie chart for migration routes
fig = px.pie(final_routes, names='Migration Route', values='Count',
             title='Proportion of Different Migration Routes', color_discrete_sequence=reversed_viridis)

fig.show()

In [30]:
# Filter out rows where 'Region of Origin' is '0' or 'Unknown', 'Mixed or unknown' cause of death, and where 'Total Number of Dead and Missing' is null or below the threshold
threshold = 5
filtered_data = data[(data['Region of Origin'] != '0') &
                     (data['Region of Origin'] != 'Unknown') &
                     (data['Total Number of Dead and Missing'].notna()) &
                     (data['Total Number of Dead and Missing'] >= threshold)]

# Aggregate regions with low counts into "Others"
region_counts = filtered_data['Region of Origin'].value_counts()
top_regions = region_counts.nlargest(8).index  # Keep the top 8 regions
filtered_data['Region of Origin'] = filtered_data['Region of Origin'].apply(lambda x: x if x in top_regions else 'Others')

# Create the bar chart with improved settings
fig = px.bar(filtered_data, x='Region of Origin', y='Total Number of Dead and Missing', color='Cause of Death',
             title='Immigration Deaths by Cause of Death and Region', barmode='group',
             labels={'Region of Origin': 'Region', 'Total Number of Dead and Missing': 'Deaths and Missing Persons'},
             category_orders={"Region of Origin": filtered_data.groupby("Region of Origin")["Total Number of Dead and Missing"].sum().sort_values(ascending=False).index},
             color_discrete_sequence=px.colors.qualitative.Safe)

# Update layout for better readability
fig.update_layout(xaxis_tickangle=-45,
                  xaxis_title='Region of Origin',
                  yaxis_title='Total Number of Dead and Missing',
                  legend_title_text='Cause of Death',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  bargap=0.2)

# Adjust text template and hover information
fig.update_traces(texttemplate='%{y:.2s}', textposition='inside', textfont_size=10,
                  hovertemplate='<b>Region:</b> %{x}<br><b>Total Deaths and Missing:</b> %{y}<br><b>Cause of Death:</b> %{marker.color}')

# Show the chart
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Machine Learning