In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plot
import matplotlib.pyplot as plt

# Data Collection

In [2]:

# One CSV per indicator, read from the working directory.
# The file order below must match the order of the names being unpacked.
source_files = (
    'Fertility.csv',
    'Life expectancy.csv',
    'Population growth.csv',
    'Sex-ratio.csv',
    'Suicide rate.csv',
    'Urbanization rate.csv',
    'Median age.csv',
)
(
    fertility_data,
    life_expectancy_data,
    population_growth_data,
    sex_ratio_data,
    suicide_rate_data,
    urbanization_rate_data,
    median_age_data,
) = map(pd.read_csv, source_files)


In [3]:
# Every indicator frame gathered in a single list.
# NOTE(review): no cell visible in this notebook reads `data_frames`; the merge
# cell builds its own list. Confirm whether this is still needed.
data_frames = [
    fertility_data,
    life_expectancy_data,
    population_growth_data,
    sex_ratio_data,
    suicide_rate_data,
    urbanization_rate_data,
    median_age_data,
]

In [4]:
# Columns shared by every indicator table; used as the join key.
merge_keys = ['Country', 'ISO-code']

# Frames folded into the fertility table, in this order.
additional_data_frames = [
    life_expectancy_data,
    population_growth_data,
    sex_ratio_data,
    suicide_rate_data,
    urbanization_rate_data,
    median_age_data,
]

# Start from the fertility table and outer-join the rest one at a time, so a
# country missing from any single source is kept (with NaN for that indicator).
combined_data = fertility_data
for df in additional_data_frames:
    combined_data = combined_data.merge(df, on=merge_keys, how='outer')

In [5]:
# Write the merged table to disk without the row index.
# This runs before the cleaning cells below, so the file holds the merged data as-is.
combined_data_path = 'combined_world_data.csv'
combined_data.to_csv(path_or_buf=combined_data_path, index=False)

# Preprocessing

In [6]:
# Drop every row that has at least one missing value (row-wise, "any" policy).
# The frame is modified in place, so `combined_data` keeps referring to the same object.
combined_data.dropna(axis=0, how='any', inplace=True)



In [7]:
# Discard rows that are exact duplicates across all columns, keeping the first
# occurrence. The frame is modified in place.
combined_data.drop_duplicates(subset=None, keep='first', inplace=True)

In [8]:
# Encode categorical variables as integer codes.
#
# Each column gets its OWN fitted LabelEncoder, stored in `label_encoders`
# under the column name. Re-fitting one shared encoder per column (as before)
# overwrote the 'Country' mapping with the 'ISO-code' one, so the country codes
# could no longer be turned back into names. With the dict you can recover
# them, e.g.:
#     label_encoders['Country'].inverse_transform(combined_data['Country'])
#
# `label_encoder` is still bound after the loop to the encoder fitted on the
# last column ('ISO-code'), exactly as the previous version left it.
categorical_columns = ['Country', 'ISO-code']
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    combined_data[column] = label_encoder.fit_transform(combined_data[column])
    label_encoders[column] = label_encoder

In [9]:
# Standardize the indicator columns with StandardScaler.
# The scaler is fitted on every row currently in `combined_data`.
numerical_columns = [
    'Fertility',
    'Life expectancy',
    'Median age',
    'Population growth',
    'Sex-ratio',
    'Suicide rate',
    'Urbanization rate',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(combined_data[numerical_columns])
combined_data[numerical_columns] = scaled_values

In [10]:
# Preview the first rows to spot-check the encoded and standardized columns.
combined_data.head()

Unnamed: 0,Country,Fertility,ISO-code,Life expectancy,Population growth,Sex-ratio,Suicide rate,Urbanization rate,Median age
0,0,0.726828,0,-0.258807,0.491678,0.411244,-0.706697,-1.296824,0.26873
2,1,-0.40503,45,1.183349,0.005166,0.411244,-1.40595,1.169147,0.353393
4,2,1.4814,1,-0.696182,1.155954,-0.340384,-0.142782,0.812434,-1.12216
5,3,-1.159602,5,1.206991,-0.799449,-0.904105,-2.037534,-1.37954,0.812992
6,4,-0.93323,3,1.159707,-0.939789,-0.058523,-0.097669,2.12038,0.788802


In [11]:
# Summarize column dtypes and non-null counts after cleaning.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 327
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            297 non-null    int32  
 1   Fertility          297 non-null    float64
 2   ISO-code           297 non-null    int32  
 3   Life expectancy    297 non-null    float64
 4   Population growth  297 non-null    float64
 5   Sex-ratio          297 non-null    float64
 6   Suicide rate       297 non-null    float64
 7   Urbanization rate  297 non-null    float64
 8   Median age         297 non-null    float64
dtypes: float64(7), int32(2)
memory usage: 20.9 KB


In [12]:

# Capture the current column order as a plain list and show it.
cols = combined_data.columns.tolist()
print(cols)



['Country', 'Fertility', 'ISO-code', 'Life expectancy', 'Population growth', 'Sex-ratio', 'Suicide rate', 'Urbanization rate', 'Median age']


In [13]:
# Column order as it stands before reordering.
cols = [
    'Country',
    'Fertility',
    'ISO-code',
    'Life expectancy',
    'Population growth',
    'Sex-ratio',
    'Suicide rate',
    'Urbanization rate',
    'Median age',
]

# Build the target order by exchanging the positions of 'Fertility' and
# 'Life expectancy'; every other column stays where it is.
new_order = list(cols)
fertility_pos = new_order.index('Fertility')
life_expectancy_pos = new_order.index('Life expectancy')
new_order[fertility_pos], new_order[life_expectancy_pos] = (
    new_order[life_expectancy_pos],
    new_order[fertility_pos],
)

# Select the columns in the new order.
combined_data = combined_data[new_order]

In [14]:
# Preview the first rows to confirm the new column order.
combined_data.head()

Unnamed: 0,Country,Life expectancy,ISO-code,Fertility,Population growth,Sex-ratio,Suicide rate,Urbanization rate,Median age
0,0,-0.258807,0,0.726828,0.491678,0.411244,-0.706697,-1.296824,0.26873
2,1,1.183349,45,-0.40503,0.005166,0.411244,-1.40595,1.169147,0.353393
4,2,-0.696182,1,1.4814,1.155954,-0.340384,-0.142782,0.812434,-1.12216
5,3,1.206991,5,-1.159602,-0.799449,-0.904105,-2.037534,-1.37954,0.812992
6,4,1.159707,3,-0.93323,-0.939789,-0.058523,-0.097669,2.12038,0.788802


In [15]:
# Re-check dtypes and non-null counts now that the columns are reordered.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 327
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            297 non-null    int32  
 1   Life expectancy    297 non-null    float64
 2   ISO-code           297 non-null    int32  
 3   Fertility          297 non-null    float64
 4   Population growth  297 non-null    float64
 5   Sex-ratio          297 non-null    float64
 6   Suicide rate       297 non-null    float64
 7   Urbanization rate  297 non-null    float64
 8   Median age         297 non-null    float64
dtypes: float64(7), int32(2)
memory usage: 20.9 KB


In [16]:
# Refresh `cols` from the reordered frame and show the resulting order.
cols = combined_data.columns.tolist()
print(cols)


['Country', 'Life expectancy', 'ISO-code', 'Fertility', 'Population growth', 'Sex-ratio', 'Suicide rate', 'Urbanization rate', 'Median age']
