In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [16]:
# Load the raw GDP / population export into a DataFrame.
df = pd.read_csv('gdp_df.csv')
df.head() # preview the first 5 rows; missing observations show up as the '..' placeholder

Unnamed: 0,Country Name,Country Code,Series Name,1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,Afghanistan,AFG,GDP (current US$),..,3521418059.92345,2813571753.87253,3825701438.99963,4520946818.54581,5224896718.67782,6203256538.70967,...,20497128555.6972,19134221644.7325,18116572395.0772,18753456497.8159,18053222687.4126,18799444490.1128,19955929052.1496,14266499429.8746,14502158192.0904,..
1,Afghanistan,AFG,"Population, total",19262847,19542982.0,19688632.0,21000256.0,22645130.0,23553551.0,24411191.0,...,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0,40099462.0,41128771.0,42239854
2,Albania,ALB,GDP (current US$),3212121650.97755,3480355258.04122,3922100793.5403,4348068242.19512,5611496257.14231,7184685781.51876,8052077248.14638,...,13228147516.1168,11386853113.0189,11861199830.8396,13019726211.7369,15156424015.1977,15401826127.2539,15162734205.2462,17930565118.8176,18916378860.5488,22977677860.7979
3,Albania,ALB,"Population, total",3108778,3089027.0,3060173.0,3051010.0,3039616.0,3026939.0,3011487.0,...,2889104.0,2880703.0,2876101.0,2873457.0,2866376.0,2854191.0,2837849.0,2811666.0,2777689.0,2745972
4,Algeria,DZA,GDP (current US$),48640671734.9711,54790398570.3282,59413400923.6364,61516103406.1688,73482264190.9245,91913680985.1708,107046618669.707,...,238942664192.59,187493855609.345,180763839522.151,189880896903.073,194554483655.528,193459662090.677,164873415325.201,186265418570.697,225560256621.757,239899491127.742


In [17]:
# Check column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Country Name   416 non-null    object
 1   Country Code   416 non-null    object
 2   Series Name    416 non-null    object
 3   1999 [YR1999]  416 non-null    object
 4   2000 [YR2000]  416 non-null    object
 5   2001 [YR2001]  416 non-null    object
 6   2002 [YR2002]  416 non-null    object
 7   2003 [YR2003]  416 non-null    object
 8   2004 [YR2004]  416 non-null    object
 9   2005 [YR2005]  416 non-null    object
 10  2006 [YR2006]  416 non-null    object
 11  2007 [YR2007]  416 non-null    object
 12  2008 [YR2008]  416 non-null    object
 13  2009 [YR2009]  416 non-null    object
 14  2010 [YR2010]  416 non-null    object
 15  2011 [YR2011]  416 non-null    object
 16  2012 [YR2012]  416 non-null    object
 17  2013 [YR2013]  416 non-null    object
 18  2014 [YR2014]  416 non-null   

In [18]:
# Function to clean column names
def clean_column_name(col):
    """Reduce a year header such as ``'1999 [YR1999]'`` to the bare year.

    A label is treated as a year header only when it consists of a
    digit-only token followed by a bracketed ``[YR...]`` tag. Any other
    label -- including non-string labels -- is returned unchanged, so the
    function is safe to map over every column of a frame.

    Parameters
    ----------
    col : object
        A column label, normally a string.

    Returns
    -------
    object
        The leading year as a string for year headers, otherwise ``col``
        itself.
    """
    # Non-string labels (e.g. integer column names) cannot be year headers.
    if not isinstance(col, str):
        return col

    # Strip first so leading whitespace does not produce an empty name.
    year, _, tag = col.strip().partition(' ')
    tag = tag.strip()

    # Require "<digits> [YR...]" rather than the three substrings appearing
    # anywhere, so unrelated labels are never truncated to their first word.
    if year.isdigit() and tag.startswith('[YR') and tag.endswith(']'):
        return year
    return col

# Run every column label through clean_column_name, keeping the column order
df.columns = list(map(clean_column_name, df.columns))

# Show the resulting column labels
print(df.columns)

Index(['Country Name', 'Country Code', 'Series Name', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022', '2023'],
      dtype='object')


In [19]:
# Convert all year columns to numeric before melting the DataFrame.
# The year columns are discovered from the frame itself (all-digit labels such
# as '1999') rather than a hard-coded 1999-2023 range, so this cell keeps
# working when the export covers a different span of years.
years = [col for col in df.columns if str(col).isdigit()]
if not years:
    # Fail loudly instead of silently converting nothing.
    raise KeyError("no year columns found; expected labels such as '1999'")

# errors='coerce' turns non-numeric cells (e.g. the '..' placeholder) into NaN.
df[years] = df[years].apply(pd.to_numeric, errors='coerce')

In [20]:
# Confirm the year columns are now float64 and see how many cells became NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 28 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  416 non-null    object 
 1   Country Code  416 non-null    object 
 2   Series Name   416 non-null    object 
 3   1999          405 non-null    float64
 4   2000          406 non-null    float64
 5   2001          407 non-null    float64
 6   2002          410 non-null    float64
 7   2003          410 non-null    float64
 8   2004          410 non-null    float64
 9   2005          410 non-null    float64
 10  2006          411 non-null    float64
 11  2007          411 non-null    float64
 12  2008          413 non-null    float64
 13  2009          413 non-null    float64
 14  2010          413 non-null    float64
 15  2011          414 non-null    float64
 16  2012          413 non-null    float64
 17  2013          413 non-null    float64
 18  2014          413 non-null    

In [21]:
# Reshape wide -> long: every year column becomes a (Year, Value) row
# for each country / series combination.
df = df.melt(
    id_vars=['Country Name', 'Country Code', 'Series Name'],
    var_name='Year',
    value_name='Value',
)

In [22]:
# Preview the long-format frame: one row per country, series and year.
df.head()

Unnamed: 0,Country Name,Country Code,Series Name,Year,Value
0,Afghanistan,AFG,GDP (current US$),1999,
1,Afghanistan,AFG,"Population, total",1999,19262850.0
2,Albania,ALB,GDP (current US$),1999,3212122000.0
3,Albania,ALB,"Population, total",1999,3108778.0
4,Algeria,DZA,GDP (current US$),1999,48640670000.0


In [23]:
# Spread each 'Series Name' value into its own column (one row per
# country / year), then turn the index levels back into ordinary columns.
df = (
    df.pivot(
        index=['Country Name', 'Country Code', 'Year'],
        columns='Series Name',
        values='Value',
    )
    .reset_index()
)

In [24]:
# Check the pivoted frame: GDP and population now sit in separate columns.
df.head()

Series Name,Country Name,Country Code,Year,GDP (current US$),"Population, total"
0,Afghanistan,AFG,1999,,19262847.0
1,Afghanistan,AFG,2000,3521418000.0,19542982.0
2,Afghanistan,AFG,2001,2813572000.0,19688632.0
3,Afghanistan,AFG,2002,3825701000.0,21000256.0
4,Afghanistan,AFG,2003,4520947000.0,22645130.0


In [25]:
# Switch to short, underscore-separated column labels.
# ('Year' maps to itself, so that column keeps its name.)
df = df.rename(
    columns={
        'Country Name': 'Country',
        'Country Code': 'Country_Code',
        'Year': 'Year',
        'GDP (current US$)': 'GDP_USD',
        'Population, total': 'Population',
    }
)

In [26]:
# Verify the renamed columns.
df.head()

Series Name,Country,Country_Code,Year,GDP_USD,Population
0,Afghanistan,AFG,1999,,19262847.0
1,Afghanistan,AFG,2000,3521418000.0,19542982.0
2,Afghanistan,AFG,2001,2813572000.0,19688632.0
3,Afghanistan,AFG,2002,3825701000.0,21000256.0
4,Afghanistan,AFG,2003,4520947000.0,22645130.0


In [27]:
# Write the reshaped table to disk, leaving the row index out of the file
df.to_csv(path_or_buf='transformed.csv', index=False)