In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw CSV export into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='original.csv')
df.head(n=5)  # preview the top five rows

Unnamed: 0,Country Name,Series Name,1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,Canada,GDP (current US$),678412196271.118,744773415931.587,738981792355.372,760649334098.005,895540646634.787,1026690238278.25,1173108598778.68,1319264809590.97,...,1805749878439.94,1556508816217.14,1527994741907.43,1649265644244.09,1725329192783.02,1743725183672.52,1655684730000.19,2007472181464.15,2161483369422.01,2140085567791.45
1,Canada,"Population, total",30401286.0,30685730.0,31020855.0,31359199.0,31642461.0,31938807.0,32242732.0,32571193.0,...,35434066.0,35704498.0,36110803.0,36545075.0,37072620.0,37618495.0,38028638.0,38239864,38939056,40097761
2,Canada,Exports of goods and services (current US$),283991384532.544,329258635782.102,310667613636.364,304705919836.87,330149882235.386,383240584166.026,432369202838.752,467052186177.715,...,573083180253.852,495747450357.728,481357649980.237,518766780219.71,557719627178.031,564142104614.779,487990676642.934,626659601646.079,731555074343.765,717707829049.981
3,Canada,Imports of goods and services (current US$),259518745372.552,287155747087.738,268281895661.157,271159115529.217,295296552708.586,337418139892.391,385981185014.029,430736071932.299,...,589434433378.268,534115083556.223,517396012626.09,554922650425.45,591347645767.306,589706748631.384,524572641557.224,626488133458.384,728719235183.076,726063209302.504
4,Canada,"Industry (including construction), value added...",189085279666.151,219708437142.28,210222107438.017,209734913655.77,249922203982.585,291488854727.133,342700115530.616,380575634696.756,...,490043362985.931,380032251381.957,355950967448.067,396282298083.287,424815832585.479,419843824725.777,371727999597.04,..,..,..


In [3]:
# Summarise the raw frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Country Name   180 non-null    object
 1   Series Name    180 non-null    object
 2   1999 [YR1999]  180 non-null    object
 3   2000 [YR2000]  180 non-null    object
 4   2001 [YR2001]  180 non-null    object
 5   2002 [YR2002]  180 non-null    object
 6   2003 [YR2003]  180 non-null    object
 7   2004 [YR2004]  180 non-null    object
 8   2005 [YR2005]  180 non-null    object
 9   2006 [YR2006]  180 non-null    object
 10  2007 [YR2007]  180 non-null    object
 11  2008 [YR2008]  180 non-null    object
 12  2009 [YR2009]  180 non-null    object
 13  2010 [YR2010]  180 non-null    object
 14  2011 [YR2011]  180 non-null    object
 15  2012 [YR2012]  180 non-null    object
 16  2013 [YR2013]  180 non-null    object
 17  2014 [YR2014]  180 non-null    object
 18  2015 [YR2015]  180 non-null   

In [4]:
# Function to clean column names
def clean_column_name(col):
    """Reduce a year header such as ``'1999 [YR1999]'`` to the bare year.

    Only labels made of digits followed by a bracketed ``[YR...]`` code are
    rewritten; every other label is returned exactly as it was passed in.

    Parameters
    ----------
    col : object
        A single column label, normally a string.

    Returns
    -------
    object
        The leading year as a string for year headers, otherwise ``col``
        unchanged.
    """
    # Non-string labels (e.g. integers) cannot be year headers; pass them
    # through rather than failing on the string operations below.
    if not isinstance(col, str):
        return col

    # Split around the bracketed code. Stripping both the whole label and the
    # leading part tolerates stray or repeated spaces, so a label with leading
    # whitespace no longer collapses to an empty name.
    head, marker, code = col.strip().partition('[YR')
    year = head.strip()

    # Rename only when the part before the marker is purely numeric and the
    # label ends with the closing bracket. Descriptive labels that merely
    # contain '[', 'YR' and ']' somewhere are left intact instead of being
    # truncated to their first word.
    if marker and year.isdigit() and code.endswith(']'):
        return year
    return col

# Run every header through clean_column_name and install the result
df.columns = list(map(clean_column_name, df.columns))

# Echo the resulting column index
print(df.columns)

Index(['Country Name', 'Series Name', '1999', '2000', '2001', '2002', '2003',
       '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022', '2023'],
      dtype='object')


In [5]:
# Coerce each year column to a numeric dtype ahead of the melt step;
# entries that cannot be parsed (such as the '..' placeholder) become NaN.
years = list(map(str, range(1999, 2024)))
df[years] = df[years].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [6]:
# Re-check dtypes after the conversion: the year columns should now be
# float64, with lower non-null counts wherever values were coerced to NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  180 non-null    object 
 1   Series Name   180 non-null    object 
 2   1999          147 non-null    float64
 3   2000          149 non-null    float64
 4   2001          150 non-null    float64
 5   2002          150 non-null    float64
 6   2003          150 non-null    float64
 7   2004          151 non-null    float64
 8   2005          156 non-null    float64
 9   2006          152 non-null    float64
 10  2007          166 non-null    float64
 11  2008          169 non-null    float64
 12  2009          169 non-null    float64
 13  2010          171 non-null    float64
 14  2011          170 non-null    float64
 15  2012          173 non-null    float64
 16  2013          175 non-null    float64
 17  2014          172 non-null    float64
 18  2015          171 non-null    

In [7]:
# Reshape wide -> long: every year column becomes a (Year, Value) pair
df = df.melt(
    id_vars=['Country Name', 'Series Name'],
    var_name='Year',
    value_name='Value',
)

In [8]:
# Preview the long-format frame: one row per country, series and year.
df.head()

Unnamed: 0,Country Name,Series Name,Year,Value
0,Canada,GDP (current US$),1999,678412200000.0
1,Canada,"Population, total",1999,30401290.0
2,Canada,Exports of goods and services (current US$),1999,283991400000.0
3,Canada,Imports of goods and services (current US$),1999,259518700000.0
4,Canada,"Industry (including construction), value added...",1999,189085300000.0


In [9]:
# Reshape long -> wide: one row per (country, year), one column per series.
# The trailing reset_index turns the pivot's index levels back into
# ordinary columns.
df = df.pivot(
    index=['Country Name', 'Year'],
    columns='Series Name',
    values='Value',
).reset_index()

In [10]:
# Preview the pivoted frame: one column per series, keyed by country and year.
df.head()

Series Name,Country Name,Year,Exports of goods and services (current US$),GDP (current US$),High-technology exports (current US$),Imports of goods and services (current US$),"Industry (including construction), value added (current US$)","Labor force, total",Military expenditure (current USD),"Population, total",Researchers in R&D (per million people),"Unemployment, total (% of total labor force) (national estimate)"
0,Argentina,1999,27862300000.0,283523000000.0,,32762700000.0,74264970000.0,16410324.0,3461731000.0,36653031.0,713.571045,14.05
1,Argentina,2000,31223690000.0,284203800000.0,,33070150000.0,73866930000.0,16588444.0,3266633000.0,37070774.0,716.688354,15.0
2,Argentina,2001,31112420000.0,268696800000.0,,27603880000.0,67828290000.0,16651214.0,3183592000.0,37480493.0,688.242737,17.32
3,Argentina,2002,27736610000.0,97724000000.0,,13065820000.0,29862240000.0,16491672.0,1114172000.0,37885028.0,692.160828,19.59
4,Argentina,2003,33084510000.0,127587000000.0,,18772900000.0,41650890000.0,16951795.0,1374874000.0,38278164.0,718.546326,15.36


In [14]:
# Swap the verbose indicator titles for short snake-case identifiers.
# rename() skips mapping keys that match no column by default.
# NOTE(review): 'Research and development expenditure (% of GDP)' does not
# appear among the pivoted columns previewed earlier - confirm whether that
# series is expected in the input.
df = df.rename(columns={
    # identifiers
    'Country Name': 'Country',
    'Year': 'Year',
    # monetary indicators
    'GDP (current US$)': 'GDP_USD',
    'Exports of goods and services (current US$)': 'Exports_USD',
    'Imports of goods and services (current US$)': 'Imports_USD',
    'Industry (including construction), value added (current US$)': 'Industry_Value_Added_USD',
    'High-technology exports (current US$)': 'High_Tech_Exports_USD',
    'Military expenditure (current USD)': 'Military_Exp_USD',
    # people and research indicators
    'Population, total': 'Population_Total',
    'Labor force, total': 'Labor_Force',
    'Unemployment, total (% of total labor force) (national estimate)': 'Unemployment_Rate',
    'Researchers in R&D (per million people)': 'Researchers_per_Million',
    'Research and development expenditure (% of GDP)': 'Research_GDP_Rate',
})

In [15]:
# Preview the frame again to confirm the shortened column names.
df.head()

Series Name,Country,Year,Exports_USD,GDP_USD,High_Tech_Exports_USD,Imports_USD,Industry_Value_Added_USD,Labor_Force,Military_Exp_USD,Population_Total,Researchers_per_Million,Unemployment_Rate
0,Argentina,1999,27862300000.0,283523000000.0,,32762700000.0,74264970000.0,16410324.0,3461731000.0,36653031.0,713.571045,14.05
1,Argentina,2000,31223690000.0,284203800000.0,,33070150000.0,73866930000.0,16588444.0,3266633000.0,37070774.0,716.688354,15.0
2,Argentina,2001,31112420000.0,268696800000.0,,27603880000.0,67828290000.0,16651214.0,3183592000.0,37480493.0,688.242737,17.32
3,Argentina,2002,27736610000.0,97724000000.0,,13065820000.0,29862240000.0,16491672.0,1114172000.0,37885028.0,692.160828,19.59
4,Argentina,2003,33084510000.0,127587000000.0,,18772900000.0,41650890000.0,16951795.0,1374874000.0,38278164.0,718.546326,15.36


In [16]:
# Persist the cleaned table as CSV, leaving out the row index
df.to_csv(path_or_buf='transformed.csv', index=False)