In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/annual-surface-temperature-change/Annual_Surface_Temperature_Change.csv


### Skill Level
I selected the intermediate skill level because, although I have advanced data analysis skills in R, I am less experienced with Python and want to strengthen my skills in it.

### Dataset
[Annual Surface Temperature Change](https://climatedata.imf.org/datasets/4063314923d74187be9596f10d034914/explore) dataset provided by the International Monetary Fund (IMF)
> Annual estimates of mean surface temperature change measured with respect to a baseline climatology, corresponding to the period 1951-1980.


In [2]:
# Read the Annual Surface Temperature Change CSV into a DataFrame and
# display its first five rows.

csv_path = '/kaggle/input/annual-surface-temperature-change/Annual_Surface_Temperature_Change.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,ObjectId,Country,ISO2,ISO3,Indicator,Unit,Source,CTS_Code,CTS_Name,CTS_Full_Descriptor,...,F2013,F2014,F2015,F2016,F2017,F2018,F2019,F2020,F2021,F2022
0,1,"Afghanistan, Islamic Rep. of",AF,AFG,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,1.281,0.456,1.093,1.555,1.54,1.544,0.91,0.498,1.327,2.012
1,2,Albania,AL,ALB,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,1.333,1.198,1.569,1.464,1.121,2.028,1.675,1.498,1.536,1.518
2,3,Algeria,DZ,DZA,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,1.192,1.69,1.121,1.757,1.512,1.21,1.115,1.926,2.33,1.688
3,4,American Samoa,AS,ASM,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,1.257,1.17,1.009,1.539,1.435,1.189,1.539,1.43,1.268,1.256
4,5,"Andorra, Principality of",AD,AND,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,0.831,1.946,1.69,1.99,1.925,1.919,1.964,2.562,1.533,3.243


## Day 1: Data Cleaning and Transformation
Handle missing values and perform initial data transformations.

In [3]:
# Show every column label; the head() preview elides the middle columns.
data.columns

Index(['ObjectId', 'Country', 'ISO2', 'ISO3', 'Indicator', 'Unit', 'Source',
       'CTS_Code', 'CTS_Name', 'CTS_Full_Descriptor', 'F1961', 'F1962',
       'F1963', 'F1964', 'F1965', 'F1966', 'F1967', 'F1968', 'F1969', 'F1970',
       'F1971', 'F1972', 'F1973', 'F1974', 'F1975', 'F1976', 'F1977', 'F1978',
       'F1979', 'F1980', 'F1981', 'F1982', 'F1983', 'F1984', 'F1985', 'F1986',
       'F1987', 'F1988', 'F1989', 'F1990', 'F1991', 'F1992', 'F1993', 'F1994',
       'F1995', 'F1996', 'F1997', 'F1998', 'F1999', 'F2000', 'F2001', 'F2002',
       'F2003', 'F2004', 'F2005', 'F2006', 'F2007', 'F2008', 'F2009', 'F2010',
       'F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018',
       'F2019', 'F2020', 'F2021', 'F2022'],
      dtype='object')

In [4]:
# Distinct values found in the 'Unit' column
data['Unit'].unique()

array(['Degree Celsius'], dtype=object)

In [5]:
# Distinct values found in the 'Indicator' column
data['Indicator'].unique()

array(['Temperature change with respect to a baseline climatology, corresponding to the period 1951-1980'],
      dtype=object)

In [6]:
# Distinct values found in the 'Source' column
data['Source'].unique()

array(['Food and Agriculture Organization of the United Nations (FAO). 2022. FAOSTAT Climate Change, Climate Indicators, Temperature change. License: CC BY-NC-SA 3.0 IGO. Extracted from:\xa0https://www.fao.org/faostat/en/#data/ET. Accessed on 2023-03-28.'],
      dtype=object)

In [7]:
# Distinct values found in the 'CTS_Code' column
data['CTS_Code'].unique()

array(['ECCS'], dtype=object)

In [8]:
# Distinct values found in the 'CTS_Full_Descriptor' column
data['CTS_Full_Descriptor'].unique()

array(['Environment, Climate Change, Climate Indicators, Surface Temperature Change'],
      dtype=object)

In [9]:
# Distinct values found in the 'CTS_Name' column
data['CTS_Name'].unique()

array(['Surface Temperature Change'], dtype=object)

In [10]:
# Keep only Country and the yearly F1961-F2022 columns to make the data easier to view.
# Indicator, Unit, Source, CTS_Code, CTS_Name and CTS_Full_Descriptor each hold a
# single value for every row; ObjectId, ISO2 and ISO3 are per-row identifiers and
# are dropped as well.
# Columns are dropped by name rather than by position so the cell does not
# depend on the column order of the CSV.

metadata_cols = [
    'ObjectId', 'ISO2', 'ISO3', 'Indicator', 'Unit', 'Source',
    'CTS_Code', 'CTS_Name', 'CTS_Full_Descriptor',
]
data_dropcols = data.drop(columns=metadata_cols)
data_dropcols.head()

Unnamed: 0,Country,F1961,F1962,F1963,F1964,F1965,F1966,F1967,F1968,F1969,...,F2013,F2014,F2015,F2016,F2017,F2018,F2019,F2020,F2021,F2022
0,"Afghanistan, Islamic Rep. of",-0.113,-0.164,0.847,-0.764,-0.244,0.226,-0.371,-0.423,-0.539,...,1.281,0.456,1.093,1.555,1.54,1.544,0.91,0.498,1.327,2.012
1,Albania,0.627,0.326,0.075,-0.166,-0.388,0.559,-0.074,0.081,-0.013,...,1.333,1.198,1.569,1.464,1.121,2.028,1.675,1.498,1.536,1.518
2,Algeria,0.164,0.114,0.077,0.25,-0.1,0.433,-0.026,-0.067,0.291,...,1.192,1.69,1.121,1.757,1.512,1.21,1.115,1.926,2.33,1.688
3,American Samoa,0.079,-0.042,0.169,-0.14,-0.562,0.181,-0.368,-0.187,0.132,...,1.257,1.17,1.009,1.539,1.435,1.189,1.539,1.43,1.268,1.256
4,"Andorra, Principality of",0.736,0.112,-0.752,0.308,-0.49,0.415,0.637,0.018,-0.137,...,0.831,1.946,1.69,1.99,1.925,1.919,1.964,2.562,1.533,3.243


In [11]:
# Remove every row that has at least one missing value, then print the shape
# before and after so the cost of dropping is visible.
data_droppedna = data_dropcols.dropna(axis=0, how='any')
print(data_dropcols.shape, data_droppedna.shape, sep='\n')

# Result: 67 of the 225 rows (~30%) contain at least one NA and are lost.

(225, 63)
(158, 63)


In [12]:
# Fill every missing value with zero instead of dropping the row.
# NOTE(review): 0 is also a legitimate measurement ("no change from baseline"),
# so filled cells are indistinguishable from real ones afterwards - confirm this
# is acceptable for the later analysis.

data_replaced = data_dropcols.fillna(0)

In [13]:
# Rename columns: strip the leading 'F' from the year labels ('F1961' -> '1961').
# Only labels made of 'F' followed by digits are changed, so 'Country' is left
# alone and re-running this cell is a no-op (blindly slicing off the first
# character would turn '1961' into '961' on a second run).
data_replaced = data_replaced.rename(
    columns=lambda label: label[1:]
    if label.startswith('F') and label[1:].isdigit()
    else label
)
data_replaced.columns

Index(['Country', '1961', '1962', '1963', '1964', '1965', '1966', '1967',
       '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976',
       '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994',
       '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003',
       '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022'],
      dtype='object')

## Day 2: Data Cleaning and Transformation
Explore data normalization and scaling.