# Global Land Temperature by Country
* Remove incomplete rows
* Deal with error-prone columns
* Filter only US country
* Drop un-needed columns
* Change to lowercasing
* normalize the data
* save to csv

In [1]:
import pandas as pd

In [2]:
file_path = '../data/globalLandTemp.csv'
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


#### Check columns

In [4]:
df.columns

Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'Country'], dtype='object')

#### Data types

In [5]:
df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
Country                           object
dtype: object

#### length of dataset

In [6]:
df.count()

dt                               577462
AverageTemperature               544811
AverageTemperatureUncertainty    545550
Country                          577462
dtype: int64

#### Dropping any rows that are missing 

In [7]:
df = df.dropna()
df.count()

dt                               544811
AverageTemperature               544811
AverageTemperatureUncertainty    544811
Country                          544811
dtype: int64

#### Filtering only United States

In [8]:
df = df[df.Country == 'United States']
df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
554880,1768-09-01,15.420,2.880,United States
554881,1768-10-01,8.162,3.386,United States
554882,1768-11-01,1.591,3.783,United States
554883,1768-12-01,-2.882,4.979,United States
554884,1769-01-01,-3.952,4.856,United States
...,...,...,...,...
557816,2013-05-01,14.073,0.178,United States
557817,2013-06-01,20.198,0.236,United States
557818,2013-07-01,22.074,0.152,United States
557819,2013-08-01,21.168,0.249,United States


#### Change Celcius to Fahrenheit
* function convertion
* apply function to columns AverageTemperature, AverageTemperatureUncertainty


In [9]:
# define function to convert value(s) from Fahrenheit to Celsius
def cel_to_fer(x):
    
    # convert values from Fahrenheit to Celsius using Celsius = ((Fahrenheit - 32) / 1.8)
    # can take single value, single value variable, or numpy array as input
    x = (x * 9/5) + 32
    
    # returns value(s) converted from Fahrenheit to Celsius
    return(x)  

df["AverageTemperature"] = df["AverageTemperature"].apply(cel_to_fer)
df["AverageTemperatureUncertainty"] = df["AverageTemperatureUncertainty"].apply(cel_to_fer)
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
554880,1768-09-01,59.756,37.184,United States
554881,1768-10-01,46.6916,38.0948,United States
554882,1768-11-01,34.8638,38.8094,United States
554883,1768-12-01,26.8124,40.9622,United States
554884,1769-01-01,24.8864,40.7408,United States


#### Convert column dt to datetime

In [10]:
df['dt'] = pd.to_datetime(df['dt'])
df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
Country                                  object
dtype: object

#### Using .dt to extract year only

In [11]:
df['dt'] = df['dt'].dt.year
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
554880,1768,59.756,37.184,United States
554881,1768,46.6916,38.0948,United States
554882,1768,34.8638,38.8094,United States
554883,1768,26.8124,40.9622,United States
554884,1769,24.8864,40.7408,United States


#### Group by date and get the mean average

In [12]:
group_df = df.groupby(df['dt'])
group_df = group_df.mean()
group_df.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty
dt,Unnamed: 1_level_1,Unnamed: 2_level_1
1768,42.03095,38.7626
1769,50.8037,38.648075
1774,34.8854,38.5388
1775,49.0985,36.95915
1776,46.598,37.331764


In [13]:
group_df.count()

AverageTemperature               215
AverageTemperatureUncertainty    215
dtype: int64

#### Reset Index to name columns

In [14]:
group_df.reset_index(level=['dt'], inplace=True)
group_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty
0,1768,42.03095,38.7626
1,1769,50.8037,38.648075
2,1774,34.8854,38.5388
3,1775,49.0985,36.95915
4,1776,46.598,37.331764


#### lowercase/rename columns

In [15]:
group_df = group_df.rename(columns={'dt':'year','AverageTemperature':'avg_temp','AverageTemperatureUncertainty':'avg_temp_uncert'})
group_df.head()

Unnamed: 0,year,avg_temp,avg_temp_uncert
0,1768,42.03095,38.7626
1,1769,50.8037,38.648075
2,1774,34.8854,38.5388
3,1775,49.0985,36.95915
4,1776,46.598,37.331764


#### Exporting to a csv file

In [16]:
group_df.to_csv('../data_transformed/us_land_temp.csv')