In [11]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk

In [12]:
# Path of the by-country land temperature source file.
gw_1 = "./data/source/GlobalLandTemperatures_GlobalLandTemperaturesByCountry.csv"

# Read the file with pandas; the "dt" column is parsed into datetimes.
gw_1_df = pd.read_csv(filepath_or_buffer=gw_1, parse_dates=["dt"])

# Summary statistics of the numeric columns. Rows with NaN have not been
# dropped yet (non-null counts: 544811 avg temp, 545550 avg temp uncertainty).
gw_1_df.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,544811.0,545550.0
mean,17.193354,1.019057
std,10.953966,1.20193
min,-37.658,0.052
25%,10.025,0.323
50%,20.901,0.571
75%,25.814,1.206
max,38.842,15.003


In [13]:
# Drop every row that contains a NaN in any column, then report the
# remaining schema, the remaining row count and a preview of the data.
# (A bare describe() call in the middle of the cell was removed: only the
# last expression of a cell is displayed, so its result was thrown away.)
gw_1_df = gw_1_df.dropna(axis=0)
gw_1_df.info()
print(len(gw_1_df.index))
gw_1_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 544811 entries, 0 to 577460
Data columns (total 4 columns):
dt                               544811 non-null datetime64[ns]
AverageTemperature               544811 non-null float64
AverageTemperatureUncertainty    544811 non-null float64
Country                          544811 non-null object
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 20.8+ MB
544811


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
5,1744-04-01,1.53,4.68,Åland
6,1744-05-01,6.702,1.789,Åland
7,1744-06-01,11.609,1.577,Åland
8,1744-07-01,15.342,1.41,Åland


In [14]:
# Discard every row whose Country value is exactly "Åland". Whole rows are
# removed; the text of the remaining rows is left untouched.
gw_1_df = gw_1_df[~(gw_1_df["Country"] == "Åland")]
print(len(gw_1_df))
gw_1_df.head()

541645


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
3239,1838-04-01,13.008,2.586,Afghanistan
3241,1838-06-01,23.95,2.51,Afghanistan
3242,1838-07-01,26.877,2.883,Afghanistan
3243,1838-08-01,24.938,2.992,Afghanistan
3244,1838-09-01,18.981,2.538,Afghanistan


In [15]:
# Replace the source column names with short snake_case names.
gw_1_df = gw_1_df.rename(
    columns=dict(
        dt="date",
        AverageTemperature="avg_temp_c",
        AverageTemperatureUncertainty="temp_uncertainty",
        Country="country",
    )
)

In [16]:
# Order rows by date and then by country (both ascending) and renumber the
# index from 0 so it no longer carries the labels of the source frame.
gw_1_df = (
    gw_1_df
    .sort_values(by=["date", "country"], ascending=True)
    .reset_index(drop=True)
)
gw_1_df.head(5)

Unnamed: 0,date,avg_temp_c,temp_uncertainty,country
0,1743-11-01,8.62,2.268,Albania
1,1743-11-01,7.556,2.188,Andorra
2,1743-11-01,2.482,2.116,Austria
3,1743-11-01,0.767,2.465,Belarus
4,1743-11-01,7.106,1.855,Belgium


In [17]:
# Convert Celsius to Fahrenheit (F = C * 1.8 + 32) using vectorised column
# arithmetic instead of a per-element Python lambda, then reorder the
# columns so the Fahrenheit value sits next to the Celsius one.
gw_1_df["avg_temp_f"] = gw_1_df["avg_temp_c"] * 1.8 + 32
gw_1_df = gw_1_df[["date", "avg_temp_f", "avg_temp_c",
                   "temp_uncertainty", "country"]]
print(len(gw_1_df.index))

541645


In [18]:
# Drop data for 1743 because there is only one month of it.
# The cutoff is the first day of 1744, so no 1743 date (1743-12-31
# included) can survive the filter.
gw_1_df = gw_1_df.loc[gw_1_df["date"] >= "1744-01-01"]
print(len(gw_1_df.index))

541596


In [19]:
# Keep only rows dated strictly after 1743-12-31 and strictly before
# 2013-01-01, then print how many rows remain.
mask = (
    (gw_1_df["date"] < "2013-01-01")
    & (gw_1_df["date"] > "1743-12-31")
)
gw_1_df = gw_1_df[mask]
print(len(gw_1_df))

539647


In [20]:
# Report the earliest and latest dates left in the frame, and the row count.
gw1_min_date, gw1_max_date = gw_1_df["date"].min(), gw_1_df["date"].max()
print("min date {!s} max date {!s}".format(gw1_min_date, gw1_max_date))
print(len(gw_1_df))

min date 1744-04-01 00:00:00 max date 2012-12-01 00:00:00
539647


In [21]:
# Print the column dtypes and non-null counts, then preview the first 12 rows.
gw_1_df.info()
gw_1_df.head(n=12)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 539647 entries, 49 to 539695
Data columns (total 5 columns):
date                539647 non-null datetime64[ns]
avg_temp_f          539647 non-null float64
avg_temp_c          539647 non-null float64
temp_uncertainty    539647 non-null float64
country             539647 non-null object
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 24.7+ MB


Unnamed: 0,date,avg_temp_f,avg_temp_c,temp_uncertainty,country
49,1744-04-01,56.435,13.575,2.355,Albania
50,1744-04-01,52.0502,11.139,2.315,Andorra
51,1744-04-01,45.7682,7.649,2.547,Austria
52,1744-04-01,45.9932,7.774,3.281,Belarus
53,1744-04-01,48.4862,9.159,2.577,Belgium
54,1744-04-01,53.5838,11.991,2.557,Bosnia And Herzegovina
55,1744-04-01,53.735,12.075,2.667,Bulgaria
56,1744-04-01,55.157,12.865,2.526,Croatia
57,1744-04-01,48.1244,8.958,2.707,Czech Republic
58,1744-04-01,42.8468,6.026,3.472,Denmark (Europe)


In [22]:
# Re-sort by date then country (ascending) and renumber the index from 0;
# the filtered frame still carries its pre-filter index labels.
gw_1_df = (
    gw_1_df
    .sort_values(by=["date", "country"], ascending=True)
    .reset_index(drop=True)
)

In [23]:
# Write the cleaned by-country frame to CSV with a header row and without
# the index column.
gw_1_df.to_csv(
    path_or_buf="./output/clean_GlobalLandTemperaturesByCountry.csv",
    header=True,
    index=False,
)

In [24]:
# Append a "year" column holding the calendar year of each row's date; it
# is used as a grouping key.
gw_1_df = gw_1_df.assign(year=gw_1_df["date"].dt.year)
print(len(gw_1_df))
gw_1_df.head(5)

539647


Unnamed: 0,date,avg_temp_f,avg_temp_c,temp_uncertainty,country,year
0,1744-04-01,56.435,13.575,2.355,Albania,1744
1,1744-04-01,52.0502,11.139,2.315,Andorra,1744
2,1744-04-01,45.7682,7.649,2.547,Austria,1744
3,1744-04-01,45.9932,7.774,3.281,Belarus,1744
4,1744-04-01,48.4862,9.159,2.577,Belgium,1744


In [25]:
# Average the monthly values per (year, country) to get yearly figures.
# The value columns are selected with a LIST (double brackets): selecting
# several groupby columns with a bare tuple of names is deprecated in
# pandas 1.0 and is an error in pandas 2.0.
gw_1avg_df = (
    gw_1_df
    .groupby(["year", "country"], as_index=False)[
        ["avg_temp_f", "avg_temp_c", "temp_uncertainty"]
    ]
    .mean()
    .rename(columns={"avg_temp_f": "avg_yly_tmp_f",
                     "avg_temp_c": "avg_yly_tmp_c",
                     "temp_uncertainty": "avg_yly_tmp_uncertainty"})
)
print(len(gw_1avg_df.index))
gw_1avg_df.head()

45358


Unnamed: 0,year,country,avg_yly_tmp_f,avg_yly_tmp_c,avg_yly_tmp_uncertainty
0,1744,Albania,57.311375,14.061875,2.243
1,1744,Andorra,53.89295,12.16275,2.176625
2,1744,Austria,46.01255,7.78475,2.08025
3,1744,Belarus,46.579325,8.099625,2.41775
4,1744,Belgium,51.2258,10.681,1.846375


In [26]:
# Order the yearly averages by year then country (ascending) and renumber
# the index from 0.
gw_1avg_df = (
    gw_1avg_df
    .sort_values(by=["year", "country"], ascending=True)
    .reset_index(drop=True)
)

In [27]:
# Write the yearly per-country averages to CSV with a header row and
# without the index column.
gw_1avg_df.to_csv(
    path_or_buf="./output/clean_YearLandAvgTempsByCountry.csv",
    header=True,
    index=False,
)

In [28]:
# Path of the by-state land temperature source file.
gw_2 = "./data/source/GlobalLandTemperatures_GlobalLandTemperaturesByState.csv"

# Read the file with pandas; the "dt" column is parsed into datetimes.
gw_2_df = pd.read_csv(filepath_or_buffer=gw_2, parse_dates=["dt"])

# Show dtypes and non-null counts (reveals how many rows hold NaN), then
# preview the first rows.
gw_2_df.info()
gw_2_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645675 entries, 0 to 645674
Data columns (total 5 columns):
dt                               645675 non-null datetime64[ns]
AverageTemperature               620027 non-null float64
AverageTemperatureUncertainty    620027 non-null float64
State                            645675 non-null object
Country                          645675 non-null object
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 24.6+ MB


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [29]:
# Drop every row that contains a NaN in any column and print the resulting
# schema. (A describe() call whose result was neither displayed nor stored
# was removed.)
gw_2_df = gw_2_df.dropna(axis=0)
gw_2_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 620027 entries, 0 to 645673
Data columns (total 5 columns):
dt                               620027 non-null datetime64[ns]
AverageTemperature               620027 non-null float64
AverageTemperatureUncertainty    620027 non-null float64
State                            620027 non-null object
Country                          620027 non-null object
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 28.4+ MB


In [30]:
# Replace the source column names with short snake_case names.
gw_2_df = gw_2_df.rename(
    columns=dict(
        dt="date",
        AverageTemperature="avg_temp_c",
        AverageTemperatureUncertainty="temp_uncertainty",
        State="state",
        Country="country",
    )
)

In [31]:
# Order rows by date (ascending) and renumber the index from 0 so it no
# longer carries the labels left over from the NaN drop.
gw_2_df = (
    gw_2_df
    .sort_values(by=["date"], ascending=True)
    .reset_index(drop=True)
)

In [32]:
# Convert Celsius to Fahrenheit (F = C * 1.8 + 32) using vectorised column
# arithmetic instead of a per-element Python lambda, then reorder the
# columns so the Fahrenheit value sits next to the Celsius one.
gw_2_df["avg_temp_f"] = gw_2_df["avg_temp_c"] * 1.8 + 32
gw_2_df = gw_2_df[["date", "avg_temp_f", "avg_temp_c",
                   "temp_uncertainty", "state", "country"]]
print(len(gw_2_df.index))
gw_2_df.head(5)

620027


Unnamed: 0,date,avg_temp_f,avg_temp_c,temp_uncertainty,state,country
0,1743-11-01,30.5042,-0.831,3.427,Wisconsin,United States
1,1743-11-01,42.7154,5.953,2.22,Maryland,United States
2,1743-11-01,28.2056,-2.108,2.507,Novgorod,Russia
3,1743-11-01,21.7256,-5.708,4.134,Tatarstan,Russia
4,1743-11-01,20.8238,-6.209,3.532,Murmansk,Russia


In [33]:
# Keep only rows dated strictly after 1743-12-31 and strictly before
# 2013-01-01, then print how many rows remain.
mask = (
    (gw_2_df["date"] < "2013-01-01")
    & (gw_2_df["date"] > "1743-12-31")
)
gw_2_df = gw_2_df[mask]
print(len(gw_2_df))

617959


In [34]:
# Report the earliest and latest dates left in the frame, and the row count.
gw2_min_date, gw2_max_date = gw_2_df["date"].min(), gw_2_df["date"].max()
print("min date {!s} max date {!s}".format(gw2_min_date, gw2_max_date))
print(len(gw_2_df))

min date 1744-04-01 00:00:00 max date 2012-12-01 00:00:00
617959


In [35]:
# Order rows by date, then country, then state (all ascending) and
# renumber the index from 0.
gw_2_df = (
    gw_2_df
    .sort_values(by=["date", "country", "state"], ascending=True)
    .reset_index(drop=True)
)
print(len(gw_2_df))

617959


In [36]:
# Print the column dtypes and non-null counts, then preview the first 5 rows.
gw_2_df.info()
gw_2_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617959 entries, 0 to 617958
Data columns (total 6 columns):
date                617959 non-null datetime64[ns]
avg_temp_f          617959 non-null float64
avg_temp_c          617959 non-null float64
temp_uncertainty    617959 non-null float64
state               617959 non-null object
country             617959 non-null object
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 28.3+ MB


Unnamed: 0,date,avg_temp_f,avg_temp_c,temp_uncertainty,state,country
0,1744-04-01,38.7644,3.758,2.594,New Brunswick,Canada
1,1744-04-01,28.4036,-1.998,3.3,Newfoundland And Labrador,Canada
2,1744-04-01,40.1162,4.509,2.435,Nova Scotia,Canada
3,1744-04-01,33.4904,0.828,3.486,Ontario,Canada
4,1744-04-01,39.3548,4.086,2.575,Prince Edward Island,Canada


In [37]:
# Write the cleaned by-state frame to CSV with a header row and without
# the index column.
gw_2_df.to_csv(
    path_or_buf="./output/clean_GlobalLandTemperaturesByState.csv",
    header=True,
    index=False,
)

In [38]:
# Path of the by-major-city land temperature source file.
gw_3 = "./data/source/GlobalLandTemperatures_GlobalLandTemperaturesByMajorCity.csv"

# Read the file with pandas; the "dt" column is parsed into datetimes.
gw_3_df = pd.read_csv(filepath_or_buffer=gw_3, parse_dates=["dt"])

# Show dtypes and non-null counts, then preview the first rows.
gw_3_df.info()
gw_3_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239177 entries, 0 to 239176
Data columns (total 7 columns):
dt                               239177 non-null datetime64[ns]
AverageTemperature               228175 non-null float64
AverageTemperatureUncertainty    228175 non-null float64
City                             239177 non-null object
Country                          239177 non-null object
Latitude                         239177 non-null object
Longitude                        239177 non-null object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 12.8+ MB


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [39]:
# Remove every row that has a NaN in at least one column, then print the
# resulting schema and non-null counts.
gw_3_df = gw_3_df.dropna(axis="index", how="any")
gw_3_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228175 entries, 0 to 239175
Data columns (total 7 columns):
dt                               228175 non-null datetime64[ns]
AverageTemperature               228175 non-null float64
AverageTemperatureUncertainty    228175 non-null float64
City                             228175 non-null object
Country                          228175 non-null object
Latitude                         228175 non-null object
Longitude                        228175 non-null object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 13.9+ MB


In [40]:
# Replace the source column names with short snake_case names.
gw_3_df = gw_3_df.rename(
    columns=dict(
        dt="date",
        AverageTemperature="avg_temp_c",
        AverageTemperatureUncertainty="temp_uncertainty",
        City="city",
        Country="country",
        Latitude="latitude",
        Longitude="longitude",
    )
)

In [41]:
# Convert Celsius to Fahrenheit (F = C * 1.8 + 32) using vectorised column
# arithmetic instead of a per-element Python lambda, then reorder the
# columns so the Fahrenheit value sits next to the Celsius one.
gw_3_df["avg_temp_f"] = gw_3_df["avg_temp_c"] * 1.8 + 32
gw_3_df = gw_3_df[["date", "avg_temp_f", "avg_temp_c",
                   "temp_uncertainty", "city", "country",
                   "latitude", "longitude"]]
print(len(gw_3_df.index))
gw_3_df.head(5)

228175


Unnamed: 0,date,avg_temp_f,avg_temp_c,temp_uncertainty,city,country,latitude,longitude
0,1849-01-01,80.0672,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,81.3812,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,82.5818,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,79.052,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,77.7686,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [42]:
# Order rows by date (ascending) and renumber the index from 0 so it no
# longer carries the labels left over from the NaN drop.
gw_3_df = (
    gw_3_df
    .sort_values(by=["date"], ascending=True)
    .reset_index(drop=True)
)

In [43]:
# Discard the 1743 data (only a single month of it exists) by keeping rows
# dated 1744-01-01 or later, then print the remaining row count.
gw_3_df = gw_3_df[gw_3_df["date"] >= "1744-01-01"]
print(len(gw_3_df))

228162


In [44]:
# Keep only rows dated strictly after 1743-12-31 and strictly before
# 2013-01-01, then print how many rows remain.
mask = (
    (gw_3_df["date"] < "2013-01-01")
    & (gw_3_df["date"] > "1743-12-31")
)
gw_3_df = gw_3_df[mask]
print(len(gw_3_df))

227355


In [45]:
# Report the earliest and latest dates left in the frame, and the row count.
gw3_min_date, gw3_max_date = gw_3_df["date"].min(), gw_3_df["date"].max()
print("min date {!s} max date {!s}".format(gw3_min_date, gw3_max_date))
print(len(gw_3_df))

min date 1744-04-01 00:00:00 max date 2012-12-01 00:00:00
227355


In [46]:
# Order rows by date, then country, then city (all ascending) and
# renumber the index from 0.
gw_3_df = (
    gw_3_df
    .sort_values(by=["date", "country", "city"], ascending=True)
    .reset_index(drop=True)
)
print(len(gw_3_df))

227355


In [47]:
# Print the column dtypes and non-null counts, then preview the first 5 rows.
gw_3_df.info()
gw_3_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227355 entries, 0 to 227354
Data columns (total 8 columns):
date                227355 non-null datetime64[ns]
avg_temp_f          227355 non-null float64
avg_temp_c          227355 non-null float64
temp_uncertainty    227355 non-null float64
city                227355 non-null object
country             227355 non-null object
latitude            227355 non-null object
longitude           227355 non-null object
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 13.9+ MB


Unnamed: 0,date,avg_temp_f,avg_temp_c,temp_uncertainty,city,country,latitude,longitude
0,1744-04-01,41.2952,5.164,2.209,Montreal,Canada,45.81N,72.69W
1,1744-04-01,42.6056,5.892,2.249,Toronto,Canada,44.20N,80.50W
2,1744-04-01,50.351,10.195,2.282,Paris,France,49.03N,2.45E
3,1744-04-01,49.1648,9.536,2.761,Berlin,Germany,52.24N,13.14E
4,1744-04-01,53.8772,12.154,2.006,Rome,Italy,42.59N,13.09E


In [48]:
# For data analysis: write the cleaned by-major-city frame to CSV with a
# header row and without the index column.
gw_3_df.to_csv(
    path_or_buf="./output/clean_GlobalLandTemperaturesByMajorCity.csv",
    header=True,
    index=False,
)

In [49]:
# Path of the global temperatures source file.
gw_4 = "./data/source/GlobalLandTemperatures_GlobalTemperatures.csv"

# Read the file with pandas; the "dt" column is parsed into datetimes.
gw_4_df = pd.read_csv(filepath_or_buffer=gw_4, parse_dates=["dt"])

# Show dtypes and non-null counts, then preview the first rows.
gw_4_df.info()
gw_4_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3192 entries, 0 to 3191
Data columns (total 9 columns):
dt                                           3192 non-null datetime64[ns]
LandAverageTemperature                       3180 non-null float64
LandAverageTemperatureUncertainty            3180 non-null float64
LandMaxTemperature                           1992 non-null float64
LandMaxTemperatureUncertainty                1992 non-null float64
LandMinTemperature                           1992 non-null float64
LandMinTemperatureUncertainty                1992 non-null float64
LandAndOceanAverageTemperature               1992 non-null float64
LandAndOceanAverageTemperatureUncertainty    1992 non-null float64
dtypes: datetime64[ns](1), float64(8)
memory usage: 224.5 KB


Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


In [50]:
# Remove every row that has a NaN in at least one column, print the
# remaining row count and show summary statistics.
# NOTE(review): this also discards rows that have a land average but no
# max/min/land-and-ocean values — confirm that is intended.
gw_4_df = gw_4_df.dropna(axis="index", how="any")
print(len(gw_4_df))
gw_4_df.describe()

1992


Unnamed: 0,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
count,1992.0,1992.0,1992.0,1992.0,1992.0,1992.0,1992.0,1992.0
mean,8.571583,0.276663,14.350601,0.479782,2.743595,0.431849,15.212566,0.128532
std,4.263193,0.22403,4.309579,0.583203,4.155835,0.445838,1.274093,0.073587
min,0.404,0.034,5.9,0.044,-5.407,0.045,12.475,0.042
25%,4.43,0.09975,10.212,0.142,-1.3345,0.155,14.047,0.063
50%,8.8505,0.23,14.76,0.252,2.9495,0.279,15.251,0.122
75%,12.8585,0.34725,18.4515,0.539,6.77875,0.45825,16.39625,0.151
max,15.482,1.492,21.32,4.373,9.715,3.498,17.611,0.457


In [51]:
# Replace the source column names with short snake_case names.
gw_4_df = gw_4_df.rename(
    columns=dict(
        dt="date",
        LandAverageTemperature="land_avg_temp_c",
        LandAverageTemperatureUncertainty="land_avg_temp_uncertain",
        LandMaxTemperature="land_max_temp_c",
        LandMaxTemperatureUncertainty="land_max_temp_uncertain",
        LandMinTemperature="land_min_temp_c",
        LandMinTemperatureUncertainty="land_min_temp_uncertain",
        LandAndOceanAverageTemperature="land_ocean_avg_temp_c",
        LandAndOceanAverageTemperatureUncertainty="land_ocean_avg_temp_uncertain",
    )
)

In [52]:
# Order rows by date (ascending) and renumber the index from 0 so it no
# longer carries the labels left over from the NaN drop.
gw_4_df = (
    gw_4_df
    .sort_values(by=["date"], ascending=True)
    .reset_index(drop=True)
)
print(len(gw_4_df))
gw_4_df.head()

1992


Unnamed: 0,date,land_avg_temp_c,land_avg_temp_uncertain,land_max_temp_c,land_max_temp_uncertain,land_min_temp_c,land_min_temp_uncertain,land_ocean_avg_temp_c,land_ocean_avg_temp_uncertain
0,1850-01-01,0.749,1.105,8.242,1.738,-3.206,2.822,12.833,0.367
1,1850-02-01,3.071,1.275,9.97,3.007,-2.291,1.623,13.588,0.414
2,1850-03-01,4.954,0.955,10.347,2.401,-1.905,1.41,14.043,0.341
3,1850-04-01,7.217,0.665,12.934,1.004,1.018,1.329,14.667,0.267
4,1850-05-01,10.004,0.617,15.655,2.406,3.811,1.347,15.507,0.249


In [53]:
# Convert each Celsius measurement to Fahrenheit (F = C * 1.8 + 32).
# One vectorised expression, driven by the list of measurement prefixes,
# replaces four copy-pasted per-element lambda conversions: every
# "<prefix>_c" column gets a matching "<prefix>_f" column.
gw_4_df = gw_4_df.assign(**{
    prefix + "_f": gw_4_df[prefix + "_c"] * 1.8 + 32
    for prefix in ("land_avg_temp", "land_max_temp",
                   "land_min_temp", "land_ocean_avg_temp")
})

# Reorder so each measurement reads Fahrenheit, Celsius, uncertainty.
gw_4_df = gw_4_df[["date", "land_avg_temp_f", "land_avg_temp_c",
                   "land_avg_temp_uncertain",
                   "land_max_temp_f", "land_max_temp_c",
                   "land_max_temp_uncertain",
                   "land_min_temp_f", "land_min_temp_c",
                   "land_min_temp_uncertain",
                   "land_ocean_avg_temp_f", "land_ocean_avg_temp_c",
                   "land_ocean_avg_temp_uncertain"]]
print(len(gw_4_df.index))
gw_4_df.head(5)

1992


Unnamed: 0,date,land_avg_temp_f,land_avg_temp_c,land_avg_temp_uncertain,land_max_temp_f,land_max_temp_c,land_max_temp_uncertain,land_min_temp_f,land_min_temp_c,land_min_temp_uncertain,land_ocean_avg_temp_f,land_ocean_avg_temp_c,land_ocean_avg_temp_uncertain
0,1850-01-01,33.3482,0.749,1.105,46.8356,8.242,1.738,26.2292,-3.206,2.822,55.0994,12.833,0.367
1,1850-02-01,37.5278,3.071,1.275,49.946,9.97,3.007,27.8762,-2.291,1.623,56.4584,13.588,0.414
2,1850-03-01,40.9172,4.954,0.955,50.6246,10.347,2.401,28.571,-1.905,1.41,57.2774,14.043,0.341
3,1850-04-01,44.9906,7.217,0.665,55.2812,12.934,1.004,33.8324,1.018,1.329,58.4006,14.667,0.267
4,1850-05-01,50.0072,10.004,0.617,60.179,15.655,2.406,38.8598,3.811,1.347,59.9126,15.507,0.249


In [54]:
# Report the earliest and latest dates left in the frame, and the row count.
gw4_min_date, gw4_max_date = gw_4_df["date"].min(), gw_4_df["date"].max()
print("min date {!s} max date {!s}".format(gw4_min_date, gw4_max_date))
print(len(gw_4_df))

min date 1850-01-01 00:00:00 max date 2015-12-01 00:00:00
1992


In [55]:
# Print the column dtypes and non-null counts, then preview the first 5 rows.
gw_4_df.info()
gw_4_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1992 entries, 0 to 1991
Data columns (total 13 columns):
date                             1992 non-null datetime64[ns]
land_avg_temp_f                  1992 non-null float64
land_avg_temp_c                  1992 non-null float64
land_avg_temp_uncertain          1992 non-null float64
land_max_temp_f                  1992 non-null float64
land_max_temp_c                  1992 non-null float64
land_max_temp_uncertain          1992 non-null float64
land_min_temp_f                  1992 non-null float64
land_min_temp_c                  1992 non-null float64
land_min_temp_uncertain          1992 non-null float64
land_ocean_avg_temp_f            1992 non-null float64
land_ocean_avg_temp_c            1992 non-null float64
land_ocean_avg_temp_uncertain    1992 non-null float64
dtypes: datetime64[ns](1), float64(12)
memory usage: 202.4 KB


Unnamed: 0,date,land_avg_temp_f,land_avg_temp_c,land_avg_temp_uncertain,land_max_temp_f,land_max_temp_c,land_max_temp_uncertain,land_min_temp_f,land_min_temp_c,land_min_temp_uncertain,land_ocean_avg_temp_f,land_ocean_avg_temp_c,land_ocean_avg_temp_uncertain
0,1850-01-01,33.3482,0.749,1.105,46.8356,8.242,1.738,26.2292,-3.206,2.822,55.0994,12.833,0.367
1,1850-02-01,37.5278,3.071,1.275,49.946,9.97,3.007,27.8762,-2.291,1.623,56.4584,13.588,0.414
2,1850-03-01,40.9172,4.954,0.955,50.6246,10.347,2.401,28.571,-1.905,1.41,57.2774,14.043,0.341
3,1850-04-01,44.9906,7.217,0.665,55.2812,12.934,1.004,33.8324,1.018,1.329,58.4006,14.667,0.267
4,1850-05-01,50.0072,10.004,0.617,60.179,15.655,2.406,38.8598,3.811,1.347,59.9126,15.507,0.249


In [56]:
# For data analysis: write the cleaned global temperature frame to CSV
# with a header row and without the index column.
gw_4_df.to_csv(
    path_or_buf="./output/clean_GlobalLandTemps.csv",
    header=True,
    index=False,
)