In [1]:
import pandas as pd

In [2]:
# Notebook-wide display preferences: cap the printed width and the number of
# columns shown, and render floats with thousands separators and 3 decimals.
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 75)
pd.set_option('display.float_format', '{:,.3f}'.format)

In [3]:
# Import the land temperature sample. The first line of the file is skipped
# (skiprows=1; presumably its own header row) and the explicit column names
# below are used instead.
landtemps = pd.read_csv('data/landtempssample.csv',
    names=['stationid','year','month','avgtemp','latitude',
      'longitude','elevation','station','countryid','country'],
    skiprows=1,
    low_memory=False)

# Combine year and month into a single datetime column (day fixed at 1).
# The nested-list form parse_dates=[['month','year']] is deprecated in
# pandas 2.x, so the date is assembled explicitly instead. The column keeps
# the name 'month_year' and the leading position, and the separate month and
# year columns are dropped, so the resulting frame has the same layout as
# the one the nested-list form produced.
landtemps.insert(0, 'month_year',
    pd.to_datetime(landtemps[['year','month']].assign(day=1)))
landtemps = landtemps.drop(columns=['month','year'])

In [4]:
# confirm the object returned by read_csv is a DataFrame
type(landtemps)

pandas.core.frame.DataFrame

In [5]:
# preview the first 7 rows to get a sense of how the import went
# (column names, date parsing, missing avgtemp values)
landtemps.head(7)

Unnamed: 0,month_year,stationid,avgtemp,latitude,longitude,elevation,station,countryid,country
0,2000-04-01,USS0010K01S,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1,1940-05-01,CI000085406,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile
2,2013-12-01,USC00036376,6.22,34.37,-91.124,61.0,SAINT_CHARLES,US,United States
3,1963-02-01,ASN00024002,22.93,-34.283,140.6,65.5,BERRI_IRRIGATION,AS,Australia
4,2001-11-01,ASN00028007,,-14.78,143.504,79.4,MUSGRAVE,AS,Australia
5,1991-04-01,USW00024151,5.59,42.149,-112.287,1362.5,MALAD_CITY,US,United States
6,1993-12-01,RSM00022641,-10.17,63.9,38.117,13.0,ONEGA,RS,Russia


In [6]:
# check the data type pandas assigned to each column
landtemps.dtypes

month_year    datetime64[ns]
stationid             object
avgtemp              float64
latitude             float64
longitude            float64
elevation            float64
station               object
countryid             object
country               object
dtype: object

In [7]:
# number of rows and columns in the imported frame
landtemps.shape

(100000, 9)

In [8]:
# Give the combined date column a descriptive name. rename() returns a new
# frame, which is reassigned rather than mutating in place (inplace=True
# offers no benefit and makes cell re-runs harder to reason about).
landtemps = landtemps.rename(columns={'month_year':'measuredate'})

In [9]:
# verify the date column now appears under its new name
landtemps.dtypes

measuredate    datetime64[ns]
stationid              object
avgtemp               float64
latitude              float64
longitude             float64
elevation             float64
station                object
countryid              object
country                object
dtype: object

In [10]:
# summary statistics for the average temperature column
landtemps['avgtemp'].describe()

count   85,554.000
mean        10.921
std         11.522
min        -70.700
25%          3.460
50%         12.220
75%         19.570
max         39.950
Name: avgtemp, dtype: float64

In [11]:
# count missing values in each column
landtemps.isna().sum()

measuredate        0
stationid          0
avgtemp        14446
latitude           0
longitude          0
elevation          0
station            0
countryid          0
country            5
dtype: int64

In [12]:
# Remove rows where avgtemp is missing. Only avgtemp is considered: rows
# with a missing value in another column (e.g. country) are kept.
# dropna() returns a new frame, which is reassigned rather than mutating
# in place.
landtemps = landtemps.dropna(subset=['avgtemp'])

In [13]:
# confirm the row count after dropping rows with missing avgtemp
landtemps.shape

(85554, 9)