# Pandas III
---

In [1]:
import pandas as pd

*We obtained some non-CSV data files, such as those available from http://www.ncl.ucar.edu/Applications/Data/*

In [3]:
# Naive attempt with the CSV reader: the file is not comma-separated, so each line
# lands in a single column and the first data row is taken as the header.
pd.read_csv('../data/solfrc_cntrl.dat').head()

Unnamed: 0,87.9 6.8882E+01
0,85.1 6.9243E+01
1,82.3 7.1715E+01
2,79.5 7.3179E+01
3,76.7 8.2814E+01
4,73.9 9.4552E+01


*The file has no header row and its columns are separated by spaces. We can use pandas' generic reader to open it.*

In [4]:
# The generic reader with default settings gives the same single-column result,
# because a separator matching this file has not been specified yet.
pd.read_table('../data/solfrc_cntrl.dat').head()

Unnamed: 0,87.9 6.8882E+01
0,85.1 6.9243E+01
1,82.3 7.1715E+01
2,79.5 7.3179E+01
3,76.7 8.2814E+01
4,73.9 9.4552E+01


In [5]:
# Define the separator, i.e. what splits each line of the data into columns.
# A raw string keeps the backslash in \s+ literal instead of relying on an
# invalid string escape (which newer Python versions warn about).
pd.read_table('../data/solfrc_cntrl.dat', sep=r'\s+').head()

Unnamed: 0,87.9,6.8882E+01
0,85.1,69.243
1,82.3,71.715
2,79.5,73.179
3,76.7,82.814
4,73.9,94.552


*\\s+ is a regular expression that matches one or more whitespace characters. See References for more.*

In [10]:
# Give each column a name; header=None stops the first data row from being
# consumed as a header. The raw string keeps the \s+ regex free of invalid escapes.
pd.read_table('../data/solfrc_cntrl.dat', header=None, names=['T', 'V'], sep=r'\s+').head()

Unnamed: 0,T,V
0,87.9,68.882
1,85.1,69.243
2,82.3,71.715
3,79.5,73.179
4,76.7,82.814


*__header=None__ means don't take a header from the loaded table; use the column names provided by __names__ instead.*

Instead of downloading a file to disk and then loading it, we can load it directly from a URL.

In [12]:
# Read straight from the URL with default settings to see the raw layout.
# Preview only the first rows instead of dumping the whole unparsed table.
pd.read_table('http://data.giss.nasa.gov/gistemp/tabledata_v3/GLB.Ts+dSST.txt').head(10)

Unnamed: 0,GLOBAL Land-Ocean Temperature Index in 0.01 degrees Celsius base period: 1951-1980
0,sources: GHCN-v3 1880-06/...
1,using elimination of outli...
2,Notes: 1950 DJF = Dec 1949...
3,...
4,Year Jan Feb Mar Apr May Jun Jul Aug ...
5,1880 -30 -20 -18 -28 -14 -29 -24 -8 ...
6,1881 -9 -14 1 -3 -4 -28 -6 -2 ...
7,1882 10 9 2 -20 -17 -25 -10 4 ...
8,1883 -33 -42 -17 -24 -25 -11 -8 -13 ...
9,1884 -18 -11 -34 -36 -31 -38 -34 -25 ...


*The dataset above has the following quirks, which we have to normalise:*

- dataset is separated by spaces
- has multiple lines of extra info at top
- has multiple lines of extra info at bottom
- is chunked in multiple blocks ( has multiple column name lines in between )

In [16]:
# skiprows / skipfooter drop the extra information lines above and below the table.
# skipfooter is only implemented by the Python parser, so request that engine
# explicitly instead of letting pandas emit a warning and fall back to it.
# The raw string keeps the \s+ regex free of invalid escape sequences.
df = pd.read_table('http://data.giss.nasa.gov/gistemp/tabledata_v3/GLB.Ts+dSST.txt',
                   skiprows=7, skipfooter=7, sep=r'\s+', engine='python')

  from ipykernel import kernelapp as app


In [17]:
# Preview the first rows only; the full frame has 143 rows.
df.head(10)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON,Year.1
0,1880,-30,-20,-18,-28,-14,-29,-24,-8,-17,-16,-19,-21,-20,***,****,-20,-20,-17,1880
1,1881,-9,-14,1,-3,-4,-28,-6,-2,-9,-19,-26,-15,-11,-12,-15,-2,-12,-18,1881
2,1882,10,9,2,-20,-17,-25,-10,4,-1,-22,-21,-25,-10,-9,1,-12,-10,-14,1882
3,1883,-33,-42,-17,-24,-25,-11,-8,-13,-18,-11,-20,-18,-20,-20,-33,-22,-11,-16,1883
4,1884,-18,-11,-34,-36,-31,-38,-34,-25,-23,-22,-30,-29,-27,-27,-15,-34,-32,-25,1884
5,1885,-64,-29,-23,-44,-41,-50,-28,-27,-19,-18,-22,-5,-31,-33,-41,-36,-35,-20,1885
6,1886,-41,-45,-41,-29,-27,-39,-16,-32,-19,-25,-26,-25,-30,-29,-30,-32,-29,-23,1886
7,1887,-66,-48,-31,-37,-33,-20,-19,-27,-20,-32,-26,-37,-33,-32,-46,-34,-22,-26,1887
8,1888,-43,-42,-47,-28,-22,-20,-9,-11,-8,1,-1,-13,-20,-22,-41,-33,-14,-2,1888
9,1889,-20,14,4,4,-3,-12,-5,-18,-18,-22,-30,-31,-11,-10,-6,2,-11,-23,1889


In [20]:
# (rows, columns) of the parsed table
df.shape

(143, 20)

In [21]:
# Per-column dtypes; every column comes back as object rather than a numeric type
df.dtypes

Year      object
Jan       object
Feb       object
Mar       object
Apr       object
May       object
Jun       object
Jul       object
Aug       object
Sep       object
Oct       object
Nov       object
Dec       object
J-D       object
D-N       object
DJF       object
MAM       object
JJA       object
SON       object
Year.1    object
dtype: object

In [22]:
# Row labels: a default integer RangeIndex
df.index

RangeIndex(start=0, stop=143, step=1)

In [23]:
# Column labels picked up from the header line of the file
df.columns

Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
       'Oct', 'Nov', 'Dec', 'J-D', 'D-N', 'DJF', 'MAM', 'JJA', 'SON',
       'Year.1'],
      dtype='object')

In [24]:
# Summary of the frame: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 20 columns):
Year      143 non-null object
Jan       143 non-null object
Feb       143 non-null object
Mar       143 non-null object
Apr       143 non-null object
May       143 non-null object
Jun       143 non-null object
Jul       143 non-null object
Aug       143 non-null object
Sep       143 non-null object
Oct       143 non-null object
Nov       143 non-null object
Dec       143 non-null object
J-D       143 non-null object
D-N       143 non-null object
DJF       143 non-null object
MAM       143 non-null object
JJA       143 non-null object
SON       143 non-null object
Year.1    143 non-null object
dtypes: object(20)
memory usage: 22.4+ KB


In [27]:
# Selecting a single column yields a pandas Series
type(df["Year"])

pandas.core.series.Series

In [29]:
# Raw values of the column. The 'Year' strings mixed in with the years are the
# column-name lines repeated between the blocks of the source file.
df["Year"].values

array(['1880', '1881', '1882', '1883', '1884', '1885', '1886', '1887',
       '1888', '1889', '1890', '1891', '1892', '1893', '1894', '1895',
       '1896', '1897', '1898', '1899', '1900', 'Year', '1901', '1902',
       '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910',
       '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918',
       '1919', '1920', 'Year', '1921', '1922', '1923', '1924', '1925',
       '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933',
       '1934', '1935', '1936', '1937', '1938', '1939', '1940', 'Year',
       '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948',
       '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956',
       '1957', '1958', '1959', '1960', 'Year', '1961', '1962', '1963',
       '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971',
       '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
       '1980', 'Year', '1981', '1982', '1983', '1984', '1985', '1986',
      

In [28]:
# .values exposes the column's data as a NumPy ndarray
type(df["Year"].values)

numpy.ndarray

### Loading an Excel file

*Note: this process requires the __xlrd__ module, a Python library for reading xls files.*

To install it, use:

    $ conda install xlrd

In [19]:
# With no sheet argument, read_excel loads the first sheet of the workbook.
# (The previous call ended in a dangling `sheetname=`, which is a syntax error.)
# Preview only the first rows rather than displaying the whole sheet.
pd.read_excel('../data/nepal/tas5_1900_1930.xls').head(10)

Unnamed: 0,Temperature (C),Year,Month,Country,ISO3,ISO2
0,2.218293,1901,1,NPL,,
1,4.759756,1901,2,NPL,,
2,9.471951,1901,3,NPL,,
3,14.504878,1901,4,NPL,,
4,16.886585,1901,5,NPL,,
5,20.260975,1901,6,NPL,,
6,19.513414,1901,7,NPL,,
7,18.676830,1901,8,NPL,,
8,17.182926,1901,9,NPL,,
9,14.296341,1901,10,NPL,,


*This loaded the first sheet. To select a particular sheet, pass its name:*

```python
# NOTE(review): 'tas5_1900_19030' looks like a typo for 'tas5_1900_1930' — confirm against the workbook.
# NOTE(review): pandas 0.21+ spells this keyword `sheet_name` — confirm the target pandas version.
pd.read_excel('../data/nepal/tas5_1900_1930.xls', sheetname='tas5_1900_19030')
```

In [None]:
# Combine the 'Indicator Name' column and the '2010'..'2015' columns side by side
# (axis=1) into a single dataframe, keeping the first three rows.
# .loc replaces the deprecated/removed .ix indexer; label slices include both endpoints.
# NOTE(review): the `df` built earlier in this notebook has no 'Indicator Name' or
# year columns — confirm which frame this cell is meant to run against.
df2 = pd.concat([df[['Indicator Name']], df.loc[:, '2010':'2015']], axis=1).head(3)

In [None]:
# Same two pieces concatenated along axis=0 (stacked vertically) to show how the
# axis value changes the resulting dataframe.
# .loc replaces the deprecated/removed .ix indexer; label slices include both endpoints.
# NOTE(review): requires a `df` with 'Indicator Name' and '2010'..'2015' columns —
# confirm which frame this cell is meant to run against.
pd.concat([df[['Indicator Name']], df.loc[:, '2010':'2015']], axis=0).head(3)