In [26]:
import pandas as pd
import numpy as np
import seaborn as sns

In [27]:
# create a small dictionary with different data types

dft = pd.DataFrame(dict(A = np.random.rand(3),
                        B = 1,
                        C = 'foo',
                        D = pd.Timestamp('20010102'),
                        E = pd.Series([1.0]*3).astype('float32'),
                                F = False,
                                G = pd.Series([1]*3,dtype='int8')))

dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.550694,1,foo,2001-01-02,1,False,1
1,0.941877,1,foo,2001-01-02,1,False,1
2,0.239474,1,foo,2001-01-02,1,False,1


In [28]:
# There is a really easy way to see what kind of dtypes 
# are in each column. 

dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [29]:
# If a pandas object contains data multiple dtypes IN A 
# SINGLE COLUMN, the dtype of the column will be chosen 
# to accommodate all of the data types (object is the 
# most general).
# these ints are coerced to floats

pd.Series([1, 2, 3, 4, 5, 6.])

0    1
1    2
2    3
3    4
4    5
5    6
dtype: float64

In [30]:
# string data forces an ``object`` dtype

pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3      6
4    foo
dtype: object

In [8]:
# The method get_dtype_counts() will return the number 
# of columns of each type in a DataFrame:

dft.get_dtype_counts()

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64

In [31]:
# create a small data frame. 

df = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
0,1.554321,0.684603,0.696794,-0.299315
1,-0.137118,-0.706663,0.52174,-0.746709
2,0.071512,2.265974,-0.464186,-0.609569
3,-0.464532,-1.172149,-0.461146,-0.595529
4,-1.445596,0.933018,1.86595,-1.200568


In [32]:
# Use df.apply to find the square root of all the values. 
# NaN means not a number

#df.apply(np.sqrt)
df.apply(min)

a   -1.445596
b   -1.172149
c   -0.464186
d   -1.200568
dtype: float64

In [33]:
# find the mean of all of the columns

df.apply(np.mean, axis=0)#axis means its applying to rows, but axis =0 still means columns

a   -0.084283
b    0.400956
c    0.431830
d   -0.690338
dtype: float64

In [15]:
# find the mean of all of the rows

df.apply(np.mean, axis=1)#working on the first row
#df.applymap(#this would do the function element-wise, to each cell)
df.applymap(min)

TypeError: ("'numpy.float64' object is not iterable", u'occurred at index a')

In [34]:
# Let's create a random array with 50 numbers, ranging 
# from 0 to 7.

data = np.random.randint(0, 7, size = 50)
data

array([3, 2, 0, 3, 3, 4, 1, 2, 1, 0, 3, 5, 3, 4, 5, 0, 0, 1, 5, 1, 1, 4, 3,
       3, 5, 2, 0, 0, 3, 4, 1, 2, 0, 1, 4, 0, 5, 5, 5, 1, 0, 2, 2, 2, 3, 0,
       5, 6, 5, 3])

In [35]:
# convert the array into a series

s = pd.Series(data)

In [36]:
# How many of each number is there in the series? Enter 
# value_counts()

x = pd.value_counts(s)
x.sns.hist()
#use functions on a series -- distinguish between series and df
#use .applymap() with your own functions to clean data



AttributeError: 'Series' object has no attribute 'sns'