# Series

In [3]:
import numpy as np
import pandas as pd

In [2]:
labels = ['a', 'b', 'c']

In [3]:
my_data = [10, 20, 30]

In [4]:
arr = np.array(my_data)

In [5]:
# Label -> value mapping; used further down to build a Series straight from a dict.
d = {'a': 10, 'b': 20, 'c': 30}

In [6]:
 labels

['a', 'b', 'c']

In [7]:
d

{'a': 10, 'b': 20, 'c': 30}

In [8]:
arr

array([10, 20, 30])

In [9]:
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [10]:
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [11]:
pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [12]:
# It can take a numpy array too
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [13]:
pd.Series(arr,labels)

a    10
b    20
c    30
dtype: int64

In [15]:
# A dict works too: its keys become the index labels and its values become the data
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [16]:
# A pandas Series can hold a variety of object types (here strings, stored as dtype object)
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [17]:
labels

['a', 'b', 'c']

In [19]:
# It can even hold functions
pd.Series(data=[sum, print, len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [21]:
# Integer Series indexed by country name.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['USA', 'Germany', 'USSR', 'Japan'])

In [22]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [23]:
# Second country-indexed Series; it has 'Italy' where ser1 has 'USSR'.
ser2 = pd.Series(data=[1, 2, 5, 4], index=['USA', 'Germany', 'Italy', 'Japan'])

In [24]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [25]:
ser1['USA']

1

In [29]:
ser3 = pd.Series(labels)

In [30]:
ser3

0    a
1    b
2    c
dtype: object

In [31]:
ser3[0]

'a'

In [33]:
# Adding two Series aligns them on their index labels and adds the matching values.
# A label that exists in only one of the two Series yields NaN in the result,
# which is also why the result is float64 rather than int64.
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

# DataFrames


In [1]:
from numpy.random import randn

In [4]:
np.random.seed(101)

In [5]:
# 5x4 frame of random normal draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [6]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [7]:
# A DataFrame is a collection of Series that all share the same index

In [8]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [9]:
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [10]:
type(df)

pandas.core.frame.DataFrame

In [12]:
type(df['W'])

pandas.core.series.Series

In [13]:
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [14]:
df['new'] = df['W'] + df['Y']

In [15]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [16]:
# drop() works along an axis: axis=0 refers to the index (the rows) and axis=1 refers to the columns, so dropping a column needs axis=1

In [17]:
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [18]:
# drop() returns a new DataFrame; to actually modify `df` itself, set the inplace flag
# (the alternative is to reassign the result: df = df.drop('new', axis=1)).
# NOTE: re-running this cell raises KeyError, because 'new' has already been removed.
df.drop('new', axis=1, inplace=True)

In [19]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
df.drop('E') # You can specify axis = 0 but that's the default

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [22]:
df.shape

(5, 4)

In [23]:
df.loc['D']

W    0.188695
X   -0.758872
Y   -0.933237
Z    0.955057
Name: D, dtype: float64

In [24]:
df.iloc[3]

W    0.188695
X   -0.758872
Y   -0.933237
Z    0.955057
Name: D, dtype: float64

# Conditional Selectors

In [25]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [26]:
booldf = df > 0

In [27]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [28]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [29]:
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [31]:
# If you pass in a boolean Series instead, pandas keeps only the rows where it is True, so you don't get null values


In [32]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
df[df['Z'] < 0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [36]:
resultdf = df[df['W'] > 0]

In [37]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


# Input / Output

In [38]:
pwd

'/Users/admin/Documents/DataScience'

# Load a CSV file into a DataFrame.
# `index=False` is an option of DataFrame.to_csv (it suppresses writing the index);
# read_csv does not accept it and raises TypeError. When reading, use `index_col=`
# if one of the file's columns should become the index.
pd.read_csv('<filename>')