# Nested List Comprehension

In [60]:
all_data=[["John","Emily","Michael","Mary","Steven"],
         ["Maria","Juan","Javier","Natalia","Pilar"]]

print(all_data)

[['John', 'Emily', 'Michael', 'Mary', 'Steven'], ['Maria', 'Juan', 'Javier', 'Natalia', 'Pilar']]


We want to get a single list containing all names with two or more a's in them.

In [61]:
names_of_interest = []

# Walk the outer list one sub-list at a time, keep the names that contain
# at least two lowercase "a" characters, and add them to the flat result.
for names in all_data:
    enough_as = [candidate for candidate in names if candidate.count("a") >= 2]
    names_of_interest += enough_as

names_of_interest

['Maria', 'Natalia']

In [62]:
# Same filter as the explicit loop, written as one nested comprehension.
# The ``for`` clauses read left to right in the same order as nested loops:
# outer list first, then each name inside it, then the condition.
result = [
    person
    for group in all_data
    for person in group
    if person.count("a") >= 2
]

result

['Maria', 'Natalia']

# Functions

Functions are the primary and most important method of code organization and reuse in Python.

Functions are declared with the def keyword. A function contains a block of code with an optional use of the return keyword.

In [63]:
def my_function(x, y):
    """Return the result of ``x + y``."""
    total = x + y
    return total

In [64]:
my_function(10,20)

30

In [65]:
result=my_function(20,30)
print(result)

50


In [66]:
def func_witht_return(x):
    """Print ``x``; the call itself evaluates to ``None``."""
    print(x)
    # Spelled out for the reader: a function without a ``return`` value
    # hands back ``None`` to its caller.
    return None

result=func_witht_return("hello!")

print(result)

hello!
None


Positional Arguments

Keyword Arguments

In [67]:
def my_function(x, y, z=1.5):
    """Combine ``x + y`` with ``z``.

    Returns ``z * (x + y)`` when ``z`` is greater than 1, otherwise
    ``z / (x + y)``. ``z`` is optional and defaults to 1.5; it may be
    passed positionally or by keyword.
    """
    if not z > 1:
        return z / (x + y)
    return z * (x + y)

In [68]:
my_function(5,6)

16.5

In [69]:
my_function(5,6,z=0.7)

0.06363636363636363

In [70]:
my_function(x=10,y=20,z=30)

900

In [71]:
my_function(5,6,0.7)

0.06363636363636363

In [72]:
def func():
    """Fill a local list with 0 through 4; nothing is returned."""
    # Assigning to ``a`` inside the function makes ``a`` a local name, so the
    # list built here is not visible outside ``func``.
    a=[]
    for i in range(5):
        a.append(i)
func()

# NOTE(review): this reads a module-level ``a``, which this cell never defines.
# The value shown comes from whatever ``a`` already exists in the kernel; with
# no earlier global ``a`` this line raises NameError.
print(a)

5


In [73]:
a = []

def func():
    """Append 0 through 4 to the module-level list ``a``.

    ``a`` is only mutated, never assigned, so the name resolves to the
    global list without a ``global`` declaration.
    """
    a.extend(range(5))

func()

print(a)

[0, 1, 2, 3, 4]


In [74]:
def func():
    """Bind the module-level name ``a`` to a new list ``[0, 1, 2, 3, 4]``."""
    # ``global`` is required because the function assigns to ``a``.
    global a
    a = list(range(5))

func()

print(a)

[0, 1, 2, 3, 4]


In [75]:
def f():
    """Return the values 5, 6 and 7 together as one tuple."""
    values = (5, 6, 7)
    return values

a,b,c=f()
print(a,b,c)

5 6 7


In [76]:
def f():
    """Return the values 5, 6 and 7 in a dict keyed by "a", "b" and "c"."""
    labels = ("a", "b", "c")
    values = (5, 6, 7)
    return dict(zip(labels, values))

f()

{'a': 5, 'b': 6, 'c': 7}

In [77]:
states=["  Alabama  ","Georgia!","georgia","Georgia","FlOrida",
       "south carolina##","West virginia?"]

import re   # regular expressions

def clean_strings(strings):
    """Return a cleaned copy of each string in ``strings``.

    Each value has surrounding whitespace stripped, every ``!``, ``#`` and
    ``?`` character removed, and is then title-cased. The input is not
    modified; a new list is returned in the same order.
    """
    return [re.sub("[!#?]", "", raw.strip()).title() for raw in strings]

clean_strings(states)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

Lambda Functions

Python has support for anonymous or lambda functions, which are a way of writing functions consisting of a single expression, the result of which is the return value.

In [78]:
def short_function(x):
    """Return ``x * 2``."""
    doubled = x * 2
    return doubled

short_function(20)

40

In [79]:
equiv= lambda x:x*2

equiv(20)

40

In [80]:
equiv= lambda x,y:x*y*2

equiv(20,40)

1600

In [81]:
def apply_to_list(some_list, f):
    """Call ``f`` on every element of ``some_list`` and return the results.

    The results are collected, in order, into a new list.
    """
    transformed = []
    for item in some_list:
        transformed.append(f(item))
    return transformed

ints=[4,0,1,5,6]

apply_to_list(ints, lambda x:x*2)

[8, 0, 2, 10, 12]

In [82]:
strings = ["foo", "card", "bar", "aaaa", "abab"]

# Sort in place by the number of distinct characters in each word.
# list.sort is stable, so words with equal counts keep their original order.
strings.sort(key=lambda word: len(set(word)))

strings

['aaaa', 'foo', 'abab', 'bar', 'card']

# Errors and Exception Handling

In [83]:
float("1.2345")

1.2345

In [84]:
float("something")

ValueError: could not convert string to float: 'something'

In [None]:
def attempt_float(x):
    """Convert ``x`` to ``float`` if possible, otherwise return ``x`` unchanged.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a numeric string.

    Returns
    -------
    float or object
        ``float(x)`` when the conversion succeeds; the original ``x`` when
        ``float`` rejects it.
    """
    try:
        return float(x)
    # Catch only the errors float() raises for unconvertible input:
    # ValueError (bad string), TypeError (unsupported type) and
    # OverflowError (int too large). A bare ``except:`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return x

The code in the except part of the block will only be executed if float(x) raises an exception.

In [None]:
attempt_float("1.2345")

In [None]:
attempt_float("something")

# NumPy

NumPy, short for Numerical Python, is one of the most important foundational packages for numerical computing in Python.

In [85]:
import numpy as np

my_arr=np.arange(1_000_000)
print(my_arr)

my_list=list(range(1_000_000))
print(my_list[1:10])

[     0      1      2 ... 999997 999998 999999]
[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [86]:
%timeit my_arr2=my_arr*2

1.6 ms ± 132 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [87]:
%timeit my_list2=[x*2 for x in my_list]

92.9 ms ± 5.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


One of the key features of NumPy is its N-dimensional array object, or ndarray, which is a fast, flexible container for large datasets in Python.

In [88]:
data=np.array([[1.5,0.1,3],[0,-3,6.5]])

data

array([[ 1.5,  0.1,  3. ],
       [ 0. , -3. ,  6.5]])

In [89]:
data.shape

(2, 3)

In [90]:
data.ndim

2

In [91]:
data*10

array([[ 15.,   1.,  30.],
       [  0., -30.,  65.]])

In [92]:
data+data

array([[ 3. ,  0.2,  6. ],
       [ 0. , -6. , 13. ]])

In [93]:
data.shape

(2, 3)

In [94]:
data.dtype

dtype('float64')

In [95]:
data1=[6,7.5,8.0,1]

arr1=np.array(data1)

print(arr1)

[6.  7.5 8.  1. ]


In [96]:
arr1.ndim

1

In [97]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [98]:
np.zeros((3,6))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [99]:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [100]:
arr1=np.array([1,2,3],dtype=np.float64)

arr2=np.array([1,2,3],dtype=np.int32)

print(arr1.dtype)

print(arr2.dtype)

float64
int32


In [101]:
arr=np.array([1,2,3,4,5])

print(arr.dtype)

float_arr=arr.astype(np.float64)

print(float_arr.dtype)

print(float_arr)

int32
float64
[1. 2. 3. 4. 5.]


In [102]:
arr=np.array([[1.,2.,3.],[4.,5.,6.]])

arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [103]:
arr*arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [104]:
arr-arr

array([[0., 0., 0.],
       [0., 0., 0.]])

In [105]:
1/arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [106]:
arr**2

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [107]:
arr2=np.array([[0.,4.,1.],[7.,2.,12.]])

arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [108]:
arr2>arr

array([[False,  True, False],
       [ True, False,  True]])

In [109]:
arr=np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [110]:
arr[5]

5

In [111]:
arr[5:8]

array([5, 6, 7])

In [112]:
arr[5:8]=12
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [113]:
arr_slice=arr[5:8]

arr_slice

array([12, 12, 12])

In [114]:
arr_slice[1]=12345
arr_slice

array([   12, 12345,    12])

In [115]:
arr2d=np.array([[1,2,3],[4,5,6],[7,8,9]])

print(arr2d)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [116]:
arr2d[2]

array([7, 8, 9])

In [117]:
arr2d[2][1]

8

In [118]:
arr2d[2,1]

8

In [119]:
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [120]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

In [121]:
arr2d[:2,1:]

array([[2, 3],
       [5, 6]])

In [122]:
arr=np.arange(15).reshape((3,5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [123]:
arr=np.arange(15).reshape((5,3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [124]:
arr.T

array([[ 0,  3,  6,  9, 12],
       [ 1,  4,  7, 10, 13],
       [ 2,  5,  8, 11, 14]])

In [125]:
arr=np.array([[0,1,0],[1,2,-2],[6,3,2],[-1,0,-1],[1,0,1]])
arr

array([[ 0,  1,  0],
       [ 1,  2, -2],
       [ 6,  3,  2],
       [-1,  0, -1],
       [ 1,  0,  1]])

In [126]:
np.dot(arr.T,arr)

array([[39, 20, 12],
       [20, 14,  2],
       [12,  2, 10]])

In [127]:
arr.T@arr

array([[39, 20, 12],
       [20, 14,  2],
       [12,  2, 10]])

# Pandas

Pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and convenient in Python.

Series & DataFrame

A Series is a one-dimensional array-like object containing a sequence of values.

In [128]:
import pandas as pd

In [129]:
obj= pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [130]:
obj2= pd.Series([4,7,-5,3],index=["d","b","a","c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [131]:
#!pip install pandas

In [132]:
obj2["a"]

-5

In [133]:
obj2["d"]=6
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [134]:
obj2[["c","a","d"]]

c    3
a   -5
d    6
dtype: int64

In [135]:
obj2= pd.Series([4,7,-5,3,5],index=["d","b","a","a","c"])
obj2

d    4
b    7
a   -5
a    3
c    5
dtype: int64

In [136]:
obj2[obj2>0]

d    4
b    7
a    3
c    5
dtype: int64

In [137]:
obj2*2

d     8
b    14
a   -10
a     6
c    10
dtype: int64

In [138]:
import numpy as np

np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
a      20.085537
c     148.413159
dtype: float64

In [139]:
sdata={"Ohio":35000,"Texas":71000,"Oregon":16000,"Utah":5000}

obj3=pd.Series(sdata)

In [140]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

Data Frame

A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns each of which can be a different value type.

The DataFrame has both a row index and a column index.

In [141]:
data={"states":["Ohio","Ohio","Ohio","Nevada","Nevada","Nevada"],
      "year":[2000,2001,2002,2001,2002,2003],
      "pop":[1.5,1.7,3.6,2.4,2.9,3.2]
    }

frame=pd.DataFrame(data)

In [142]:
frame

Unnamed: 0,states,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [143]:
frame.head()

Unnamed: 0,states,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [144]:
frame.tail()

Unnamed: 0,states,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [145]:
pd.DataFrame(data, columns=["year","states","pop"])

Unnamed: 0,year,states,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [146]:
frame2=pd.DataFrame(data, columns=["year","states","pop","debt"])
frame2

Unnamed: 0,year,states,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [147]:
frame2.columns

Index(['year', 'states', 'pop', 'debt'], dtype='object')

In [148]:
frame2["states"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: states, dtype: object

In [149]:
frame2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [150]:
frame2[["states","year"]]

Unnamed: 0,states,year
0,Ohio,2000
1,Ohio,2001
2,Ohio,2002
3,Nevada,2001
4,Nevada,2002
5,Nevada,2003


In [151]:
frame2.loc[1]

year      2001
states    Ohio
pop        1.7
debt       NaN
Name: 1, dtype: object

In [152]:
frame2.iloc[2]

year      2002
states    Ohio
pop        3.6
debt       NaN
Name: 2, dtype: object

In [153]:
frame2["debt"]=16.5
frame2

Unnamed: 0,year,states,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [154]:
frame2["debt"]=np.arange(6.)
frame2

Unnamed: 0,year,states,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [155]:
frame2["eastern"]=frame2["states"]=="Ohio"
frame2

Unnamed: 0,year,states,pop,debt,eastern
0,2000,Ohio,1.5,0.0,True
1,2001,Ohio,1.7,1.0,True
2,2002,Ohio,3.6,2.0,True
3,2001,Nevada,2.4,3.0,False
4,2002,Nevada,2.9,4.0,False
5,2003,Nevada,3.2,5.0,False


In [156]:
del frame2["eastern"]

frame2.columns

Index(['year', 'states', 'pop', 'debt'], dtype='object')

In [157]:
frame2.T

Unnamed: 0,0,1,2,3,4,5
year,2000,2001,2002,2001,2002,2003
states,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,0.0,1.0,2.0,3.0,4.0,5.0


In [158]:
frame2.index.name="year"

frame2.columns.name="state"

frame2

state,year,states,pop,debt
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [159]:
data=pd.DataFrame(np.arange(16).reshape((4,4)),
                 index=["Ohio","Colorado","Utah","New York"],
                 columns=["one","two","three","four"])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [160]:
data["two"]

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [161]:
data[["three","one"]]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [162]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [163]:
data[data["three"]>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [164]:
data[data<5]=0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [165]:
data.loc["Colorado"]

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [166]:
data.loc[["Colorado","New York"],["two","three"]]

Unnamed: 0,two,three
Colorado,5,6
New York,13,14
