In [5]:
# ***********************************************HANDLING MISSING DATA:
import numpy as np
import pandas as pd

# Float Series with two gaps; flag which entries are missing
# (isna is the canonical name, isnull is its alias).
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [7]:
# 4x3 frame mixing a complete row, partially missing rows and a fully missing row.
df = pd.DataFrame(
    [
        [1., 6.5, 3.],
        [1., np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# Add a column labelled 4 that contains only NaN (this mutates df in place).
df[4]=np.nan
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [9]:
# Drop only the columns in which every value is NA; a new frame is returned.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# Seeded legacy generator so that the random frames are reproducible.
rng = np.random.RandomState(42)
# 7 rows x 3 columns of standard-normal draws, wrapped directly in a DataFrame.
df3 = pd.DataFrame(data=rng.randn(7, 3))
df3

Unnamed: 0,0,1,2
0,0.496714,-0.138264,0.647689
1,1.52303,-0.234153,-0.234137
2,1.579213,0.767435,-0.469474
3,0.54256,-0.463418,-0.46573
4,0.241962,-1.91328,-1.724918
5,-0.562288,-1.012831,0.314247
6,-0.908024,-1.412304,1.465649


In [19]:
# Blank out the first four rows of column 1 and the first two rows of column 2
# (positional slices; df3 is modified in place).
df3.iloc[:4, 1]=np.nan
df3.iloc[:2, 2]=np.nan
df3

Unnamed: 0,0,1,2
0,0.496714,,
1,1.52303,,
2,1.579213,,-0.469474
3,0.54256,,-0.46573
4,0.241962,-1.91328,-1.724918
5,-0.562288,-1.012831,0.314247
6,-0.908024,-1.412304,1.465649


In [22]:
# dropna(thresh=k) keeps a row only if it holds at least k non-null values;
# pass axis=1 to apply the same test to columns instead of rows.
# Only the last expression of a cell is rendered, so the thresh=1 result is not displayed.
df3.dropna(thresh=1) # no rows are deleted
df3.dropna(thresh=2) # 2 first rows are deleted

Unnamed: 0,0,1,2
2,1.579213,,-0.469474
3,0.54256,,-0.46573
4,0.241962,-1.91328,-1.724918
5,-0.562288,-1.012831,0.314247
6,-0.908024,-1.412304,1.465649


In [25]:
# A dict passed to fillna maps column label -> replacement value,
# so each column can receive its own filler.
filled = df3.fillna(value={1: 'xx', 2: 'yy'})
filled

Unnamed: 0,0,1,2
0,0.496714,xx,yy
1,1.52303,xx,yy
2,1.579213,xx,-0.469474
3,0.54256,xx,-0.46573
4,0.241962,-1.91328,-1.724918
5,-0.562288,-1.012831,0.314247
6,-0.908024,-1.412304,1.465649


In [31]:
# Frame for the fill-with-limit demo: `limit` caps how many consecutive
# missing values are filled within each column.
df4 = pd.DataFrame(rng.randn(21).reshape(7, 3))
# Blank rows 4-6 of column 1 so there is a run of three NaNs to fill.
df4.iloc[4:, 1] = np.nan
df4  # end on the frame itself so that the cell actually renders it

Unnamed: 0,0,1,2
0,-0.342715,-0.802277,-0.161286
1,0.404051,1.886186,0.174578
2,0.25755,-0.074446,-1.918771
3,-0.026514,0.06023,2.463242
4,-0.192361,,-0.034712
5,-1.168678,,0.751933
6,0.791032,,1.402794


In [33]:
# Forward-fill: carry the last valid value downwards, filling at most `limit`
# consecutive NaNs per gap (so only row 4 of column 1 receives a value).
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
filled2 = df4.ffill(limit=1)
filled2

Unnamed: 0,0,1,2
0,-0.342715,-0.802277,-0.161286
1,0.404051,1.886186,0.174578
2,0.25755,-0.074446,-1.918771
3,-0.026514,0.06023,2.463242
4,-0.192361,0.06023,-0.034712
5,-1.168678,,0.751933
6,0.791032,,1.402794


In [39]:
# Removing duplicates: a small frame whose final row repeats the one before it.
data = pd.DataFrame(
    {
        'a': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'b': [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [44]:
# duplicated() marks a row True when the same values already occurred in ANY earlier
# row, not only in the immediately preceding one. Without arguments whole rows are
# compared; passing column names restricts the comparison to those columns.
data.duplicated(['a'])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [42]:
# Keeps only the rows for which data.duplicated() is False; the result is a new
# frame and `data` itself is left untouched (not in place by default).
data.drop_duplicates()

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [46]:
# Add a running counter column so that otherwise identical rows can be told apart.
data['c']=range(7)
data

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [47]:
# Compare rows on columns 'a' and 'b' only. keep='last' retains the final row of each
# duplicate group (c == 6 here); the default keep='first' would retain c == 5 instead.
data.drop_duplicates(subset=['a', 'b'], keep='last')

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [48]:
# Transforming & mapping: item names in inconsistent letter case plus a numeric column.
df = pd.DataFrame(
    {
        'item': [
            'apple', 'banana', 'Apple', 'orange',
            'Banana', 'Tomato', 'Orange', 'tomato',
        ],
        'number': [10, 20, 3, 24, 5, 67, 45, 6],
    }
)
df

Unnamed: 0,item,number
0,apple,10
1,banana,20
2,Apple,3
3,orange,24
4,Banana,5
5,Tomato,67
6,Orange,45
7,tomato,6


In [56]:
# Add a colour column through a lookup dict. The dict keys are lower-case,
# so the item names are normalised to lower case before being mapped.
color = dict(apple='green', banana='yellow', orange='orange', tomato='red')

lowered = df['item'].str.lower()
df['color'] = lowered.map(color)
df

Unnamed: 0,item,number,color
0,apple,10,green
1,banana,20,yellow
2,Apple,3,green
3,orange,24,orange
4,Banana,5,yellow
5,Tomato,67,red
6,Orange,45,orange
7,tomato,6,red


In [62]:
# The same result with a function passed to map instead of a dict.
# Note: color[...] raises KeyError for an item that is missing from the dict.
df['color2']=df['item'].map(lambda x: color[x.lower()])
df

Unnamed: 0,item,number,color,color2
0,apple,10,green,green
1,banana,20,yellow,yellow
2,Apple,3,green,green
3,orange,24,orange,orange
4,Banana,5,yellow,yellow
5,Tomato,67,red,red
6,Orange,45,orange,orange
7,tomato,6,red,red


In [66]:
# Renaming axis labels: 3x4 integer frame with string row and column labels.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [74]:
# Relabel the rows in place: Index.map applies the function to every label,
# here keeping the first four characters in upper case
# ('New York' therefore becomes 'NEW ' with a trailing space).
df.index=df.index.map(lambda x: x[:4].upper()) 
df

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [80]:
# rename returns a relabelled copy and leaves df itself unchanged;
# the callable (here str.upper) is applied to every column label.
df.rename(columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [81]:
# str.title capitalises the first letter of each word and lower-cases the rest.
df.rename(index=str.title)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [87]:
# rename with dicts changes only the listed column/row labels; all others are kept as is.
df.rename(columns={'two':'tWo'}, 
          index={'COLO':'XXX'})

Unnamed: 0,one,tWo,three,four
OHIO,0,1,2,3
XXX,4,5,6,7
NEW,8,9,10,11


In [88]:
# df still carries the labels it had before the rename calls: they returned copies.
df

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [107]:
# Binning with pd.cut. The second argument is either an explicit list of bin edges
# or an integer asking for that many equal-width bins between the min and max value.

df = pd.DataFrame(
    {
        'a': [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32],
        'b': np.arange(12),
    }
)
# Explicit edges -> intervals (18, 25], (25, 35], (35, 60], (60, 100].
df['bins'] = pd.cut(df['a'], [18, 25, 35, 60, 100])
# Same edges, but each interval is given a readable label.
df['bins_labels'] = pd.cut(
    df['a'],
    [18, 25, 35, 60, 100],
    labels=['Youth', 'YoungAdult', 'MiddleAged', 'Senior'],
)
# Integer -> 6 equal-width bins spanning the observed range of 'a'.
df['bins_equal'] = pd.cut(df['a'], 6)
df

Unnamed: 0,a,b,bins,bins_labels,bins_equal
0,20,0,"(18, 25]",Youth,"(19.959, 26.833]"
1,22,1,"(18, 25]",Youth,"(19.959, 26.833]"
2,25,2,"(18, 25]",Youth,"(19.959, 26.833]"
3,27,3,"(25, 35]",YoungAdult,"(26.833, 33.667]"
4,21,4,"(18, 25]",Youth,"(19.959, 26.833]"
5,23,5,"(18, 25]",Youth,"(19.959, 26.833]"
6,37,6,"(35, 60]",MiddleAged,"(33.667, 40.5]"
7,31,7,"(25, 35]",YoungAdult,"(26.833, 33.667]"
8,61,8,"(60, 100]",Senior,"(54.167, 61.0]"
9,45,9,"(35, 60]",MiddleAged,"(40.5, 47.333]"


In [105]:
# Integer code of the bin each row was assigned to (its position among the categories).
df['bins'].values.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [106]:
# The distinct bins as an IntervalIndex. Intervals are closed on the right (inclusive)
# by default; passing right=False to pd.cut closes them on the left instead.
df['bins'].values.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [100]:
# Number of rows that fall into each bin. The Series method is used because the
# top-level pd.value_counts helper is deprecated.
df['bins'].value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: bins, dtype: int64

In [114]:
# DETECTING AND FILTERING OUTLIERS:
# 1000 x 4 frame of draws from the seeded generator, followed by its summary statistics.
df=pd.DataFrame(rng.randn(1000,4)) # 2D array with 1000 rows and 4 columns
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.009864,0.02729,0.060381,0.011463
std,0.971109,1.042364,0.99915,1.029485
min,-2.936201,-3.094289,-3.329504,-3.6352
25%,-0.627535,-0.675661,-0.627219,-0.711897
50%,0.013343,0.043274,0.017144,0.025144
75%,0.631931,0.722566,0.753409,0.697749
max,4.479084,3.284118,3.42891,3.285724


In [115]:
# Indexing with a boolean frame keeps the shape: cells with |value| <= 3 show up as NaN.
df[df.abs() > 3]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [131]:
# Values in the column labelled 2 whose magnitude exceeds 3. A single .loc lookup
# replaces the chained df[mask][2] form. (Not rendered: only the last expression is.)
df.loc[df[2].abs() > 3, 2]
# Rows in which at least one value has a magnitude above 3. `axis` has to be passed
# by keyword: the positional form any(1) is rejected by pandas 2.x.
df[(df.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
140,-1.333217,3.284118,0.783304,-0.267004
255,-1.707343,-0.584229,3.040687,-0.968013
264,0.10667,0.747059,-1.321209,3.096589
269,-0.163634,1.044386,3.42891,0.115152
324,0.339529,3.186575,1.344434,0.701131
348,-0.367022,-3.082562,0.40143,-0.643872
449,-0.523104,-0.866571,0.349012,3.285724
608,-1.157934,-0.064398,-3.329504,0.247333
626,-1.155008,-2.398163,-1.408431,-3.250333
691,-1.894834,3.140423,-0.149585,1.038728


In [133]:
# PERMUTATION AND RANDOM SAMPLING:
# 5x4 frame holding the integers 0..19, filled row by row.
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [135]:
# A random ordering of the integers 0..4, i.e. one position per row of df.
sampler=rng.permutation(5)
sampler
# The reordered positions can be passed to iloc (or take) to shuffle the rows.
df.iloc[sampler]

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
4,16,17,18,19


In [136]:
# DataFrame.take selects rows by position and gives the same result as df.iloc[sampler].
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
4,16,17,18,19


In [142]:
# Draw 3 random rows (not rendered: only the last expression of the cell is).
df.sample(n=3)
# Asking for more rows than df contains requires replace=True, which samples with
# replacement so that the same row may be drawn several times.
df.sample(n=10, replace=True)

Unnamed: 0,0,1,2,3
1,4,5,6,7
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [143]:
# Converting categorical variables: a string key column plus a numeric column.
df = pd.DataFrame(
    {
        'key': list('bbacab'),
        'data1': range(6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [147]:
# One indicator column per distinct key value, named 'key_<value>'.
# dtype=int pins 0/1 integers; pandas >= 2.0 would otherwise return booleans.
df_dummy = pd.get_dummies(df['key'], prefix='key', dtype=int)

In [149]:
# Attach the indicator columns to the original frame; join matches rows by index.
df_all=df.join(df_dummy)
df_all

Unnamed: 0,key,data1,key_a,key_b,key_c
0,b,0,0,1,0
1,b,1,0,1,0
2,a,2,1,0,0
3,c,3,0,0,1
4,a,4,1,0,0
5,b,5,0,1,0


In [163]:
# *************************************************STRING MANIPULATION:
# Split on commas, then trim the whitespace left around each piece.
phrase = 'a,b, guido'
splitted = list(map(str.strip, phrase.split(',')))
splitted

['a', 'b', 'guido']

In [167]:
# Glue the pieces back together using '::' as the separator.
'::'.join(splitted)

'a::b::guido'

In [171]:
# Detecting a substring, index vs find: index raises an error when the substring
# is not found, whereas find returns -1.
# Only the last expression (the find call) is rendered as the cell output.
phrase.index('b') # = 2, commas are taken into consideration as well
phrase.find('c')

-1

In [173]:
# REGULAR EXPRESSION - REGEX
import re
# Sample text whose words are separated by mixed runs of spaces and tabs.
text = "foo bar\t baz \tqux"
print(text)

foo bar	 baz 	qux


In [175]:
# re.split first compiles the pattern and then splits the text on every match.
# The pattern is a raw string: in a normal literal '\s' is an invalid escape
# sequence and triggers a warning on current Python versions.

new_t = re.split(r'\s+', text)  # \s+ => one or more whitespace characters
new_t

['foo', 'bar', 'baz', 'qux']

In [177]:
# Or compile the pattern once and reuse the compiled object. This is recommended when
# the same expression is applied to many strings (it saves CPU cycles).
# Raw string so that '\s' is not treated as an (invalid) string escape.
regex = re.compile(r'\s+')
new_t2 = regex.split(text)
new_t2

['foo', 'bar', 'baz', 'qux']

In [178]:
# findall returns every substring matched by the pattern - here the whitespace runs themselves.
regex.findall(text)

[' ', '\t ', ' \t']

In [262]:
# Extracting the substrings that match a pattern - here e-mail addresses:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# Local part, '@', domain, a literal dot, then a top-level domain of 2 to 4 letters.
# Raw string so that '\.' reaches the regex engine without an invalid-escape warning.
correct_pattern = r'[0-9A-Z._+-]+@[0-9A-Z.-]+\.[A-Z]{2,4}'
# IGNORECASE lets the upper-case character classes match lower-case letters too.
regex = re.compile(correct_pattern, flags=re.IGNORECASE)
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [250]:
# Wrapping parts of the pattern in parentheses makes findall return one tuple of
# captured groups per match instead of the whole matched string.
re.findall('([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})', text, flags=re.IGNORECASE)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [191]:
# sub replaces every match of the compiled pattern with the given phrase.
print(regex.sub('REPLACED', text))

# subn performs the same replacement but returns a (new_string, number_of_replacements) tuple.

Dave REPLACED
Steve REPLACED
Rob REPLACED
Ryan REPLACED



In [269]:
# match tries the pattern at the start of the string; groups() then returns
# the three captured parts of the address.
m = re.match(
    '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})',
    'wesm@bright.net',
    re.IGNORECASE,
)
m.groups()

('wesm', 'bright', 'net')

In [234]:
textn = '''xxx12 x3 yy6y7 567'''
# \d+ => one or more consecutive digits. Raw string so that '\d' is not treated
# as an (invalid) string escape.
pattern = r'\d+'
# Replace every run of digits with '*'.
print(re.sub(pattern=pattern, repl='*', string=textn, flags=re.IGNORECASE))
# count=3 limits the replacement to the first three matches; subn also reports
# how many replacements were made.
print(re.subn(pattern=pattern, repl='*', string=textn, flags=re.IGNORECASE, count=3))

xxx* x* yy*y* *
('xxx* x* yy*y7 567', 3)


In [246]:
# VECTORIZED STRING FUNCTIONS:
# String Series keyed by name, with one missing entry.
data = pd.Series(
    {
        'Dave': 'aaa dave@google.com',
        'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com',
        'Wes': np.nan,
    }
)
data

Dave     aaa dave@google.com
Steve        steve@gmail.com
Rob            rob@gmail.com
Wes                      NaN
dtype: object

In [248]:
# Element-wise findall: each string yields a list of (local part, domain, tld) tuples,
# while the missing entry stays NaN.
data.str.findall('([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})', flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [275]:
# Position of the first '@' within each string; the missing entry stays NaN.
data.str.index('@') 

Dave     8.0
Steve    5.0
Rob      3.0
Wes      NaN
dtype: float64

In [277]:
# Number of '@' occurrences in each string; the missing entry stays NaN.
data.str.count('@')

Dave     1.0
Steve    1.0
Rob      1.0
Wes      NaN
dtype: float64

In [276]:
# Element-wise slicing: the first five characters of each string; the missing entry stays NaN.
data.str[:5]

Dave     aaa d
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object