## 11. How to bin a numeric series into 10 groups of equal size?

In [3]:
import pandas as pd
import numpy as np

ser = pd.Series(np.random.random(20))
print(ser.head())

# Solution
# Eleven quantile edges (0%, 10%, ..., 100%) delimit ten bins; qcut assigns
# each value the ordinal label of the bin it falls into.
decile_edges = [0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1]
decile_labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']
pd.qcut(ser, q=decile_edges, labels=decile_labels).head()

0    0.342317
1    0.147062
2    0.482797
3    0.886630
4    0.268430
dtype: float64


0    3rd
1    1st
2    4th
3    9th
4    2nd
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

## 12. How to convert a numpy array to a dataframe of given shape? (L1)

In [4]:
# Input
ser = pd.Series(np.random.randint(1, 10, 35))

In [5]:
ser

0     9
1     3
2     7
3     3
4     5
5     5
6     1
7     6
8     8
9     3
10    3
11    8
12    6
13    7
14    2
15    4
16    3
17    3
18    8
19    8
20    3
21    5
22    7
23    4
24    4
25    3
26    8
27    1
28    3
29    4
30    8
31    7
32    9
33    4
34    7
dtype: int64

In [6]:
# Solution
# Lay the series' values out row by row as a 7 x 5 grid, then wrap the grid
# in a DataFrame.
grid = np.reshape(ser.values, (7, 5))
df = pd.DataFrame(grid)
print(df)

   0  1  2  3  4
0  9  3  7  3  5
1  5  1  6  8  3
2  3  8  6  7  2
3  4  3  3  8  8
4  3  5  7  4  4
5  3  8  1  3  4
6  8  7  9  4  7


## 13. How to find the positions of numbers that are multiples of 3 from a series?

In [7]:
# Input
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    1
1    9
2    4
3    6
4    1
5    7
6    6
dtype: int64

In [8]:
print(ser)
# Positions (one per row of the result) where the value is divisible by 3.
# np.argwhere is given the plain boolean ndarray rather than the Series:
# passing the Series itself fails on recent pandas/NumPy versions, where
# NumPy tries to wrap the result back into a Series of the original length.
np.argwhere((ser % 3 == 0).values)

0    1
1    9
2    4
3    6
4    1
5    7
6    6
dtype: int64


array([[1],
       [3],
       [6]])

## 14. How to extract items at given positions from a series?

In [9]:
# Input
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

In [10]:
# Solution
ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

## 15. How to stack two series vertically and horizontally?

In [11]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

In [12]:
# Output
# Vertical: pd.concat places the series end to end, each keeping its own
# index labels. (Series.append was removed in pandas 2.0.)
pd.concat([ser1, ser2])

# Horizontal: concatenating along axis=1 aligns on the index and gives each
# series its own column.
df = pd.concat([ser1, ser2], axis=1)
print(df)

   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


In [13]:
ser1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [14]:
ser2

0    a
1    b
2    c
3    d
4    e
dtype: object

In [15]:
# Stack ser2 underneath ser1; pd.concat replaces Series.append, which was
# removed in pandas 2.0. Original index labels are kept, so they repeat.
ser3 = pd.concat([ser1, ser2])

In [16]:
ser3

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

## 16. How to get the positions of items of series A in another series B?

In [17]:
# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [18]:
# Solution 1
# For every item of ser2, take the first position in ser1 that holds it.
[int(np.where(ser1 == wanted)[0][0]) for wanted in ser2]

# Solution 2
# Same lookup through a pandas Index built from ser1's values.
list(map(pd.Index(ser1).get_loc, ser2))

[5, 4, 0, 8]

## 17. How to compute the mean squared error on a truth and predicted series?

In [19]:
# Input
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

In [20]:
print(truth)
print(pred)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
0    0.189809
1    1.028711
2    2.086924
3    3.888739
4    4.738689
5    5.079146
6    6.312122
7    7.132472
8    8.689717
9    9.766905
dtype: float64


In [21]:
# Mean squared error: the average of the squared element-wise differences.
squared_error = (truth - pred) ** 2
np.mean(squared_error)

0.2565012160212073

## 18. How to convert the first character of each element in a series to uppercase?

In [22]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [23]:
# Solution 1
# str.title applied to each element.
ser.map(str.title)

# Solution 2
# Upper-case the first character and re-attach the untouched remainder.
ser.map(lambda word: ''.join((word[0].upper(), word[1:])))

# Solution 3
# Plain-Python mapping, rebuilt into a new Series.
pd.Series(list(map(str.title, ser)))

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [24]:
a = 'how'
print(a.title())

How


In [25]:
ser

0     how
1      to
2    kick
3    ass?
dtype: object

## 19. How to calculate the number of characters in each word in a series?

In [26]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [27]:
# Solution
# The built-in len is applied to every element; no lambda wrapper is needed.
ser.map(len)

0    3
1    2
2    4
3    4
dtype: int64

## 20. How to compute difference of differences between consecutive numbers of a series?

In [28]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [29]:
# Solution
# The first difference is computed once and reused for the second one.
first_diff = ser.diff()
second_diff = first_diff.diff()
print(first_diff.tolist())
print(second_diff.tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


## 21. How to convert a series of date-strings to a timeseries?

In [30]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [31]:
# Solution 2
# Each string is parsed on its own because the entries use different date
# formats. Given the whole series at once, recent pandas versions infer a
# single format from the first element and raise on entries that do not
# match it; element-wise parsing works on old and new versions alike.
ser.map(pd.to_datetime)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

## 22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [32]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [33]:
# Solution
from dateutil.parser import parse
# Parse every date string individually; the result is a datetime series, so
# the `.dt` accessor is available below.
ser_ts = ser.map(parse)

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number (ISO week of the year; `.dt.weekofyear` was removed in pandas 2.0)
print("Week number: ", ser_ts.dt.isocalendar().week.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week (`.dt.weekday_name` was removed in pandas 1.0; use `day_name()`)
print("Day of week: ", ser_ts.dt.day_name().tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


## 24. How to filter words that contain at least 2 vowels from a series?

In [34]:
# Input
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter


def _vowel_total(word):
    """Return how many characters of `word` are vowels (a, e, i, o, u), ignoring case."""
    tally = Counter(word.lower())
    return sum(tally[vowel] for vowel in 'aeiou')


# Keep only the words containing two or more vowel characters.
mask = ser.map(lambda word: _vowel_total(word) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

## 25. How to filter valid emails from a series?

In [35]:
# Input
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])

# Solution 1 (as series of strings)
import re
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
# Compile once; match() anchors at the start of each string only.
email_re = re.compile(pattern)
mask = emails.map(lambda text: email_re.match(text) is not None)
emails[mask]

# Solution 2 (as series of list)
emails.str.findall(pattern, flags=re.IGNORECASE)

# Solution 3 (as list)
# First address found in each string; strings without any match are skipped.
[found[0] for found in map(email_re.findall, emails) if found]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

## 26. How to get the mean of a series grouped by another series?

In [37]:
# Ten labels drawn at random from three names, alongside the weights
# 1.0, 2.0, ..., 10.0.
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], size=10))
weights = pd.Series(np.linspace(start=1, stop=10, num=10))
print(weights.tolist(), fruit.tolist(), sep='\n')

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['banana', 'apple', 'carrot', 'banana', 'banana', 'banana', 'apple', 'banana', 'apple', 'apple']


In [42]:
# Put the two series side by side; being unnamed, they become columns 0 and 1.
table = pd.concat([fruit, weights], axis='columns')
print(table)

        0     1
0  banana   1.0
1   apple   2.0
2  carrot   3.0
3  banana   4.0
4  banana   5.0
5  banana   6.0
6   apple   7.0
7  banana   8.0
8   apple   9.0
9   apple  10.0


In [43]:
# Group the rows of `table` by the values in column 0 and take the mean of
# the remaining column within each group.
result = table.groupby(0).mean()

In [44]:
result

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
apple,7.0
banana,4.8
carrot,3.0


In [45]:
# Solution
# Group `weights` directly by the labels held in `fruit` and average each
# group; no intermediate DataFrame is required.
weights.groupby(fruit).mean()

apple     7.0
banana    4.8
carrot    3.0
dtype: float64

## 27. How to compute the euclidean distance between two series?

In [46]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [47]:
# Solution
# Square root of the summed squared element-wise differences.
delta = p - q
sum(delta ** 2) ** .5

# Solution (using func)
# The vector 2-norm of the difference is the same quantity.
np.linalg.norm(delta)

18.16590212458495

## 28. How to find all the local maxima (or peaks) in a numeric series?

In [48]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

In [52]:
 # np.sign of the first differences is +1 on a rise, -1 on a fall and 0 on a
 # flat step; differencing those signs yields -2 exactly where a rise is
 # immediately followed by a fall.
 dd = np.diff(np.sign(np.diff(ser)))

In [55]:
# Entry k of dd spans elements k, k+1 and k+2 of the series, so the peak
# itself sits one position to the right of each -2.
peak_locs = np.flatnonzero(dd == -2) + 1

In [56]:
peak_locs

array([1, 5, 7])

## 29. How to replace missing spaces in a string with the least frequent character?

In [57]:
# Input
my_str = 'dbc deb abed gade'

# Solution
# One character per element, so value_counts tallies character frequencies.
ser = pd.Series(list(my_str))
freq = ser.value_counts()
print(freq)
# value_counts orders by descending count, so the final label is one of the
# least frequent characters.
least_freq = freq.dropna().index[-1]
"".join(least_freq if char == ' ' else char for char in ser)

d    4
     3
b    3
e    3
a    2
g    1
c    1
dtype: int64


'dbccdebcabedcgade'

## 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

In [58]:
# Ten weekly dates anchored on Saturday, starting 2000-01-01, index ten
# random integers drawn from 1 to 9.
saturdays = pd.date_range(start='2000-01-01', periods=10, freq='W-SAT')
ser = pd.Series(data=np.random.randint(1, 10, size=10), index=saturdays)
ser

2000-01-01    7
2000-01-08    2
2000-01-15    8
2000-01-22    3
2000-01-29    1
2000-02-05    8
2000-02-12    2
2000-02-19    8
2000-02-26    9
2000-03-04    8
Freq: W-SAT, dtype: int64