# Index

In [1]:
import pandas as pd

In [2]:
# Load the ratings CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('datas/ml-latest-small/ratings.csv')

In [3]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Non-null count per column; identical counts mean no column has missing values.
data.count()

userId       100836
movieId      100836
rating       100836
timestamp    100836
dtype: int64

## Use index to look up data

In [5]:
# Use userId as the row index. drop=False keeps userId as a regular column
# too, so it can still be selected by name after becoming the index.
# The result is rebound instead of using inplace=True, which is the
# idiomatic pandas form and keeps the cell safe to re-run.
data = data.set_index('userId', drop=False)

In [6]:
# userId now appears twice: as the index and, because of drop=False, as a column.
data.head()

Unnamed: 0_level_0,userId,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,4.0,964982703
1,1,3,4.0,964981247
1,1,6,4.0,964982224
1,1,47,5.0,964983815
1,1,50,5.0,964982931


In [7]:
# Inspect the index: one label per row, so the same userId repeats many times.
data.index

Int64Index([  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
            ...
            610, 610, 610, 610, 610, 610, 610, 610, 610, 610],
           dtype='int64', name='userId', length=100836)

In [8]:
# Label-based lookup through the index: selects every row labelled 500,
# then head(5) shows the first five of them.
data.loc[500].head(5)

Unnamed: 0_level_0,userId,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
500,500,1,4.0,1005527755
500,500,11,1.0,1005528017
500,500,39,1.0,1005527926
500,500,101,1.0,1005527980
500,500,104,4.0,1005528065


In [9]:
# Same selection written as a boolean mask on the userId column instead of
# an index label; it yields the same rows as data.loc[500].
data.loc[data['userId'] == 500].head()

Unnamed: 0_level_0,userId,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
500,500,1,4.0,1005527755
500,500,11,1.0,1005528017
500,500,39,1.0,1005527926
500,500,101,1.0,1005527980
500,500,104,4.0,1005528065


## Use index to improve performance

In [10]:
from sklearn.utils import shuffle

# Randomly reorder the rows (index labels travel with their rows).
# A fixed random_state makes the shuffled order identical on every run,
# so the notebook is reproducible under Restart & Run All.
data_shuffle = shuffle(data, random_state=42)

In [11]:
# Rows are now in random order; each row keeps its own userId index label.
data_shuffle.head()

Unnamed: 0_level_0,userId,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
606,606,2360,5.0,1171362259
474,474,7888,3.5,1099315952
520,520,3793,4.0,1326609266
520,520,333,4.0,1326609984
434,434,5349,3.5,1270604360


In [12]:
# After shuffling, the index labels are no longer in ascending order.
data_shuffle.index.is_monotonic_increasing

False

In [13]:
# Each user has many ratings, so index labels repeat and the index is not unique.
data_shuffle.index.is_unique

False

In [14]:
# Baseline timing: label lookup on the shuffled (unsorted, non-unique) index.
%timeit data_shuffle.loc[500]

506 µs ± 96.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
# sort_index() returns a new frame ordered by index label; data_shuffle itself
# is left untouched so the two can be timed side by side.
data_sorted = data_shuffle.sort_index()

In [16]:
# Index labels are ascending again; rows within a single userId are not
# restored to their original file order.
data_sorted.head()

Unnamed: 0_level_0,userId,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,2012,4.0,964984176
1,1,1127,4.0,964982513
1,1,235,4.0,964980908
1,1,2470,5.0,964982588
1,1,3617,4.0,964980683


In [17]:
# Same lookup on the sorted index, to compare against the shuffled-frame timing.
%timeit data_sorted.loc[500]

203 µs ± 12.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Automatic alignment on index labels

In [18]:
# First operand for the alignment demo: values 1..3 labelled a, b, c.
s1 = pd.Series({'a': 1, 'b': 2, 'c': 3})

In [19]:
# s1 is labelled a, b, c; label 'a' has no counterpart in s2.
s1

a    1
b    2
c    3
dtype: int64

In [20]:
# Second operand for the alignment demo: values 4..6 labelled b, c, d.
s2 = pd.Series({'b': 4, 'c': 5, 'd': 6})

In [21]:
# s2 is labelled b, c, d; it shares b and c with s1, but not d.
s2

b    4
c    5
d    6
dtype: int64

In [22]:
# Addition aligns the two Series on their index labels first: b and c exist in
# both and are summed, while a and d exist in only one and come out as NaN.
# The NaN values are why the result is float64 rather than int64.
s1+s2

a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64