In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 25
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 82
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [2]:
str_dat = pd.Series(['aa', 'ar', np.nan, 'Cd', None])
str_dat 

0      aa
1      ar
2     NaN
3      Cd
4    None
dtype: object

In [3]:
str_dat.isna()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [4]:
str_dat[0] = None
str_dat.isna()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [5]:
from numpy import nan as NA
data = pd.Series(['aa', 'ar', NA, 'Cd', NA])
data

0     aa
1     ar
2    NaN
3     Cd
4    NaN
dtype: object

In [6]:
data.dropna()

0    aa
1    ar
3    Cd
dtype: object

In [9]:
data[data.notna()]

0    aa
1    ar
3    Cd
dtype: object

In [10]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [16]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
data.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [26]:
df = pd.DataFrame(np.random.randn(7,3 ))
df

Unnamed: 0,0,1,2
0,-1.541996,-0.970736,-1.30703
1,0.28635,0.377984,-0.753887
2,0.331286,1.349742,0.069877
3,0.246674,-0.011862,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [27]:
df.iloc[:4, 1] = NA
df.iloc[2, 2] = NA
df

Unnamed: 0,0,1,2
0,-1.541996,,-1.30703
1,0.28635,,-0.753887
2,0.331286,,
3,0.246674,,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [36]:
df.dropna(thresh=1, axis=1)

Unnamed: 0,0,1,2
0,-1.541996,,-1.30703
1,0.28635,,-0.753887
2,0.331286,,
3,0.246674,,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [39]:
df.fillna({1:6, 2:8})

Unnamed: 0,0,1,2
0,-1.541996,6.0,-1.30703
1,0.28635,6.0,-0.753887
2,0.331286,6.0,8.0
3,0.246674,6.0,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [41]:
df.fillna({1:3., 2:4.}, inplace=True)

In [42]:
df

Unnamed: 0,0,1,2
0,-1.541996,3.0,-1.30703
1,0.28635,3.0,-0.753887
2,0.331286,3.0,4.0
3,0.246674,3.0,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [45]:
df.iloc[:4, 1] = NA
df.iloc[2, 2] = NA
df.iloc[0, 1] = 4.5
df

Unnamed: 0,0,1,2
0,-1.541996,,-1.30703
1,0.28635,4.5,-0.753887
2,0.331286,,
3,0.246674,,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [47]:
df.fillna(method='ffill', axis=1)

Unnamed: 0,0,1,2
0,-1.541996,-1.541996,-1.30703
1,0.28635,4.5,-0.753887
2,0.331286,0.331286,0.331286
3,0.246674,0.246674,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [49]:
df.fillna(method='ffill', limit=1)

Unnamed: 0,0,1,2
0,-1.541996,,-1.30703
1,0.28635,4.5,-0.753887
2,0.331286,4.5,-0.753887
3,0.246674,,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [50]:
data = pd.Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [51]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [52]:
df

Unnamed: 0,0,1,2
0,-1.541996,,-1.30703
1,0.28635,4.5,-0.753887
2,0.331286,,
3,0.246674,,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [61]:
df.fillna(df.mean(axis=0))

Unnamed: 0,0,1,2
0,-1.541996,1.082267,-1.30703
1,0.28635,4.5,-0.753887
2,0.331286,1.082267,-0.535954
3,0.246674,1.082267,1.004812
4,1.327195,-0.919262,-1.549106
5,0.022185,0.758363,-0.660524
6,0.86258,-0.010032,0.050009


In [63]:
df.mean()

0    0.219182
1    1.082267
2   -0.535954
dtype: float64

In [66]:
%time df.stack().dropna().mean()

CPU times: user 684 µs, sys: 66 µs, total: 750 µs
Wall time: 703 µs


0.15574213849375765

In [67]:
%time np.nanmean(df)

CPU times: user 170 µs, sys: 8 µs, total: 178 µs
Wall time: 174 µs


0.15574213849375773

In [72]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2':[1, 1, 2, 3, 3, 4, 4  ] },
                     index = np.arange(1,8))
data

Unnamed: 0,k1,k2
1,one,1
2,two,1
3,one,2
4,two,3
5,one,3
6,two,4
7,two,4


In [75]:
data.drop_duplicates(['k1'], keep = 'last')

Unnamed: 0,k1,k2
5,one,3
7,two,4


In [78]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                              "pastrami", "corned Beef", "bacon",
                              "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned Beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [77]:
meat2animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [82]:
lowercased = data['food'].str.lower()

In [84]:
data['animal'] = lowercased.map(meat2animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned Beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [86]:
data['food'].map(meat2animal)

0       pig
1       pig
2       pig
3       cow
4       NaN
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [87]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned Beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [91]:
def getanimal(x):
    return meat2animal[x.lower()]
data['food'].map(getanimal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [94]:
data['food'].str.lower()

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [95]:
data['food'].str

<pandas.core.strings.accessor.StringMethods at 0x7fc151453340>

In [96]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [98]:
data.replace([-999, -1000] , NA)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [99]:
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=['Ohio', 'Colorado', 'New York'],
                     columns=['one', 'two', 'three', 'four'] )
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [110]:
data.rename(index={'Ohio': 'Indiana'},
            columns={'three': 'peekabo'},
            inplace=True)


In [108]:
data

Unnamed: 0,one,two,peekabo,four
Indiana,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [109]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [111]:
def transform(x):
    return x[:4].upper()

data.index.map(transform)

Index(['INDI', 'COLO', 'NEW '], dtype='object')

In [112]:
data

Unnamed: 0,one,two,peekabo,four
Indiana,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [113]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,PEEKABO,FOUR
Indiana,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [114]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [116]:
age_categories.codes


array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [117]:
age_categories.categories


IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [118]:
age_categories.categories[0]


Interval(18, 25, closed='right')

In [119]:
pd.value_counts(age_categories)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [120]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [121]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [122]:
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)

[(0.042, 0.27], (0.5, 0.72], (0.27, 0.5], (0.5, 0.72], (0.042, 0.27], ..., (0.72, 0.95], (0.72, 0.95], (0.042, 0.27], (0.042, 0.27], (0.27, 0.5]]
Length: 20
Categories (4, interval[float64, right]): [(0.042, 0.27] < (0.27, 0.5] < (0.5, 0.72] < (0.72, 0.95]]

In [123]:
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
cats

[(0.616, 3.928], (-2.9499999999999997, -0.684], (-0.0309, 0.616], (-0.0309, 0.616], (-2.9499999999999997, -0.684], ..., (-2.9499999999999997, -0.684], (-0.684, -0.0309], (0.616, 3.928], (-0.684, -0.0309], (-0.684, -0.0309]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.684] < (-0.684, -0.0309] < (-0.0309, 0.616] < (0.616, 3.928]]

In [124]:
cats.categories

IntervalIndex([(-2.9499999999999997, -0.684], (-0.684, -0.0309], (-0.0309, 0.616], (0.616, 3.928]], dtype='interval[float64, right]')

In [126]:
pd.value_counts(cats)

(-2.9499999999999997, -0.684]    250
(-0.684, -0.0309]                250
(-0.0309, 0.616]                 250
(0.616, 3.928]                   250
Name: count, dtype: int64

In [127]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-0.0309, 1.287], (-1.187, -0.0309], (-0.0309, 1.287], (-0.0309, 1.287], (-1.187, -0.0309], ..., (-2.9499999999999997, -1.187], (-1.187, -0.0309], (-0.0309, 1.287], (-1.187, -0.0309], (-1.187, -0.0309]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -1.187] < (-1.187, -0.0309] < (-0.0309, 1.287] < (1.287, 3.928]]

In [128]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,1.337508,-0.416584,0.329313,-0.732599
1,0.137889,-0.719672,1.927640,-0.315813
2,-1.869341,-0.428579,-0.017905,-1.720285
3,0.010867,0.824170,-0.489180,1.832492
4,1.658668,0.454105,-0.194964,-0.976914
...,...,...,...,...
995,-0.457931,0.240360,-1.839647,0.578781
996,-1.365114,0.117161,-0.138244,-1.251215
997,-1.088757,-2.093583,-0.111520,-0.475629
998,0.595405,1.017575,0.345292,1.588491


In [133]:
col = data[2 ]
col[col > 2]

50     2.463238
52     2.735527
83     2.389645
130    2.268799
139    2.377020
166    2.424667
177    2.152572
197    2.094259
219    2.242465
313    2.213145
328    2.036981
407    2.194551
532    2.043916
610    2.275784
749    2.221307
764    2.611678
810    2.044813
880    2.482286
979    2.238196
Name: 2, dtype: float64

In [140]:
data[(data > 2).any(axis=1)]

Unnamed: 0,0,1,2,3
11,0.317635,2.458842,-0.878777,0.850684
25,2.013067,-0.543725,-0.654147,-0.496376
50,-0.284079,-0.708196,2.463238,0.217453
52,-0.707213,-0.560107,2.735527,0.927335
55,1.951312,3.260383,0.963301,1.201206
...,...,...,...,...
965,1.547291,0.548424,-0.284581,2.052463
979,0.175056,-1.029444,2.238196,-0.928520
983,1.480221,-0.125103,-1.033043,2.445125
985,-0.752661,1.163600,-0.071575,2.497899


In [141]:
col = data[2]
col[col.abs() > 3]

36    -3.399312
131   -3.745356
Name: 2, dtype: float64

In [158]:
data.loc[:, (data > 2 ).any()]

Unnamed: 0,0,1,2,3
0,1.337508,-0.416584,0.329313,-0.732599
1,0.137889,-0.719672,1.927640,-0.315813
2,-1.869341,-0.428579,-0.017905,-1.720285
3,0.010867,0.824170,-0.489180,1.832492
4,1.658668,0.454105,-0.194964,-0.976914
...,...,...,...,...
995,-0.457931,0.240360,-1.839647,0.578781
996,-1.365114,0.117161,-0.138244,-1.251215
997,-1.088757,-2.093583,-0.111520,-0.475629
998,0.595405,1.017575,0.345292,1.588491


In [159]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.002332,-0.002111,-0.057986,-0.049125
std,1.040006,1.017091,0.979317,1.00222
min,-3.183867,-3.481593,-3.194414,-3.108915
25%,-0.739557,-0.696704,-0.756741,-0.736424
50%,0.016668,0.034523,-0.055401,-0.037073
75%,0.700698,0.702548,0.616784,0.603749
max,3.18994,2.961194,3.02372,2.916153


In [163]:
col = data[2]
col

0      0.299675
1     -0.161442
2     -0.935589
3     -1.131414
4     -0.369671
         ...   
995    0.253912
996    0.433294
997    0.457039
998   -0.798525
999   -0.801394
Name: 2, Length: 1000, dtype: float64

In [161]:
data.head()

Unnamed: 0,0,1,2,3
0,1.442611,0.34231,0.299675,-0.550621
1,1.809901,0.025721,-0.161442,2.771398
2,0.460863,1.694641,-0.935589,0.766984
3,-2.040757,-0.882594,-1.131414,0.490516
4,-0.032514,0.51958,-0.369671,-0.463248


In [166]:
data[np.abs(data) > 3] = np.sign(data) *3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.002261,-0.001125,-0.05777,-0.049016
std,1.038181,1.013995,0.978503,1.001893
min,-3.0,-3.0,-3.0,-3.0
25%,-0.739557,-0.696704,-0.756741,-0.736424
50%,0.016668,0.034523,-0.055401,-0.037073
75%,0.700698,0.702548,0.616784,0.603749
max,3.0,2.961194,3.0,2.916153


In [167]:
df = pd.DataFrame(np.arange(5*4).reshape(5,4))
sampler = np.random.permutation(5)
sampler

array([1, 4, 2, 0, 3])

In [168]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [169]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


In [170]:
df.sample(3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7


In [172]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)

In [174]:
draws

4    4
1    7
3    6
2   -1
0    5
2   -1
3    6
0    5
4    4
2   -1
dtype: int64

In [176]:
df = pd.DataFrame({'key': list('bbacab'), 
                   'data1':range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [177]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [178]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [179]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,False,True,False
1,1,False,True,False
2,2,True,False,False
3,3,False,False,True
4,4,True,False,False
5,5,False,True,False


In [180]:
dummies.join(df[['data1']])

Unnamed: 0,key_a,key_b,key_c,data1
0,False,True,False,0
1,False,True,False,1
2,True,False,False,2
3,False,False,True,3
4,True,False,False,4
5,False,True,False,5


In [181]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table("datasets/movielens/movies.dat", sep="::",
                       header=None, names=mnames, engine="python")
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [183]:
all_genres = []
for lst in movies['genres']:
    all_genres.extend(lst.split("|"))
genres = pd.unique(all_genres)

In [186]:
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [188]:
for i, lst in enumerate(movies['genres']):
    indices = dummies.columns.get_indexer(lst.split("|"))
    dummies.iloc[i, indices] = 1

In [None]:
# 创建一个索引对象
index = pd.Index(['a', 'b', 'c', 'd', 'e'])

# 定义目标索引
target = ['b', 'c', 'x']

# 使用get_indexer方法获取位置
indexer = index.get_indexer(target)

print(indexer)

In [189]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
movies_windic =  movies.join(dummies.add_prefix('Geres_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Geres_Animation                              1.0
Geres_Children's                             1.0
Geres_Comedy                                 1.0
Geres_Adventure                              0.0
Geres_Fantasy                                0.0
Geres_Romance                                0.0
Geres_Drama                                  0.0
Geres_Action                                 0.0
Geres_Crime                                  0.0
Geres_Thriller                               0.0
Geres_Horror                                 0.0
Geres_Sci-Fi                                 0.0
Geres_Documentary                            0.0
Geres_War                                    0.0
Geres_Musical                                0.0
Geres_Mystery                                0.0
Geres_Film-Noir                              0.0
Geres_Western       

In [194]:

df = pd.DataFrame(
    [
        [1000, "male", 23],
        [1001, "female", 22],
        [1002, "male", 69]
    ],
    columns=['id', 'gender', 'age']
).set_index('id')
df

Unnamed: 0_level_0,gender,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,male,23
1001,female,22
1002,male,69


In [195]:
dummy_df = pd.get_dummies(df['gender'])

In [202]:
df = pd.concat([df, dummy_df], axis=1)

In [205]:
df.drop('gender', axis=1, inplace=True)

In [None]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 25
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 82
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [None]:
import numpy as np
import pandas as pd

In [None]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64