# Basic Data Manipulation in Python

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
import numpy as np 
from sklearn import preprocessing

In [24]:
# load the iris dataset object (exposes the feature matrix and its metadata)
iris = load_iris()

# feature names shipped with the dataset, used as column labels
col_names = iris.feature_names

# build the DataFrame with its column labels in a single step; the dataset
# object keeps its own name so `data` only ever refers to the DataFrame
data = pd.DataFrame(iris.data, columns=col_names)

# preview the first rows
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [16]:
# per-column summary statistics (count, mean, std, min, quartiles, max)
summary_stats = data.describe()
summary_stats

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [18]:
# dimensions of the frame as (rows, columns)
n_rows, n_cols = data.shape
(n_rows, n_cols)

(150, 4)

### List Comprehension

In [20]:
# list comprehension: squares of the integers 0 through 9
S = [n * n for n in range(10)]
S

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [22]:
# list comprehension with a filter: keep only the even entries of S
M = [sq for sq in S if not sq % 2]
M

[0, 4, 16, 36, 64]

### Apply Function

In [27]:
# pandas df apply function

# sqrt each element; np.sqrt is applied to every column and the result has the
# same shape as `data`, so only the first rows are shown instead of the
# whole frame
data.apply(np.sqrt).head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,2.258318,1.870829,1.183216,0.447214
1,2.213594,1.732051,1.183216,0.447214
2,2.167948,1.788854,1.140175,0.447214
3,2.144761,1.760682,1.224745,0.447214
4,2.236068,1.897367,1.183216,0.447214


In [35]:
# pandas df apply function
# axis=0 ("index"): the function receives each column -> one result per column
# axis=1 ("columns"): the function receives each row -> one result per row
data.apply(np.mean, axis=0)  # equivalent to data.mean(axis=0)

sepal length (cm)    5.843333
sepal width (cm)     3.054000
petal length (cm)    3.758667
petal width (cm)     1.198667
dtype: float64

In [34]:
# column-wise mean using the built-in DataFrame reducer
data.mean(axis=0)

sepal length (cm)    5.843333
sepal width (cm)     3.054000
petal length (cm)    3.758667
petal width (cm)     1.198667
dtype: float64

In [37]:
# row-wise mean: with axis=1 the function receives each row; show the first rows
data.apply(np.mean, axis=1).head()  # equivalent to data.mean(axis=1).head()

0    2.550
1    2.375
2    2.350
3    2.350
4    2.550
dtype: float64

### Standardize Data

In [41]:
# standardize every feature column with sklearn's scale()
scaled_values = preprocessing.scale(data)

# wrap the scaled values in a DataFrame; no column labels are passed, so the
# columns get default integer labels rather than the original names
data_scaled = pd.DataFrame(scaled_values)

data_scaled.head()

Unnamed: 0,0,1,2,3
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977


In [47]:
# check if standardization occurred correctly.
# preprocessing.scale divides by the population standard deviation (ddof=0),
# while pandas .std() defaults to the sample estimate (ddof=1); pass ddof=0 so
# the comparison is like-for-like, and round to 6 decimals so a real deviation
# from 1 is not hidden by rounding to a whole number.
data_scaled.std(axis=0, ddof=0).round(6)

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64

In [46]:
# column means should be 0 up to floating-point error; rounding to 6 decimals
# (rather than to a whole number) keeps an offset below 0.5 from going unnoticed
data_scaled.mean(axis=0).round(6)

0   -0.0
1   -0.0
2   -0.0
3   -0.0
dtype: float64

### Categorical Variable

In [59]:
# categorical variable: the iris target labels wrapped in a Series
iris_labels = load_iris().target
data_target = pd.Series(iris_labels)

# distinct label values present in the Series
data_target.unique()

array([0, 1, 2])

In [60]:
# count of distinct labels (missing values excluded, which is the default)
data_target.nunique(dropna=True)

3

In [62]:
# frequency of each label, most frequent first (analogous to R's table())
data_target.value_counts(sort=True, ascending=False)

2    50
1    50
0    50
dtype: int64