In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-talk')

## Introduction to pandas DataFrames: manipulating and exploring data

Pandas is a Python package that provides easy-to-use data structures and data analysis tools. Its central data structure is the pandas DataFrame, which is very similar to R's data.frame and ideal for data exploration.

In [None]:
# Load in a dataset that measured participants' IQ and brain size, among some other characteristics
data = pd.read_csv('data/brain_size.csv', sep=';', na_values='.')

In [None]:
# The head() function allows you to inspect the first few entries in your dataframe. 
data.head()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
# A boolean mask built from a column selects the matching rows; here we keep
# only the female participants.
females = data.loc[data['Gender'].eq('Female')]
females.head()

In [None]:
# Descriptive statistics are a single method call away; here the mean and the
# standard deviation of the verbal IQ (VIQ) column for the female participants.
print('Mean female IQ: ')
print(females['VIQ'].mean())

print('Standard deviation: ')
print(females['VIQ'].std())

In [None]:
# The groupby method allows you to extract characteristics grouped by categorical variables. For example: the mean
# of all continuous variables grouped by gender:
data.groupby('Gender').mean()

In [None]:
# You can easily create DataFrames from Numpy arrays:
random_data = np.random.rand(20,2)
data2 = pd.DataFrame(random_data, columns=['Height', 'Weight'])
data2.head()

In [None]:
# And add other columns to your DataFrame, with a different datatype
grades = ['A', 'B'] * 10
data2['Grade'] = grades

In [None]:
data2.tail(3)

In [None]:
# You can change the index to your liking, for example here I'll reverse the index order
data2.index = np.arange(20)[::-1]

In [None]:
data2.head()

In [None]:
# iloc gives the location (like a numpy array)
data2.iloc[3]

In [None]:
data2.loc[18]

## Plotting using pandas

In [None]:
# The plotting helpers (scatter_matrix etc.) live in pandas.plotting;
# the old pandas.tools.plotting location is no longer available.
from pandas import plotting


In [None]:
# The scatter matrix gives you a nice way to explore relations in your data (diagonals show histograms)
plotting.scatter_matrix(data[['Weight', 'Height', 'MRI_Count', 'PIQ', 'FSIQ', 'VIQ']])
plt.show()

# Hypothesis testing using scipy

scipy.stats is the go-to stats package for Python. It contains a large number of probability distributions, as well as a growing library of statistical functions and tests. 

In [None]:
from scipy import stats


In [None]:
# Is the mean verbal IQ different from 100? the 1-sample t-test
stats.ttest_1samp(data['VIQ'], 100)

In [None]:
# Independent two-sample t-test: does the verbal IQ differ between females and males?
# Each group is pulled out with a single .loc[row_mask, column] lookup.
female_viq = data.loc[data['Gender'].eq('Female'), 'VIQ']
male_viq = data.loc[data['Gender'].eq('Male'), 'VIQ']
stats.ttest_ind(female_viq, male_viq)

In [None]:
# Paired or dependent samples t-test for non-independent variables (here we compare different measures 
# of IQ within the same individuals)
stats.ttest_rel(data['FSIQ'], data['PIQ'])

The Wilcoxon signed-rank test is a close sibling of the dependent samples t-test. The dependent samples t-test analyzes whether the average difference of two repeated measures is zero, so it requires metric (interval or ratio) and normally distributed data; the Wilcoxon signed-rank test instead works on ranked or ordinal data. Thus it is a common alternative to the dependent samples t-test when that test's assumptions are not met.

In [None]:
# The dependent samples t-test assumes normally distributed data. Scipy also has the standard non-parametric tests:
stats.wilcoxon(data['FSIQ'], data['PIQ'])

In [None]:
# Scipy.stats contains a wealth of probability distributions.
# As an example we draw the density of the inverse Weibull distribution.

from scipy.stats import invweibull
fig, ax = plt.subplots(1,1)
c = 10.6  # shape parameter of the distribution

# Evaluate the pdf between the 1st and the 99th percentile
x = np.linspace(invweibull.ppf(0.01, c), invweibull.ppf(0.99, c), 100)
ax.plot(x, invweibull.pdf(x, c),'k', lw=5, alpha=0.6, label='invweibull pdf')
# Without a legend the label given above is never shown; the axis labels make
# the figure readable on its own.
ax.set_xlabel('x')
ax.set_ylabel('probability density')
ax.legend()
plt.show()

# Statsmodels: 
## Linear models, ANOVA etc 

Statsmodels is a module for the estimation of many different statistical models, conducting statistical tests and statistical data exploration. It is designed to work easily with Pandas dataframes and has an R-like syntax for defining the models. It has scipy.stats as a dependency and is complementary to it rather than a substitute.

In [None]:
# Let's go back to our IQ data set:
data.head()

In [None]:
from statsmodels.formula.api import ols

In [None]:
# Ordinary least squares:
# In statsmodels, we define the model using a formula like in R. Here, we try to predict
# the full-scale IQ (FSIQ) from the MRI count; the left-hand side of '~' is the outcome,
# the right-hand side lists the predictors.
model = ols("FSIQ ~ MRI_Count", data).fit()

In [None]:
model.summary()

In [None]:
# Similarly you can use the log of the MRI count like this:
model = ols("FSIQ ~ Weight + np.log(MRI_Count)", data).fit()
model.summary()

## Categorical data: comparing groups or multiple categories

In [None]:
# Statsmodels can automatically infer a categorical variable. If you want to interpret a column of integers
# you can force it to be categorical using C(). 
model = ols("VIQ ~ C(Gender)", data).fit()

In [None]:
model.summary()

## Multiple regression

In [None]:
# Load in the iris data set (petal and sepal sizes of 3 different types of Iris flower)
iris_data = pd.read_csv('data/iris.csv')

In [None]:
iris_data.head()

In [None]:
# the plotting.scatter_matrix method allows you to plot the different categories in your data as different colours
# using the pandas.Categorical class as an entry in the 'color' keyword argument. 
categories = pd.Categorical(iris_data['name'])
categories

In [None]:
# That way, we can plot our variables in separate colours for the different flower types.
# `categories.codes` holds one integer per row identifying its category, which is
# what the colour argument `c` needs (the old `.labels` attribute no longer exists).
plotting.scatter_matrix(iris_data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], c=categories.codes)
plt.show()

In [None]:
# Statsmodels allows you to define a multiple regression model with R syntax like this:
model = ols('sepal_width ~ name + petal_length + sepal_length', iris_data).fit()

In [None]:
model.summary()

In [None]:
# We can do post-hoc testing for specific differences between our multiple predictors. 
# You can formulate contrasts with a list. Here we check the difference between versicolor and virginica:
model.f_test([0, 1, -1, 0, 0])

## Testing for interactions

In [None]:
# Testing for interactions is as simple as using the multiplication symbol in defining your model
# This way, it will test for main effects and interaction. 
model = ols('sepal_width ~ name + petal_length * petal_width', iris_data).fit()

In [None]:
model.summary()

## Analysis of variance (ANOVA)

In [None]:
import statsmodels.api as sm
table = sm.stats.anova_lm(model, typ=2)
table

## (By the way) Seaborn: easy visualisation and simple statistical fitting from pandas dataframes

In [None]:
sns.pairplot(iris_data, vars=['sepal_length','sepal_width','petal_length','petal_width'], kind='reg', diag_kind='kde')
plt.show()

In [None]:
sns.pairplot(iris_data, vars=['sepal_length','sepal_width','petal_length','petal_width'], kind='reg', hue='name')
plt.show()

# rpy2 : Using R in python

## Installation
Installing on Ubuntu is super easy: install R with apt-get and pip install rpy2 

For Mac I needed to install R through homebrew instead of the normal way (downloading from CRAN) in order for rpy2 to be able to find R. However, the homebrew version of R doesn't support X11, which is used for graphics. 

A workaround: 
- install X11 through homebrew:

```
brew cask install xquartz
```
- Install R with X11 support through this user's repo:

```
brew tap randy3k/r
brew install r-x11
```
- Install rpy2 through pip:

```
pip install rpy2 
```


## Evaluating R code

In [None]:
import rpy2.robjects as robjects

In [None]:
# robjects is your communication channel between Python and R. Basically, all rpy2 does is interface with an R
# workspace and your python workspace. 
#help(robjects)

In [None]:
# You can use the square brackets [] to get items from your R object, like the method __getitem__()
pi = robjects.r['pi']
pi


In [None]:
# The result is an R vector, not a scalar. You can index it using the normal python way.
pi[0]

In [None]:
# However the object is also callable. Any command that you pass through robjects.r('') will be interpreted as R code
# For example:
pi = robjects.r('pi')
pi[0]

In [None]:
# Here, we first define a function in r, and then call it with the argument 3, which just gives us a float vector.
robjects.r('''
        f <- function(r, verbose=FALSE) {
            if (verbose) {
                cat("I am calling f().\n")
            }
            2 * pi * r
        }
        f(3)
        ''')

In [None]:
# however, in the r workspace, that function still exists, and we can make it accessible using our [] getitem method:
r_f = robjects.r['f']

In [None]:
# We can now use this as a regular python function:
r_f(4)

In [None]:
r_f

In [None]:
# And we can see its R representation:
print(r_f.r_repr())

In [None]:
# watch out: because R represents everything as vectors, rather than scalars, we need to be careful.
# The following code doesn't do quite what you'd expect:
robjects.r['pi'] + 2 

In [None]:
# R String vectors: 
res = robjects.StrVector(['abc', 'def'])
print(res.r_repr())

In [None]:
# Int vectors:
res = robjects.IntVector([1, 2, 3])
print(res.r_repr())

In [None]:
# Float vectors
res = robjects.FloatVector([1.1, 2.2, 3.3])
print(res.r_repr())

In [None]:
# R matrixes and arrays are just vectors with a dim attribute.
# The easiest way to create such objects is to do it through R functions:

In [None]:
v = robjects.FloatVector([1.1, 2.2, 3.3, 4.4, 5.5, 6.6])
m = robjects.r['matrix'](v, nrow = 2)
print(m)

In [None]:
# Calling R functions: 
rsum = robjects.r['sum']
rsum(robjects.IntVector([1,2,3]))[0]

In [None]:
# Getting a function from R also allows you to use all of its keywords
# Note: Arguments are now given in Python style, not R style. For example, a boolean will be 'True', not 'TRUE'
rsort = robjects.r['sort']
res = rsort(robjects.IntVector([1,2,3]), decreasing=True)
print(res.r_repr())

## Graphics and plots

In [None]:
import rpy2.robjects as robjects

r = robjects.r

x = robjects.IntVector(range(10))
y = r.rnorm(10)
r.X11()

r.layout(r.matrix(robjects.IntVector([1,2,3,2]), nrow=2, ncol=2))
r.plot(r.runif(10), y, xlab="runif", ylab="foo/bar", col="red")



In [None]:
# Set-up for the rpy2 plotting examples: import the rpy2 helpers and load the
# R packages that the following cells use.
from rpy2 import robjects
from rpy2.robjects import Formula, Environment
from rpy2.robjects.vectors import IntVector, FloatVector
from rpy2.robjects.lib import grid
# NOTE: importing `data` here rebinds the name `data`, which up to this point
# referred to the brain-size DataFrame loaded at the top of the notebook.
from rpy2.robjects.packages import importr, data
from rpy2.rinterface import RRuntimeError
import warnings

# The R 'print' function
rprint = robjects.globalenv.get("print")
# NOTE: `stats` is rebound as well: from here on it is R's 'stats' package,
# no longer the scipy.stats module imported in the hypothesis-testing section.
stats = importr('stats')
grdevices = importr('grDevices')
base = importr('base')
datasets = importr('datasets')

grid.activate()


In [None]:
# You can access ggplot like this:
import math, datetime
import rpy2.robjects.lib.ggplot2 as ggplot2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
base = importr('base')

mtcars = data(datasets).fetch('mtcars')['mtcars']

In [None]:
# Standard R dataset containing fuel consumption and 10 aspects of automobile design and performance for 32 cars
mtcars

In [None]:
# We can use ggplot to plot the miles per gallon (mpg) as a function of the weight of a car, separating out categories
# of cars by their number of cylinders. 
pp = ggplot2.ggplot(mtcars) + \
     ggplot2.aes_string(x='wt', y='mpg', col='factor(cyl)') + \
     ggplot2.geom_point() + \
     ggplot2.geom_smooth(ggplot2.aes_string(group = 'cyl'),
                         method = 'lm')
pp.plot()

## Linear models

In R, specifying a linear model is very straightforward. Here's an example 
R code, where we want to predict the weight of a treatment and control group:

```
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
weight <- c(ctl, trt)

anova(lm.D9 <- lm(weight ~ group))

summary(lm.D90 <- lm(weight ~ group - 1))# omitting intercept

```


In [None]:
# we can do the same in rpy2 like this:

from rpy2.robjects import FloatVector
from rpy2.robjects.packages import importr

# R packages providing lm/anova (stats) and gl/summary (base)
stats = importr('stats')
base = importr('base')

ctl = FloatVector([4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14])
trt = FloatVector([4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69])
# Same call as gl(2, 10, 20, labels = c("Ctl","Trt")) in the R snippet
group = base.gl(2, 10, 20, labels = ["Ctl","Trt"])
# Counterpart of `c(ctl, trt)` in the R snippet: `+` on rpy2 vectors is expected to
# concatenate (Python sequence semantics) rather than add element-wise -- see the rpy2 docs.
weight = ctl + trt

# The formulas below are plain strings naming `weight` and `group`, so both
# vectors are first made available under those names in R's global environment.
robjects.globalenv["weight"] = weight
robjects.globalenv["group"] = group
lm_D9 = stats.lm("weight ~ group")
print(stats.anova(lm_D9))

# omitting the intercept
lm_D90 = stats.lm("weight ~ group - 1")
print(base.summary(lm_D90))


In [None]:
# How do we inspect the results? 
print(lm_D9.names)

In [None]:
# We can extract the R way using rx: 
lm_D9.rx2('coefficients')


In [None]:
# or the Python way
lm_D9[0]

## Principal component analysis in rpy2 

The R code for PCA on some random data is: 
```
m <- matrix(rnorm(100), ncol=5)
pca <- princomp(m)
plot(pca, main="Eigen values")
biplot(pca, main="biplot")
```

In [None]:
# In rpy2 this looks pretty similar
import rpy2.robjects as robjects


r = robjects.r
r.x11()
m = r.matrix(r.rnorm(100), ncol=5)
pca = r.princomp(m)
r.plot(pca, main="Eigen values")


In [None]:
r.x11()
r.biplot(pca, main="biplot")



In [None]:
# Creating multiple subplots

from rpy2.robjects.packages import importr
graphics = importr('graphics')
grdevices = importr('grDevices')
base = importr('base')
stats = importr('stats')

import array

x = array.array('i', range(10))
y = stats.rnorm(10)

grdevices.X11()

graphics.par(mfrow = array.array('i', [2,2]))
graphics.plot(x, y, ylab = "foo/bar", col = "red")

kwargs = {'ylab':"foo/bar", 'type':"b", 'col':"blue", 'log':"x"}
graphics.plot(x, y, **kwargs)


m = base.matrix(stats.rnorm(100), ncol=5)
pca = stats.princomp(m)
graphics.plot(pca, main="Eigen values")
stats.biplot(pca, main="biplot")


## Compare rpy2 and scipy.stats: the wilcoxon test with very few data points

In [None]:
poplar_data = pd.read_csv('data/poplar.csv')
poplar_data = poplar_data.drop('Unnamed: 0', axis=1) 

In [None]:
poplar_data.head()

In [None]:
# only take first 7 observations:
# (.iloc slices by position and excludes the end point; label-based .loc[:7]
# includes the row labelled 7 and would therefore keep 8 rows)
poplar_data = poplar_data.iloc[:7]

In [None]:
# The scipy wilcoxon test uses a normal approximation to compute the p value from the test statistic.
# This works for sufficient sample sizes, but with small sample sizes it gives problems and has no alternative.
# NOTE(review): newer SciPy releases appear to offer an exact mode for this test -- confirm against the installed version.
# scipy.stats is imported again because the rpy2 cells above rebound `stats` to R's 'stats' package.
import scipy.stats as stats
stats.wilcoxon(np.array(poplar_data['August']),np.array(poplar_data['November']), correction=True)

In [None]:
# The equivalent function in R has the option to compute the exact p value instead of using the normal approximation
# It will default to this option if the sample size is smaller than 50.
# Using rpy2:
from rpy2 import robjects
from rpy2.robjects import FloatVector

# Fetch R's wilcox.test once; it can then be called like a Python function
wilcox_r = robjects.r['wilcox.test']

# Convert the two pandas columns to R numeric vectors
august = FloatVector(poplar_data['August'])
november = FloatVector(poplar_data['November'])

# Keyword arguments are given Python-style (True instead of TRUE)
result = wilcox_r(august, november, paired=True, exact=True)



In [None]:
result

In [None]:
# The interface with rpy2 can feel a bit clunky. If you work on a project with rpy2 you might want to 
# wrap your rpy2 commands in functions:

def wilcoxon_r(x, y, **kwargs):
    """Run R's ``wilcox.test`` on two samples through rpy2.

    Parameters
    ----------
    x, y : sequence of numbers
        The two samples; each is converted to an R numeric vector with
        ``FloatVector`` before being handed to R.
    **kwargs
        Passed on unchanged to ``wilcox.test`` (for example ``paired=True``
        or ``exact=True``), written Python-style.

    Returns
    -------
    tuple
        ``(statistic, p_value)``: the first element of the ``statistic`` and
        ``p.value`` entries of the R result.
    """
    rx = FloatVector(x)
    ry = FloatVector(y)
    wilcox_r = robjects.r['wilcox.test']
    result = wilcox_r(rx, ry, **kwargs)
    # Look the entries up by their R names instead of by position, so the
    # extraction does not depend on the order of the fields in the result.
    # Each entry is an R vector, hence the [0] to get the scalar.
    statistic = result.rx2('statistic')[0]
    p_value = result.rx2('p.value')[0]
    return statistic, p_value
    

In [None]:
# This way you can use R functionality without having to think about these interfaces
wilcoxon_r(poplar_data['August'], poplar_data['November'], paired=True, exact=True)

# Converting variables from python to R

In [None]:
# Author: Charly

from rpy2.robjects.vectors import Matrix, Array, DataFrame, FloatVector, IntVector, StrVector, ListVector
import numpy as np
from pandas import DataFrame as PdDF
from collections import OrderedDict
known_r_types = Matrix, Array, DataFrame, FloatVector, IntVector, StrVector, ListVector

python_to_r_types = {
   'list': (StrVector, ),
   'dict': (ListVector, ),
   'np_array': (FloatVector, IntVector, Array, Matrix),
   'pandas_df': (DataFrame, )
}
def recursive_r_to_py(data):
   """
   Recursively convert an rpy2 object into native Python / NumPy / pandas objects.

   Dispatch is on the exact type of ``data`` (subclasses are not matched),
   using the groups defined in ``python_to_r_types``:

   * ``ListVector`` -> ``OrderedDict`` keyed by the R names, values converted recursively
   * ``StrVector`` -> ``list`` whose elements are converted recursively
   * ``FloatVector``, ``IntVector``, ``Array``, ``Matrix`` -> NumPy array, or its
     first element when the array holds exactly one value
   * ``DataFrame`` -> pandas ``DataFrame``

   :param data: an rpy2 object or an arbitrary Python object
   :return: the converted object; objects that are not rpy2 objects are
            returned unchanged, which ends the recursion
   :raises NotImplementedError: if ``is_r_type`` reports ``data`` as an R object
            that is in none of the groups above
   """

   dtype = type(data)
   if dtype in python_to_r_types['dict']:
       converted = [recursive_r_to_py(d) for d in data]
       return OrderedDict(zip(data.names, converted))
   if dtype in python_to_r_types['list']:
       return [recursive_r_to_py(d) for d in data]
   if dtype in python_to_r_types['np_array']:
       values = np.array(data)
       # R has no scalars: unwrap vectors that hold a single value
       return values[0] if values.size == 1 else values
   # The table key is 'pandas_df'; looking up 'pandas' raised a KeyError for
   # every object that did not match one of the branches above.
   if dtype in python_to_r_types['pandas_df']:
       return PdDF(data)
   if is_r_type(data):  # An unknown r class
       raise NotImplementedError('Could not proceed, type {} is not defined. '
                                 'Recognised types are: {}'.format(dtype, known_r_types))
   return data  # We reached the end of recursion