# __Introduction to Python for Data Science__
## _CSE Mentor Program - University of Colorado, Denver. Spring-2019_

This workshop is intended to introduce Python to Undergrad and Graduate students in the context of Data Science techniques. 

During three sessions we will cover the basics of the Python language, the use of Pandas to access and manipulate data, and the Scikit-Learn library to do some basic analysis.

# Session 2 - Introduction to NumPy and Matplotlib
In this session we will focus on NumPy, a library designed for managing numeric data, and Matplotlib, a library designed for plotting datasets.

<hr/>


In [None]:
import numpy as np   #After running this line np will reference the numpy class


import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as mplstyle


import seaborn as sns

# NumPy
This library allows us to manipulate large single- or multi-dimensional arrays of numbers.

To use NumPy, we first need to import the library package (done above, under the alias `np`).


In [None]:
# Build a one-dimensional array from a plain Python list of integers.
a = np.array([1, 2, 3, 4])

# Report the contents, the container type and the element data type.
print(f"Array a: {a}")
print(f"a is of type {type(a)}")
print(f"a elements data type is: {a.dtype}")

In [None]:
# Same values as before, but the first element is written as a float literal (1.0).
a = np.array([1.0, 2, 3, 4])

# Print the same three facts as in the previous cell so they can be compared.
print(f"Array a: {a}")
print(f"a is of type {type(a)}")
print(f"a elements data type is: {a.dtype}")
print("\n>>> Note the difference between a in this cell and a in the previous one <<<")

In [None]:
# A two-dimensional array: four rows of two integers each.
a = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])

print("Matrix A:")
print(a)
print()

# Container type, element type and dimensions of the matrix.
print(f"A has type: {type(a)}")
print(f"A datatype is: {a.dtype}")
print(f"A shape is : {a.shape}")
print()

# Indexing with one index yields a row; indexing with two yields a single element.
print(f"First Row: {a[0]} {type(a[0])}")
print(f"First Element: {a[0, 0]} {type(a[0, 0])}")

In [None]:
# --- Arrays created with a given shape ---------------------------------------
zeros = np.zeros((2, 3))            # 2 rows, 3 columns, all 0.0
print("Zeros Matrix")
print(zeros)
print("zeros's shape:", zeros.shape)
print()

ones = np.ones((2, 3))              # 2 rows, 3 columns, all 1.0
print("Ones Matrix")
print(ones)
print("ones's shape:", ones.shape)
print()

empty = np.empty((5, 9))            # values are NOT initialized; try new dimensions
print("Un-Initialized Matrix")
print(empty)
print("empty's shape:", empty.shape)
print()

multi_dim = np.ones((5, 4, 3))      # three dimensions; try new dimensions
print("Multi-Dimensional Matrix")
print(multi_dim)
print("multi_dim's shape:", multi_dim.shape)
print()

# --- Re-shaping ---------------------------------------------------------------
# 5 * 4 * 3 == 2 * 30 == 60 elements, so the same data can be seen as 2 x 30.
two_dim = multi_dim.reshape(2, 30)
print("Multi-Dimensional Matrix Re-shaped as (2,30)")
print(two_dim)
print("two_dim's shape:", two_dim.shape)
print()

# Fill the matrix with 1..60 in row-major order using one vectorized, in-place
# assignment instead of a Python loop over every (i, j) pair.
# two_dim is a view on multi_dim's data here, so multi_dim is overwritten too.
two_dim[...] = np.arange(1, two_dim.size + 1).reshape(two_dim.shape)
print("New Two Dimensional")
print(two_dim)
print("Back to multi-dimensional")
print(two_dim.reshape(5, 4, 3))

#### Playing with $\pi$

In [None]:
# Print the value NumPy exposes under the name `pi`.
print(np.pi)

In [None]:
# Nine evenly spaced values from 0 to 2.
linear_space = np.linspace(0, 2, num=9)
print(f"Linear space of 9 numbers from 0 to 2 {linear_space}")

# Multiplying the array by a scalar multiplies every element.
print(np.pi * linear_space)

In [None]:
# Sample sin(x) between 0 and 2*pi, print the samples and plot them.
# The constant must be spelled `np.pi`: a bare `pi` is not defined in this
# notebook and raises NameError.
x = np.linspace(0, 2 * np.pi, 20)   # 20 values from 0 to 2*pi
y = np.sin(x)                       # compute the function sin(x) for the values of x
print("\nX valueS:\n",x)
print("\nsin(x) values:\n",y)
# Pair every x with its sin(x) so the samples print as (x, y) rows.
print("\n(x,y) values for (x,sin(x)):\n",np.array(list(zip(x,y))))
plt.plot(x,y)

In [None]:
# The integers 0..99 as a flat array.
x = np.arange(100)
print(x)

# The same 100 values arranged as 10 rows of 10 columns.
y = np.reshape(x, (10, 10))
print("\nArray re-shaped as a 10x10 matrix:")
print(y)

In [None]:
# Two 5x5 matrices, each holding 0..24 row by row.
A = np.reshape(np.arange(25), (5, 5))
B = np.reshape(np.arange(25), (5, 5))

print(A)
print()
print(B)
print()

# Squaring and multiplying with * operate element by element ...
print(f"A squared (element wise):\n {np.square(A)}")
print()
print(f"Element wise product:\n {np.multiply(A, B)}")
print()

# ... whereas dot() computes the matrix product.
print(f"Dot product:\n {np.dot(A, B)}")

In [None]:
# A 5x5 matrix holding 0..24 and a length-5 vector holding 0..4.
A = np.reshape(np.arange(25), (5, 5))
B = np.arange(5)

print("Matrix A")
print(A)
print()

print("Vector B")
print(B)
print()

# Matrix-vector product: one value per row of A.
print("A.B")
print(np.dot(A, B))

In [None]:
# A 2x3 matrix of random floats.
A = np.random.random((2, 3))
print("Random Matrix A")
print(A)
print()

# Reductions over the whole matrix and over its first row.
print(f"Sum of elements of A: {A.sum()}")
print(f"Sum of first row of A: {A[0].sum()}")
print()
print(f"Number of elements of first row of A: {A[0].size}")
print()
print(f"Max of elements of A: {A.max()}")
print(f"Min of elements of A: {A.min()}")
print()

# Show the matrix again before modifying it in place.
print("Matrix A")
print(A)
print()

# Augmented assignment with a scalar updates every element of A in place.
print("Matrix A + 1 element wise")
A += 1
print(A)

print()
print("Matrix A * 2 element wise")
A *= 2
print(A)

# Matplotlib 

In [None]:
def styleReset():
    """Overwrite Matplotlib's active rcParams with the values in rcParamsDefault.

    Called at the start of the plotting cells so that a style selected in one
    cell does not carry over into the next one.
    """
    mpl.rcParams.update(mpl.rcParamsDefault)

## A plot generated by matplotlib has the following components:

<span style="border:5px solid black"><img src="./files/anatomy.png" style="width:50%;border:5px solid black"></span>


In [None]:
# Basic line plot with axis labels, minor ticks, a title and a legend.
styleReset()
x = [10,20,30,40,50]
y = [1,2,6,8,16]

# `label` is the text shown for this line by plt.legend() below.
plt.plot(x, y, label='myFunction')

plt.xlabel('x value')
plt.ylabel('y value')
plt.minorticks_on()   # switch to plt.minorticks_off() to hide the minor ticks

plt.title("My plot")

plt.legend()

plt.show()

In [None]:
# Scatter plot with two series that share the same x values.
styleReset()
x = [0,1,2,3,4]
y = [1,2,4,8,16]
# pi * x**2 for every x (x is converted to an array so ** works element-wise).
z = np.pi*(np.array(x)**2)

# Each series gets its own marker shape and legend label.
plt.scatter(x, y, label='2^n', marker="x")
plt.scatter(x, z, label='area comp', marker="o")

plt.xlabel('x value')
plt.ylabel('y value')

plt.title("My Scatter Plot")

plt.legend()

plt.show()

In [None]:
# Three curves on the same axes: x, x**2 and x**3 over 100 samples from 0 to 2.
styleReset()
x = np.linspace(0, 2, 100)

# Every plt.plot call adds one more line to the current axes.
plt.plot(x, x,    label='linear')
plt.plot(x, x**2, label='quadratic')
plt.plot(x, x**3, label='cubic')

plt.xlabel('x value')
plt.ylabel('f(x) value')

plt.title("Function Comparison")

plt.legend()

plt.show()

In [None]:
# Same comparison with only 20 samples, styled through format strings.
styleReset()
x = np.linspace(0, 2, 20)
# The third positional argument is a format string combining color and
# marker/line style: "r--" red dashed line, "bs" blue squares, "g^" green triangles.
plt.plot(x, x,    "r--", label='linear')
plt.plot(x, x**2, "bs",  label='quadratic')
plt.plot(x, x**3, "g^",  label='cubic')

plt.xlabel('x value')
plt.ylabel('f(x) value')

plt.title("Function Comparison")

plt.legend()

plt.show()

In [None]:
# Two side-by-side plots in one figure: sin(x) on the left, cos(x) on the right.
x = np.arange(0, 10, 0.2)
y = np.sin(x)
# 1 row, 2 columns; `ax` is indexed below to reach each of the two axes.
fig, ax = plt.subplots(1,2)


ax[0].plot(x, y, marker='x')
ax[1].plot(x, np.cos(x), marker='o')

plt.show()

In [None]:

# Same two plots as the previous cell, drawn with a list of style sheets applied.
# The styles are selected before the figure is created.
mplstyle.use(['dark_background', 'ggplot', 'fast'])

x = np.arange(0, 10, 0.2)
y = np.sin(x)
fig, ax = plt.subplots(1,2)

ax[0].plot(x, y, marker='x')
ax[1].plot(x, np.cos(x), marker='o')


plt.show()

# Restore the default rcParams so the following cells are not affected.
styleReset()

In [None]:
# Scatter plot whose x, y, color and size values are looked up by key in `data`.
styleReset()
mplstyle.use([ 'ggplot', 'fast'])

data = {'a': np.arange(50),
        'c': np.random.randint(0, 50, 50),
        'd': np.random.randn(50)}

# 'b' is 'a' plus random noise.
data['b'] = data['a'] + 10 * np.random.randn(50)

# Make 'd' non-negative and scale it up before it is used for the `s` argument.
data['d'] = np.abs(data['d']) * 100

plt.scatter('a', 'b', c='c', s='d', data=data)  # c: per-point color values, s: per-point marker sizes
plt.xlabel('entry a')
plt.ylabel('entry b')
plt.show()


In [None]:
# The same categorical data drawn three ways: bar, scatter and line.
styleReset()

names = ['group_a', 'group_b', 'group_c']
values = [1, 10, 100]

plt.figure(1, figsize=(9,3))     # figure number 1, 9 inches wide, 3 inches tall

plt.subplot(131)   #1 row, 3 columns, 1st plot
plt.bar(names, values)
plt.subplot(132)   #1 row, 3 columns, 2nd plot
plt.scatter(names, values)
plt.subplot(133)   #1 row, 3 columns, 3rd plot
plt.plot(names, values)

# Title for the whole figure rather than for a single subplot.
plt.suptitle('Categorical Plotting')
plt.show()


In [None]:
import matplotlib.gridspec as gridspec

# Grid layout: 2 rows x 3 columns, one panel per function.
rows = 2
cols = 3
fig1 = plt.figure(num=1, figsize=(50, 20))

# 200 samples from 0.1 to 100; starting above zero keeps the logarithms finite.
x = np.linspace(0.1, 100, 200)

# Panel titles and the matching function values, kept in the same order.
names = ["sin(x)", "cos(x)", "tan(x)", "log(x)", "log10(x)", "log2(x)"]
y = [np.sin(x), np.cos(x), np.tan(x), np.log(x), np.log10(x), np.log2(x)]

gs = gridspec.GridSpec(rows, cols)

# Fill the grid row by row: panel i lands at (i // cols, i % cols).
ax = []
for i, (panel_title, panel_values) in enumerate(zip(names, y)):
    row, col = divmod(i, cols)
    panel = fig1.add_subplot(gs[row, col])
    panel.set_title(panel_title)
    panel.plot(x, panel_values)
    ax.append(panel)
    # Optional experiment: plt.yscale('symlog')
    #plt.yscale('symlog')

### Mark points on the line.
The `markevery` argument of `plot` allows placing a marker only at selected points of the line instead of at every point.

__markevery__: None or int or (int, int) or slice or List[int] or float or (float, float)
Which markers to plot.

- every=None, every point will be plotted.
- every=N, every N-th marker will be plotted starting with marker 0.
- every=(start, N), every N-th marker, starting at point start, will be plotted.
- every=slice(start, end, N), every N-th marker, starting at point start, up to but not including point end, will be plotted.
- every=[i, j, m, n], only markers at points i, j, m, and n will be plotted.
- every=0.1, (i.e. a float) then markers will be spaced at approximately equal distances along the line; the distance along the line between markers is determined by multiplying the display-coordinate distance of the axes bounding-box diagonal by the value of every.
- every=(0.5, 0.1) (i.e. a length-2 tuple of float), the same functionality as every=0.1 is exhibited but the first marker will be 0.5 multiplied by the display-coordinate-diagonal-distance along the line.



In [None]:
# Shared setup for the three markevery figures (linear, log and polar).
# Relies on `gridspec` having been imported by an earlier cell
# (`import matplotlib.gridspec as gridspec`).
styleReset()

# Each entry is one value passed as `markevery=`; one subplot is drawn per entry.
cases = [
    None,
    8,
    (30, 8),
    [16, 24, 30],
    [0, -1],
    slice(100, 200, 3),
    0.1,
    0.3,
    1.5,
    (0.0, 0.1),
    (0.45, 0.1),
]

# Figure size and grid layout properties.
figsize = (20, 16)
cols = 3
# Ceiling division gives exactly enough rows for all cases;
# `len(cases) // cols + 1` would add an empty row whenever the number of cases
# is an exact multiple of `cols`.
gs = gridspec.GridSpec((len(cases) + cols - 1) // cols, cols)
gs.update(hspace=0.4)

# Data for the cartesian plots. The `delta` offsets keep every x and y value
# strictly positive, which the log-scaled figure needs.
delta = 0.11
x = np.linspace(0, 10 - 2 * delta, 200) + delta
y = np.sin(x) + 1.0 + delta

In [None]:
# Linear axes: one subplot per markevery case, placed row by row on the grid.
fig1 = plt.figure(num=1, figsize=figsize)
ax = []
for i, case in enumerate(cases):
    row, col = divmod(i, cols)
    panel = fig1.add_subplot(gs[row, col])
    panel.set_title(f"markevery={case}")
    # Circle markers of size 4 joined by a solid line; `case` selects which markers appear.
    panel.plot(x, y, 'o', ls='-', ms=4, markevery=case)
    ax.append(panel)

In [None]:
# Log-log axes: the same markevery cases drawn with a dark style.
# Select the style sheets BEFORE creating the figure: rcParams are read when the
# figure and its axes are created, so a style applied afterwards does not reach
# figure-level settings such as the background color.
mplstyle.use(['dark_background', 'ggplot'])

fig2 = plt.figure(num=2, figsize=figsize)
axlog = []

for i, case in enumerate(cases):
    row = (i // cols)
    col = i % cols
    axlog.append(fig2.add_subplot(gs[row, col]))
    axlog[-1].set_title('markevery=%s' % str(case))
    # Both axes are logarithmic; x and y are strictly positive by construction.
    axlog[-1].set_xscale('log')
    axlog[-1].set_yscale('log')
    axlog[-1].plot(x, y, 'o', ls='-', ms=4, markevery=case)



In [None]:
# Polar axes: the same markevery cases drawn on a spiral.
fig3 = plt.figure(num=4, figsize=figsize)

# Data for the polar plots: the angle grows by 2*pi per unit of radius, so
# r in [0, 3] traces three turns.
r = np.linspace(0, 3.0, 200)
theta = 2 * np.pi * r

axpolar = []
for i, case in enumerate(cases):
    row, col = divmod(i, cols)
    polar_axes = fig3.add_subplot(gs[row, col], projection='polar')
    polar_axes.set_title(f"markevery={case}")
    polar_axes.plot(theta, r, 'o', ls='-', ms=4, markevery=case)
    axpolar.append(polar_axes)

plt.show()

In [None]:
# Restore the default rcParams so the styles selected above do not carry over into the cells below.
styleReset()

# SeaBorn
Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing __attractive and informative statistical graphics__.

https://seaborn.pydata.org

In [None]:
#Load fmri (Functional magnetic resonance imaging) dataset
# Load seaborn's "fmri" (functional magnetic resonance imaging) example dataset.
fmri = sns.load_dataset("fmri")

In [None]:
# Display rows 90 through 99 to get a feel for the columns.
fmri[90:100]

## Relationship plots
Statistical analysis is a process of understanding how variables in a dataset relate to each other and how those relationships depend on other variables. 
Visualization can be a core component of this process because, when data are visualized properly, the human visual system can see trends and patterns that indicate a relationship.

In [None]:
# Plotting subsets of data with semantic mappings
# Line plot of signal against timepoint; the "region" column is mapped to hue
# and the "event" column to line style, so each subset is drawn separately.
sns.relplot(x="timepoint", y="signal", hue="region", style="event", kind="line", data=fmri);

## Categorical Plot

In [None]:
#Tips Dataset
# Load seaborn's "tips" example dataset and display its first 10 rows.
tips = sns.load_dataset("tips")
tips[:10]

In [None]:

# total_bill for each day, using catplot's default plot kind.
sns.catplot(x="day", y="total_bill", data=tips);

In [None]:
# Same data drawn with kind="swarm".
sns.catplot(x="day", y="total_bill", kind="swarm", data=tips);

In [None]:
# Swarm plot with the "sex" column mapped to hue.
sns.catplot(x="day", y="total_bill", hue="sex", kind="swarm", data=tips);

In [None]:
# Swarm plot of total_bill by party size, after filtering out the rows where size == 3.
sns.catplot(x="size", y="total_bill", kind="swarm", data=tips.query("size != 3"));

In [None]:
# `order` fixes the left-to-right order of the categories on the x axis.
sns.catplot(x="smoker", y="tip", order=["No", "Yes"], data=tips);

## Distribution Plots

### BoxPlots
This kind of plot shows the three quartile values of the distribution along with extreme values


In [None]:

# Box plot of total_bill for each day.
sns.catplot(x="day", y="total_bill", kind="box", data=tips);

In [None]:
# Box plot of total_bill for each day, with the "smoker" column mapped to hue.
sns.catplot(x="day", y="total_bill", hue="smoker", kind="box", data=tips);

In [None]:
# Bar plot (kind="bar", not a box plot) of total_bill by sex, with the "time" column mapped to hue.
sns.catplot(x="sex", y="total_bill", hue="time", kind="bar", data=tips);

### Plotting univariate distributions

In [None]:
# Distribution of the single "tip" column.
# NOTE(review): distplot is deprecated in newer seaborn releases in favor of
# histplot/displot - confirm against the installed seaborn version.
sns.distplot(tips.tip);

### Plotting bivariate distributions

In [None]:
# Joint plot of tip against total_bill, using jointplot's default kind.
sns.jointplot(x="tip", y="total_bill", data=tips);

In [None]:
# The "white" axes style applies only inside the with-block.
with sns.axes_style("white"):
    # kind="hex" with color "k" (black); the columns are passed directly instead of via `data=`.
    sns.jointplot(x=tips.tip, y=tips.total_bill, kind="hex", color="k");

In [None]:
# Same variables drawn with kind="kde".
sns.jointplot(x="tip", y="total_bill", data=tips, kind="kde");

In [None]:
# Keep the object returned by jointplot so more layers can be added to it.
g = sns.jointplot(x="tip", y="total_bill", data=tips, kind="kde", color="m")
# Overlay the individual observations as white "+" markers on the joint axes.
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
# Make the first collection on the joint axes fully transparent.
# NOTE(review): which artist collections[0] refers to depends on the seaborn version - confirm.
g.ax_joint.collections[0].set_alpha(0)
# Replace the default axis labels with math-mode X and Y.
g.set_axis_labels("$X$", "$Y$");

### Visualizing pairwise relationships in a dataset
#### IRIS Dataset

The iris dataset is a well-known dataset for data analysis and machine learning. https://archive.ics.uci.edu/ml/datasets/iris

The dataset describes several characteristics of Irises (flower) and the type of the corresponding sample.

<img width=30% src="./files/flower-labelled_med.jpeg">

1. sepal length in cm 
2. sepal width in cm 
3. petal length in cm 
4. petal width in cm 
5. class: 
  - Iris Setosa 
  - Iris Versicolour 
  - Iris Virginica

The content looks as follows:


In [None]:
# Load seaborn's "iris" example dataset and display rows 45 through 54.
iris = sns.load_dataset("iris")
iris[45:55]

In [None]:
# Reload the dataset so this cell can run on its own, then draw the pairwise plot grid.
iris = sns.load_dataset("iris")
sns.pairplot(iris);

In [None]:
# Build the grid by hand to choose what is drawn on and off the diagonal.
g = sns.PairGrid(iris)
# Diagonal panels: KDE plot for each diagonal cell.
g.map_diag(sns.kdeplot)
# Off-diagonal panels: KDE plot for each pair of variables.
# NOTE(review): newer seaborn releases spell `n_levels` as `levels` - confirm against the installed version.
g.map_offdiag(sns.kdeplot, n_levels=6);