# Chapter 5 - Basic Math and Statistics

In [None]:
import numpy as np
from numpy.random import randn

## Segment 1 - Using NumPy to perform arithmetic operations on data

### Creating arrays

In [9]:
# Limit floating-point output to two digits of precision when arrays are displayed.
np.set_printoptions(precision=2)

#### Creating arrays using a list

In [10]:
# Build a one-dimensional array from a flat Python list, then display it.
a = np.array([1,2,3,4,5,6])
a

array([1, 2, 3, 4, 5, 6])

In [11]:
# A list of two equal-length lists produces a two-dimensional (2 x 3) array.
b = np.array([[10,20,20],[40,50,60]])
b

array([[10, 20, 20],
       [40, 50, 60]])

#### Creating arrays via assignment

In [12]:
# Seed the legacy global generator so the six draws below are repeatable,
# then stretch the standard-normal samples by a factor of 36.
np.random.seed(25)
c = np.random.randn(6) * 36
c

array([  8.22,  36.97, -30.23, -21.28, -34.45,  -8.  ])

In [13]:
# Consecutive integers starting at 1; the stop value 35 is excluded, so the array ends at 34.
d = np.arange(1, 35)
d

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])

### Performing arithmetic on arrays

In [16]:
# Scalar multiplication first, then element-wise add / subtract / multiply /
# divide between the two six-element arrays; one printed line per result.
for outcome in (a * 10, c + a, c - a, c * a, c / a):
    print(outcome)

[10 20 30 40 50 60]
[  9.22  38.97 -27.23 -17.28 -29.45  -2.  ]
[  7.22  34.97 -33.23 -25.28 -39.45 -14.  ]
[   8.22   73.94  -90.68  -85.13 -172.24  -48.02]
[  8.22  18.48 -10.08  -5.32  -6.89  -1.33]


In [20]:
### Multiplying matrices and basic linear algebra
# Two 3 x 3 float matrices; the cells that follow multiply them.
aa = np.array([
    [2., 4., 6.],
    [1., 3., 5.],
    [10., 20., 30.],
])
bb = np.array([
    [0., 1., 2.],
    [3., 4., 5.],
    [6., 7., 8.],
])
# Show both matrices with a dashed rule between them.
print(aa, '-------------------', bb, sep='\n')


[[ 2.  4.  6.]
 [ 1.  3.  5.]
 [10. 20. 30.]]
-------------------
[[0. 1. 2.]
 [3. 4. 5.]
 [6. 7. 8.]]


In [22]:
# Element-wise product: each entry of aa times the entry of bb in the same position.
print(np.multiply(aa, bb))

[[  0.   4.  12.]
 [  3.  12.  25.]
 [ 60. 140. 240.]]


In [23]:
# Matrix product (rows of aa against columns of bb), unlike the element-wise `*` above.
aa.dot(bb)

array([[ 48.,  60.,  72.],
       [ 39.,  48.,  57.],
       [240., 300., 360.]])

## Segment 2 - Multiplying matrices and basic linear algebra

In [None]:
# Chapter 5 - Basic Math and Statistics
## Segment 2 - Multiplying matrices and basic linear algebra
import numpy as np
from numpy.random import randn

# Limit floating-point output to two digits of precision.
np.set_printoptions(precision=2)

## Multiplying matrices and basic linear algebra
# A cell renders only its final expression, so the intermediate values are
# printed explicitly; otherwise they would be evaluated and thrown away.
aa = np.array([[2.,4.,6.],[1.,3.,5.],[10.,20.,30.]])
print(aa)
bb = np.array([[0.,1.,2.],[3.,4.,5.],[6.,7.,8.]])
print(bb)

# `*` multiplies entry by entry ...
print(aa*bb)
# ... whereas np.dot performs the matrix product (shown as the cell result).
np.dot(aa,bb)


## Segment 3 - Generating summary statistics using pandas and scipy

In [None]:
# Chapter 5 - Basic Math and Statistics
## Segment 3 - Generating summary statistics using pandas and scipy
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import scipy
from scipy import stats

# NOTE(review): absolute, machine-specific path; point it at your own copy of mtcars.csv.
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/mtcars.csv'

cars = pd.read_csv(address)
cars.columns = ['car_names','mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb']

# A cell renders only its final expression, so every intermediate result is
# printed explicitly.
print(cars.head())

### Looking at summary statistics that describe a variable's numeric values
# `car_names` holds text. Since pandas 2.0 the reductions no longer drop
# non-numeric columns silently, so numeric_only=True is requested explicitly.
print(cars.sum(numeric_only=True))             # column totals
print(cars.sum(axis=1, numeric_only=True))     # row totals
print(cars.median(numeric_only=True))
print(cars.mean(numeric_only=True))
print(cars.max())                              # max is defined for text too, so all columns are kept
mpg = cars.mpg
print(mpg.idxmax())                            # index label of the row with the highest mpg

### Looking at summary statistics that describe variable distribution
print(cars.std(numeric_only=True))
print(cars.var(numeric_only=True))
gear = cars.gear
print(gear.value_counts())                     # how many rows share each gear value

# Final expression: rendered as the cell result.
cars.describe()


## Segment 4 - Summarizing categorical data using pandas

In [None]:
# Chapter 5 - Basic Math and Statistics
## Segment 4 - Summarizing categorical data using pandas
import numpy as np
import pandas as pd

### The basics
# NOTE(review): absolute, machine-specific path; point it at your own copy of mtcars.csv.
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/mtcars.csv'
cars = pd.read_csv(address)

cars.columns = ['car_names','mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb']
# Use the car names as row labels.
cars.index = cars.car_names

# A cell renders only its final expression, so every intermediate result is
# printed explicitly.
print(cars.head(15))

carb = cars.carb
print(carb.value_counts())

# Keep only the columns treated as categorical in this segment.
cars_cat = cars[['cyl','vs','am','gear','carb']]
print(cars_cat.head())

# Summary statistics of the categorical columns, computed per gear value.
gears_group = cars_cat.groupby('gear')
print(gears_group.describe())

### Transforming variables to categorical data type
cars['group'] = pd.Series(cars.gear, dtype="category")
print(cars['group'].dtypes)
print(cars['group'].value_counts())

### Describing categorical data with crosstabs
# Final expression: rendered as the cell result (counts of am versus gear).
pd.crosstab(cars['am'], cars['gear'])


## Segment 5 - Starting with parametric methods in pandas and scipy

In [None]:
# Chapter 5 - Basic Math and Statistics

## Segment 5 - Starting with parametric methods in pandas and scipy
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sb
from pylab import rcParams

import scipy
from scipy.stats.stats import pearsonr
%matplotlib inline
rcParams['figure.figsize'] = 8,4
plt.style.use('seaborn-whitegrid')
### The Pearson Correlation
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/mtcars.csv'

cars = pd.read_csv(address)
cars.columns = ['car_names','mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb']
sb.pairplot(cars)
X = cars[['mpg', 'hp', 'qsec', 'wt']]
sb.pairplot(X)
### Using scipy to calculate the Pearson correlation coefficient
mpg = cars['mpg']
hp = cars['hp']
qsec = cars['qsec']
wt = cars['wt']

pearsonr_coefficient, p_value = pearsonr(mpg, hp)
print('PeasonR Correlation Coefficient %0.3f'% (pearsonr_coefficient))
pearsonr_coefficient, p_value = pearsonr(mpg, qsec)
print('PeasonR Correlation Coefficient %0.3f'% (pearsonr_coefficient))
pearsonr_coefficient, p_value = pearsonr(mpg, wt)
print('PeasonR Correlation Coefficient %0.3f'% (pearsonr_coefficient))
### Using pandas to calculate the Pearson correlation coefficient
corr = X.corr()
corr
### Using Seaborn to visualize the Pearson correlation coefficient
sb.heatmap(corr, xticklabels=corr.columns.values, yticklabels= corr.columns.values)


## Segment 6 - Delving into non-parametric methods using pandas and scipy

In [None]:
# Chapter 5 - Basic Math and Statistics
## Segment 6 - Delving into non-parametric methods using pandas and scipy
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb
from pylab import rcParams

import scipy
from scipy.stats import spearmanr
%matplotlib inline
rcParams['figure.figsize'] = 14, 7
plt.style.use('seaborn-whitegrid')
### The Spearman Rank Correlation
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/mtcars.csv'

cars = pd.read_csv(address)
cars.columns = ['car_names','mpg','cyl','disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
cars.head()
sb.pairplot(cars)
X = cars[['cyl', 'vs', 'am', 'gear']]
sb.pairplot(X)
cyl = cars['cyl']
vs = cars['vs']
am = cars['am']
gear = cars['gear']

spearmanr_coefficient, p_value = spearmanr(cyl, vs)

print('Spearman Rank Correlation Coefficient %0.3f' % (spearmanr_coefficient))
spearmanr_coefficient, p_value = spearmanr(cyl, am)

print('Spearman Rank Correlation Coefficient %0.3f' % (spearmanr_coefficient))
spearmanr_coefficient, p_value = spearmanr(cyl, gear)

print('Spearman Rank Correlation Coefficient %0.3f' % (spearmanr_coefficient))
### Chi-square test for independence
table = pd.crosstab(cyl, am)

from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(table.values)
print ('Chi-square statistic %0.3f p_value %0.3f' % (chi2, p))
table = pd.crosstab(cyl, vs)

from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(table.values)
print ('Chi-square statistic %0.3f p_value %0.3f' % (chi2, p))
table = pd.crosstab(cyl, gear)

from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(table.values)
print ('Chi-square statistic %0.3f p_value %0.3f' % (chi2, p))


## Segment 7 - Transforming dataset distributions

In [None]:
# Chapter 5 - Basic Math and Statistics
## Segment 7 - Transforming dataset distributions
import numpy as np
import pandas as pd
import scipy

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
%matplotlib inline
rcParams['figure.figsize'] = 5, 4
sb.set_style('whitegrid')
### Normalizing and transforming features with MinMaxScalar() and fit_transform()
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/mtcars.csv'

cars = pd.read_csv(address)
cars.columns = ['car_names','mpg','cyl','disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
mpg = cars.mpg
plt.plot(mpg)
cars[['mpg']].describe()
mpg_matrix = mpg.values.reshape(-1,1)

scaled = preprocessing.MinMaxScaler()

scaled_mpg = scaled.fit_transform(mpg_matrix)
plt.plot(scaled_mpg)
scaled = preprocessing.MinMaxScaler(feature_range=(0,10))

scaled_mpg = scaled.fit_transform(mpg_matrix)
plt.plot(scaled_mpg)
### Using scale() to scale your features
standardized_mpg = scale(mpg, axis=0, with_mean=False, with_std=False)
plt.plot(standardized_mpg)
standardized_mpg = scale(mpg)
plt.plot(standardized_mpg)
http://goo.gl/tuEWkD

## Segment 8 - Extreme value analysis using univariate methods

In [None]:
# Chapter 5 - Outlier Analysis
## Segment 8 - Extreme value analysis using univariate methods
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 5,4
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/iris.data.csv'
df = pd.read_csv(filepath_or_buffer=address, header=None, sep=',')

df.columns=['Sepal Length','Sepal Width','Petal Length','Petal Width', 'Species']
X = df.iloc[:,0:4].values
y = df.iloc[:,4].values
df[:5]
### Identifying outliers from Tukey boxplots
df.boxplot(return_type='dict')
plt.plot()
Sepal_Width = X[:,1]
iris_outliers = (Sepal_Width > 4)
df[iris_outliers]
Sepal_Width = X[:,1]
iris_outliers = (Sepal_Width < 2.05)
df[iris_outliers]
### Applying Tukey outlier labeling
pd.options.display.float_format = '{:.1f}'.format
X_df = pd.DataFrame(X)
print(X_df.describe())


## Segment 9 - 

In [None]:
# Chapter 5 - Outlier Analysis
## Segment 8 - Extreme value analysis using univariate methods
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 5,4
address = 'C:/Users/Lillian/Desktop/ExerciseFiles/Data/iris.data.csv'
df = pd.read_csv(filepath_or_buffer=address, header=None, sep=',')

df.columns=['Sepal Length','Sepal Width','Petal Length','Petal Width', 'Species']
X = df.iloc[:,0:4].values
y = df.iloc[:,4].values
df[:5]
### Identifying outliers from Tukey boxplots
df.boxplot(return_type='dict')
plt.plot()
Sepal_Width = X[:,1]
iris_outliers = (Sepal_Width > 4)
df[iris_outliers]
Sepal_Width = X[:,1]
iris_outliers = (Sepal_Width < 2.05)
df[iris_outliers]
### Applying Tukey outlier labeling
pd.options.display.float_format = '{:.1f}'.format
X_df = pd.DataFrame(X)
print(X_df.describe())
