# Chapter 13: Introduction to Modeling Libraries in Python

In [68]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy


In [53]:
# !pip3 install patsy
# !pip3 install statsmodels # This install scipy as well
# !pip3 install sklearn

## 13.1 Interfacing Between pandas and Model Code

In [54]:
# numpy array main entry point into modeling code
# use pd. values to get array from df
df = pd.DataFrame(np.random.randn(3,3),
                 columns=list('xyz'))
print(df)
print()
print(df.values)
print()

# if df contains non-numeric column, then returns python object instead of array
# ^actually I think this is wrong in the book?
df2 = df.assign(cat = list('cat'))
print(df2)
print()
print(df2.values)
print()
# Can work after calling .loc()/.iloc() for subset of columns
print(df2.loc[:, ['x', 'y']].values)

          x         y         z
0 -0.676378 -1.060422  1.052266
1 -1.008403 -0.507090  0.884576
2  0.371285  0.247642 -0.236017

[[-0.67637811 -1.06042178  1.05226641]
 [-1.00840329 -0.50709034  0.88457587]
 [ 0.37128546  0.24764216 -0.23601696]]

          x         y         z cat
0 -0.676378 -1.060422  1.052266   c
1 -1.008403 -0.507090  0.884576   a
2  0.371285  0.247642 -0.236017   t

[[-0.6763781063165866 -1.0604217791063804 1.0522664130081765 'c']
 [-1.0084032909085718 -0.5070903429512751 0.8845758699720678 'a']
 [0.37128545818383824 0.24764215946110685 -0.23601696290219076 't']]

[[-0.67637811 -1.06042178]
 [-1.00840329 -0.50709034]
 [ 0.37128546  0.24764216]]


In [55]:
# More crap about dummies
df3 = pd.concat([df2,df2]).reset_index()
df3['cat'] = df3['cat'].astype('category')
print(df3)
print(pd.get_dummies(df3.cat, prefix='category'))

   index         x         y         z cat
0      0 -0.676378 -1.060422  1.052266   c
1      1 -1.008403 -0.507090  0.884576   a
2      2  0.371285  0.247642 -0.236017   t
3      0 -0.676378 -1.060422  1.052266   c
4      1 -1.008403 -0.507090  0.884576   a
5      2  0.371285  0.247642 -0.236017   t
   category_a  category_c  category_t
0           0           1           0
1           1           0           0
2           0           0           1
3           0           1           0
4           1           0           0
5           0           0           1


## 13.2 Create Model Descriptions with Patsy

In [81]:
# Creates a design matrix for a linear model; this is how you calculate best fit, look more at later
df4 = df3.rename({'x' : 'x0', 'y' : 'x1', 'z' : 'y'}, axis=1).drop('cat', axis=1)
print(df4)
y, X = patsy.dmatrices('y ~ x0 + x1', df4)
# y and X can be passed to modeling algorithms
m, c, _ = np.linalg.lstsq(X, y)[0]
X.design_info.column_names

   index        x0        x1         y
0      0 -0.676378 -1.060422  1.052266
1      1 -1.008403 -0.507090  0.884576
2      2  0.371285  0.247642 -0.236017
3      0 -0.676378 -1.060422  1.052266
4      1 -1.008403 -0.507090  0.884576
5      2  0.371285  0.247642 -0.236017


  m, c, _ = np.linalg.lstsq(X, y)[0]


['Intercept', 'x0', 'x1']

## 13.3 Introduction to statsmodels

In [87]:
# Can generate basic linear models, ANOVA, not Bayesian or ML technique
# Generally need to know more before useful

array([-0.87495658,  0.31120191,  1.24030454, -1.12004342, -0.3884077 ,
        0.93814285, -0.38924335, -0.03901993, -0.58522164, -0.78158158,
        0.78160203,  0.10224077, -0.18817524,  0.32067118,  0.67286008,
        0.97329704,  0.46005419,  0.47053133, -0.38710177,  0.61971866,
        0.29706809,  0.66270252, -0.99231237,  1.6971995 , -0.31356914,
       -1.09448183,  0.98947728, -0.24269465,  0.53419278, -0.41234328,
        0.06684822, -1.11210295, -0.91731703, -0.94097966,  0.0942037 ,
       -1.90721074, -0.22879527, -0.90579541,  0.12145763,  0.70936227,
        0.80570331, -1.34562689, -0.88520595,  0.72661909, -2.08314237,
        1.66335156,  1.71551676, -0.40011703,  2.11626804,  0.65059548,
       -0.34345205, -0.32895591,  0.38350458,  0.3993298 ,  0.08543497,
        0.17092936,  0.08289908, -0.79115884, -0.26846622, -0.09880826,
        0.46880687, -0.9679511 ,  0.89040802, -0.17127164,  0.3465699 ,
       -0.34897462, -1.37184923,  0.96717264, -0.72159782,  1.73

## 13.4  Introduction to scikit-learn

In [None]:
# Can be used for predictive models, unsupervised, supervised etc

In [None]:
train = (pd
         .read_html('https://github.com/wesm/pydata-book/blob/3rd-edition/datasets/titanic/train.csv')[0]
         .drop('Unnamed: 0', axis=1)
        )

train = (pd
         .read_html('https://github.com/wesm/pydata-book/blob/3rd-edition/datasets/titanic/test.csv')[0]
         .drop('Unnamed: 0', axis=1)
        )



In [99]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
