In [57]:
from pydataset import data
import io
import numpy as np
from helper_functions import df_print_metadata
from helper_functions import df_peek
from helper_functions import df_print_summary
from helper_functions import series_is_whole_nums
from helper_functions import df_print_missing_vals
from helper_functions import df_percent_missing_vals

from helper_functions import evaluate_model_train
from helper_functions import df_print_r_and_p_values

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('classic')

# Modeling
import statsmodels.api as sm

from scipy.stats import pearsonr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from sklearn.feature_selection import f_regression

### Import Data

In [2]:
df = data('swiss')

In [4]:
df.columns

Index(['Fertility', 'Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [8]:
df.Catholic.describe()

count     47.00000
mean      41.14383
std       41.70485
min        2.15000
25%        5.19500
50%       15.14000
75%       93.12500
max      100.00000
Name: Catholic, dtype: float64

### Prepare Data

In [17]:
df['is_catholic']= df.Catholic > 50.0

In [30]:
df['is_catholic']= df['is_catholic'].astype(int)

In [34]:
df['is_catholic'] = np.where(df['is_catholic'], 'Catholic', 'Not Catholic')


In [35]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,22.2,Not Catholic
Delemont,83.1,45.1,6,9,22.2,Catholic
Franches-Mnt,92.5,39.7,5,5,20.2,Catholic
Moutier,85.8,36.5,12,7,20.3,Not Catholic
Neuveville,76.9,43.5,17,15,20.6,Not Catholic


In [21]:
df = df.drop(columns=['Catholic'])

In [36]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,22.2,Not Catholic
Delemont,83.1,45.1,6,9,22.2,Catholic
Franches-Mnt,92.5,39.7,5,5,20.2,Catholic
Moutier,85.8,36.5,12,7,20.3,Not Catholic
Neuveville,76.9,43.5,17,15,20.6,Not Catholic


In [37]:
df.describe()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality
count,47.0,47.0,47.0,47.0,47.0
mean,70.142553,50.659574,16.489362,10.978723,19.942553
std,12.491697,22.711218,7.977883,9.615407,2.912697
min,35.0,1.2,3.0,1.0,10.8
25%,64.7,35.9,12.0,6.0,18.15
50%,70.4,54.1,16.0,8.0,20.0
75%,78.45,67.65,22.0,12.0,21.7
max,92.5,89.7,37.0,53.0,26.6


### Modeling

### Train Test Split

In [28]:
target = df.is_catholic

In [29]:
x_vars = df.drop(columns=['is_catholic'])

In [38]:
X_train, X_test, y_train, y_test = train_test_split(x_vars, target, train_size=.80, random_state=123)

In [39]:
train = pd.concat([X_train, y_train], axis=1)
print("Train columns:  %s" % list(train.columns))
print("Train dimensions (rows, columns):", train.shape)

Train columns:  ['Fertility', 'Agriculture', 'Examination', 'Education', 'Infant.Mortality', 'is_catholic']
Train dimensions (rows, columns): (37, 6)


In [40]:
test = pd.concat([X_test, y_test], axis=1)
print("Test columns:  %s" % list(test.columns))
print("Test dimensions (rows, columns):", test.shape)

Test columns:  ['Fertility', 'Agriculture', 'Examination', 'Education', 'Infant.Mortality', 'is_catholic']
Test dimensions (rows, columns): (10, 6)


In [43]:
y_train

Broye            True
Paysd'enhaut    False
Avenches        False
Aubonne         False
Oron            False
Sierre           True
Moudon          False
Rolle           False
Echallens       False
La Chauxdfnd    False
Glane            True
Morges          False
Rive Gauche      True
Moutier         False
Orbe            False
Neuchatel       False
Conthey          True
Grandson        False
Courtelary      False
Neuveville      False
Yverdon         False
Cossonay        False
Le Locle        False
Payerne         False
St Maurice       True
Sion             True
Sarine           True
Herens           True
Martigwy         True
Nyone           False
Lavaux          False
Lausanne        False
Boudry          False
Monthey          True
Vevey           False
Franches-Mnt     True
Rive Droite      True
Name: is_catholic, dtype: bool

### Decision Tree Model

In [42]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)

In [44]:
clf.fit(X_train[['Education','Fertility']], y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [46]:
y_pred = clf.predict(X_train[['Education','Fertility']])
y_pred[0:5]

array([ True, False, False, False, False])

In [47]:
y_pred_proba = clf.predict_proba(X_train[['Education','Fertility']])
y_pred_proba

array([[0.33333333, 0.66666667],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [0.84615385, 0.15384615],
       [0.        , 1.        ],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.84615385, 0.15384615],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [0.84615385, 0.15384615],
       [0.33333333, 0.66666667],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.

In [48]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train[['Education','Fertility']], y_train)))

Accuracy of Decision Tree classifier on training set: 0.89


In [49]:
confusion_matrix(y_train, y_pred)

array([[22,  2],
       [ 2, 11]])

In [50]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

       False       0.92      0.92      0.92        24
        True       0.85      0.85      0.85        13

   micro avg       0.89      0.89      0.89        37
   macro avg       0.88      0.88      0.88        37
weighted avg       0.89      0.89      0.89        37



### Linear Regression Model

In [51]:
lm1 = LinearRegression()
print(lm1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [52]:
lm1.fit(X_train[['Agriculture', 'Examination']], y_train)
print(lm1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [53]:
lm1_y_intercept = lm1.intercept_
print(lm1_y_intercept)

lm1_coefficients = lm1.coef_
print(lm1_coefficients)

0.8101211433715313
[ 0.00146734 -0.03320262]


In [54]:
print('Multivariate - final_exam = b + m1 * agriculture + m2 * catholic')
print('    y-intercept  (b): %.2f' % lm1_y_intercept)
print('    coefficient (m1): %.2f' % lm1_coefficients[0])
print('    coefficient (m2): %.2f' % lm1_coefficients[1])


Multivariate - final_exam = b + m1 * agriculture + m2 * catholic
    y-intercept  (b): 0.81
    coefficient (m1): 0.00
    coefficient (m2): -0.03


In [55]:
y_pred_lm1 = lm1.predict(X_train[['Agriculture', 'Examination']])
y_pred_lm1

array([ 0.3818866 ,  0.70408164,  0.26833898,  0.44433002,  0.51616444,
        0.83465043,  0.42613498,  0.36809358,  0.31900298, -0.1414564 ,
        0.44477023,  0.1674105 ,  0.12030881,  0.46524766,  0.2254519 ,
       -0.32614545,  0.83655798,  0.29556619,  0.33702661,  0.30950594,
        0.38471524,  0.18135025,  0.10416805,  0.43053701,  0.62266882,
        0.47107634,  0.34520304,  0.77572863,  0.52643583,  0.15435115,
        0.28638729, -0.02468062,  0.00319888,  0.6729333 ,  0.01938033,
        0.70236152,  0.34725732])

In [56]:
y_label = lm1_y_intercept
x_label = lm1_coefficients[0]

print("Model Evaluation on TRAIN Data")
mse = mean_squared_error(y_train, y_pred_lm1)
print(f"\tMSE: {mse:.3f}")

mae = median_absolute_error(y_train, y_pred_lm1)
print(f"\tMAE: {mae:.3f}")

r2 = r2_score(y_train, y_pred_lm1)
print(
        f"\t{r2:.2%} of the variance in {y_label} can be explained by {x_label}."
    )
print()

print("P-VALUE")
f_vals, p_vals = f_regression(X_train, y_pred_lm1)
print(f"\tTrain: {p_vals[0]:.3}")
print()


Model Evaluation on TRAIN Data
	MSE: 0.160
	MAE: 0.326
	29.82% of the variance in 0.8101211433715313 can be explained by 0.0014673423361143466.

P-VALUE
	Train: 0.000743



### KNN Model

In [58]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')


In [59]:
knn.fit(X_train[['Education','Fertility']], y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [60]:
y_pred = knn.predict(X_train[['Education','Fertility']])


In [61]:
y_pred_proba = knn.predict_proba(X_train[['Education','Fertility']])


In [62]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train[['Education','Fertility']], y_train)))

Accuracy of KNN classifier on training set: 0.81


In [63]:
print(confusion_matrix(y_train, y_pred))


[[22  2]
 [ 5  8]]


In [64]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

       False       0.81      0.92      0.86        24
        True       0.80      0.62      0.70        13

   micro avg       0.81      0.81      0.81        37
   macro avg       0.81      0.77      0.78        37
weighted avg       0.81      0.81      0.80        37



## Test on Test Data Set

In [66]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)

In [67]:
clf.fit(X_train[['Education','Fertility']], y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [68]:
y_pred = clf.predict(X_train[['Education','Fertility']])
y_pred[0:5]

array([ True, False, False, False, False])

In [69]:
y_pred_proba = clf.predict_proba(X_train[['Education','Fertility']])
y_pred_proba

array([[0.33333333, 0.66666667],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [0.84615385, 0.15384615],
       [0.        , 1.        ],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.84615385, 0.15384615],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.84615385, 0.15384615],
       [0.84615385, 0.15384615],
       [0.33333333, 0.66666667],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [0.84615385, 0.15384615],
       [1.        , 0.        ],
       [0.

In [75]:
X_test.shape


(10, 5)

In [76]:
len(y_test)

10

In [70]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

ValueError: Number of features of the model must match the input. Model n_features is 2 and input n_features is 5 

In [77]:
actual = y_test
yhat = dtree.predict(X_test[['Education','Fertility']])

print(accuracy_score(actual, yhat))
print(classification_report(actual, yhat))

NameError: name 'dtree' is not defined