# Prediction

In [1]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *

from sklearn.linear_model import LogisticRegression # Logistic Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv
from sklearn.model_selection import cross_val_score # cross validation metrics
from sklearn.model_selection import cross_val_predict # cross validation metrics

## Framework
1. Model
2. Fit
3. Predict

In [2]:
# data
fashionBIG = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionBIG.csv")
fashionBIG.head()

Unnamed: 0,age,income,months_subbed,upgrade
0,22,55.89,14,0
1,32,86.03,57,0
2,38,49.22,37,1
3,14,92.71,51,1
4,33,94.06,37,0


In [3]:
predictors = ["age", "income", "months_subbed"]

X_train, X_test, y_train, y_test = train_test_split(fashionBIG[predictors], fashionBIG["upgrade"], test_size=0.2)
X_train.head()

zscore = StandardScaler()
zscore.fit(X_train)
Xz_train = zscore.transform(X_train)
Xz_test = zscore.transform(X_test)

In [4]:
myLogit = LogisticRegression(penalty = "none" ) #create

SyntaxError: invalid syntax (<ipython-input-4-886f3b7af015>, line 1)

In [6]:
myLogit.fit(Xz_train,y_train) #fit

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
predictedVals = myLogit.predict(Xz_test) #predict

In [8]:
accuracy_score(y_test,predictedVals)

0.545

In [9]:
confusion_matrix(y_test,predictedVals)

array([[21, 65],
       [26, 88]])

## LR Coef interpretation


In [22]:
coef = pd.DataFrame({"Coefs": myLogit.coef_[0], "Names": predictors})
coef = coef.append({"Coefs": myLogit.intercept_[0], "Names": "intercept"}, ignore_index = True)
coef

Unnamed: 0,Coefs,Names
0,0.4428,age
1,-0.026811,income
2,-0.031354,months_subbed
3,0.342912,intercept


In [23]:
coef["Odds Coefs"] = np.exp(coef["Coefs"])
coef

Unnamed: 0,Coefs,Names,Odds Coefs
0,0.4428,age,1.557061
1,-0.026811,income,0.973545
2,-0.031354,months_subbed,0.969132
3,0.342912,intercept,1.409045


## LR different thresholds


In [25]:
fashionNEW = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionNEW.csv")

Xnew = fashionNEW.iloc[:,0:3]
Xnewz = zscore.transform(Xnew)

In [26]:
Ypred_prob = myLogit.predict_proba(Xnewz)
Ypred_prob[1:10]

array([[0.32937155, 0.67062845],
       [0.51210468, 0.48789532],
       [0.51056941, 0.48943059],
       [0.40192179, 0.59807821],
       [0.43966231, 0.56033769],
       [0.49023469, 0.50976531],
       [0.40823083, 0.59176917],
       [0.35609815, 0.64390185],
       [0.61418534, 0.38581466]])

In [27]:
Ypred_prob1 = Ypred_prob[:, 1]
Ypred_prob1[1:100]

array([0.67062845, 0.48789532, 0.48943059, 0.59807821, 0.56033769,
       0.50976531, 0.59176917, 0.64390185, 0.38581466, 0.62833486,
       0.61288313, 0.34006166, 0.55541896, 0.49939229, 0.51752903,
       0.35290117, 0.56938308, 0.74891502, 0.57917998, 0.57449298,
       0.56753057, 0.62876465, 0.61784645, 0.42200799, 0.65472804,
       0.6076984 , 0.5189158 , 0.60852446, 0.56996182, 0.57868769,
       0.51314945, 0.44388772, 0.58785995, 0.44875066, 0.34425256,
       0.77133662, 0.66442165, 0.6285264 , 0.33139596, 0.56051804,
       0.73311748, 0.5940597 , 0.73723444, 0.53206644, 0.60974191,
       0.60846013, 0.70682223, 0.64986057, 0.56753535, 0.52552016,
       0.64461914, 0.65268661, 0.66045198, 0.61812289, 0.40624442,
       0.56100277, 0.72320559, 0.65895541, 0.53417609, 0.58730398,
       0.69607092, 0.59917861, 0.62330723, 0.51226263, 0.60271162,
       0.68029245, 0.66230753, 0.74995844, 0.59699854, 0.48230334,
       0.73003039, 0.33861372, 0.46751397, 0.45541736, 0.73910

In [28]:
thresh = 0.3

Ypred_prob1_thresh = (Ypred_prob1 > thresh) * 1
Ypred_prob1_thresh[1:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [29]:
accuracy_score(fashionNEW["upgrade"], Ypred_prob1_thresh)

0.603

## LR with Cross Validation

In [30]:
# Kfold

X = fashionBIG[["age","income","months_subbed"]]
y = fashionBIG["upgrade"]

# create k-fold object
kf = KFold(n_splits = 5)
kf.split(X)

lr = LogisticRegression() #create model

acc = [] #create empty list to store accuracy for each fold

In [31]:
# Use a for loop to loop through each fold and train a model, then add the accuracy to acc.

for train_indices, test_indices in kf.split(X):
    # Get your train/test for this fold
    X_train = X.iloc[train_indices]
    X_test  = X.iloc[test_indices]
    y_train = y[train_indices]
    y_test  = y[test_indices]
    
    # model
    model = lr.fit(X_train, y_train)
    # record accuracy
    acc.append(accuracy_score(y_test, model.predict(X_test)))
    
#print overall acc
print(acc)
np.mean(acc)

[0.58, 0.605, 0.54, 0.565, 0.645]


0.587

## Regularization

In [33]:
# Default Regularization
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

myLogit = LogisticRegression() #create
myLogit.fit(Xz_train,y_train) #fit

print(myLogit.coef_)
print(myLogit.intercept_)

[[-0.03929939  0.02153877  0.03225226]]
[0.27186273]


In [34]:
myLogit2 = LogisticRegression(penalty = "none") #create
myLogit2.fit(Xz_train,y_train) #fit

print(myLogit2.coef_)
print(myLogit2.intercept_)

[[-0.03950742  0.02165258  0.0324128 ]]
[0.27186484]
