In [1]:
# 1. DATA RETRIEVAL --> Use panda framework to read csv 
# 2. FEATURE EXTRACTION & ENGINEERING --> EXTRACT training_features and outcome_labels. 
#    FROM training_features --> a) numeric_features + b) categorical_features
#    a) numeric_features --> Use skicit-learning to zero-median, normalize features
#    b) categorical_features --> Use pd.getdummies
# 3. Modeling : Use logistic regression 
#    http://faculty.cas.usf.edu/mbrannick/regression/Logistic.html
# 4. Model evaluation : 



# Problems :
# Mathematical reprenstation of logistic regression

In [2]:
import pandas as pd

data = pd.read_csv("student_records.csv")

data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [3]:
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']

training_features = data[feature_names]
outcome_label = data['Recommend']

training_features
outcome_label

0    Yes
1    Yes
2     No
3     No
4     No
5    Yes
6     No
7     No
Name: Recommend, dtype: object

In [4]:
# Show numerical features
numerical_feature_names = ['ResearchScore', 'ProjectScore']
numerical_features = training_features[numerical_feature_names]

numerical_features


# Show categorical features 
categorical_feature_names = ['OverallGrade', 'Obedient']
categorical_features = training_features[categorical_feature_names]

categorical_features

Unnamed: 0,OverallGrade,Obedient
0,A,Y
1,C,N
2,F,N
3,B,Y
4,E,N
5,A,Y
6,B,Y
7,C,Y


In [5]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit(numerical_features)

training_features[numerical_feature_names] = ss.transform(numerical_features)

training_features



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


In [6]:
training_features = pd.get_dummies(training_features, columns=categorical_feature_names)

training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [7]:
# Use logistic regression to predict

from sklearn.linear_model import LogisticRegression
import numpy as np

lr = LogisticRegression()
model = lr.fit(training_features, np.array(outcome_label))

model


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# Model evaluation on training set

pred_labels = model.predict(training_features)
actual_label = outcome_label

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

accuracy_score(actual_label, pred_labels) * 100

print(classification_report(actual_label, pred_labels))


             precision    recall  f1-score   support

         No       1.00      1.00      1.00         5
        Yes       1.00      1.00      1.00         3

avg / total       1.00      1.00      1.00         8



In [9]:
# Model deployment 

from sklearn.externals import joblib
import os

if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler')

joblib.dump(model, r'Model/model.pickle')
joblib.dump(ss, r'Scaler/scaler.pickle')



['Scaler/scaler.pickle']

In [10]:
# Load model and scaler on server

model = joblib.load(r'Model/model.pickle')
scaler = joblib.load(r'Scaler/scaler.pickle')

# Load new data frame
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F',
                   'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20}, 
                        {'Name': 'Thomas', 'OverallGrade': 'A',
                   'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])

new_data = new_data[['Name', 'OverallGrade', 'Obedient',
                              'ResearchScore', 'ProjectScore']]

new_data




Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


In [16]:
# Predict

prediction_features = new_data[feature_names]

prediction_features[numerical_feature_names] = scaler.transform(prediction_features[numerical_feature_names])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
prediction_features = pd.get_dummies(prediction_features, columns=categorical_feature_names)

prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [18]:
# Add missing categorical feature columns 

current_categorical_feature_names = set(prediction_features.columns) - set(numerical_feature_names)

current_categorical_feature_names

{'Obedient_N', 'Obedient_Y', 'OverallGrade_A', 'OverallGrade_F'}

In [19]:
print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [24]:
categorical_engineered_features = set(training_features.columns) - set(numerical_features)

categorical_engineered_features

missing_features = set(categorical_engineered_features) - set(current_categorical_feature_names)

missing_features

for element in missing_features:
    prediction_features[element] = [0] * len(prediction_features)
    
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_E,OverallGrade_B,OverallGrade_C
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


In [25]:
predictions = model.predict(prediction_features)

new_data['Recommend'] = predictions

new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes
