### Linear model code

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Logistic regression baseline for the 'Revenue Grid' target, followed by a
# KS-based search for the probability cutoff used to produce hard classes.
# NOTE(review): `np`, `bd_train` and `bd_test` are not defined in this cell;
# they are presumably created earlier in the notebook - confirm.
logr = LogisticRegression(class_weight='balanced')

# Create datasets for the model: every column except 'Revenue Grid' is a
# feature, 'Revenue Grid' itself is the label.
feature_train = bd_train.drop('Revenue Grid', axis=1)
label_train = bd_train['Revenue Grid']
feature_test = bd_test.drop('Revenue Grid', axis=1)
label_test = bd_test['Revenue Grid']

# Run the model
logr.fit(feature_train, label_train)

logr.predict(feature_test)  # Hard class predictions.
logr.predict_proba(feature_test)  # This gives probability
logr.classes_  # This gives which probability column belongs to which prediction.

# Get AUC score on the test set from the second probability column
# (the column belonging to logr.classes_[1]).
predicted_probs = logr.predict_proba(feature_test)[:, 1]
roc_auc_score(label_test, predicted_probs)

# Get the cutoff for hard classes: scan 99 evenly spaced thresholds and keep
# the KS statistic (true-positive rate minus false-positive rate) for each.
# The comparisons below assume the label is coded as 0/1 - TODO confirm.
cutoffs = np.linspace(0.01, 0.99, 99)
train_score = logr.predict_proba(feature_train)[:, 1]
real = label_train

KS_all = []

for cutoff in cutoffs:

    predicted = (train_score > cutoff).astype(int)

    TP = ((predicted == 1) & (real == 1)).sum()
    TN = ((predicted == 0) & (real == 0)).sum()
    FP = ((predicted == 1) & (real == 0)).sum()
    FN = ((predicted == 0) & (real == 1)).sum()

    P = TP + FN
    N = TN + FP

    KS = (TP / P) - (FP / N)
    KS_all.append(KS)

# Look up the best cutoff once, after every threshold has been scored.
# KS_all is a plain list, so it must be converted to an array first:
# `list == scalar` evaluates to a single False instead of an element-wise mask.
cutoffs[np.asarray(KS_all) == max(KS_all)]

### Decision tree
#### With permutations and combinations of the tree-related parameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate values for each decision-tree hyper-parameter; the randomized
# search below samples combinations from these lists.
params = dict(
    class_weight=[None, 'balanced'],
    criterion=['entropy', 'gini'],
    max_depth=[None, 5, 10, 15, 20, 30, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
)

clf = DecisionTreeClassifier()

# Try 10 sampled parameter combinations, each scored by ROC AUC with
# 10-fold cross-validation.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=10,
    scoring='roc_auc',
    cv=10,
)
random_search.fit(x_train, y_train)

# Take the best-scoring tree and fit it on the training data.
# NOTE(review): with the search's default refit setting the best estimator is
# already fitted on the full training data, so this second fit looks redundant.
dtree = random_search.best_estimator_
dtree.fit(x_train, y_train)



------------

### Random forest
#### With additional randomness added through feature selection, the number of trees, and the bootstrapping option.

In [None]:
from sklearn import tree
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

clf = RandomForestClassifier()

# Search space: forest-level settings (number of trees, features considered
# per split, bootstrapping) plus the same tree-level settings used for the
# single decision tree.
# NOTE(review): the integer max_features candidates presumably must not exceed
# the number of columns in x_train - confirm.
param_dist = dict(
    n_estimators=[100, 200, 300, 500, 700, 1000],
    max_features=[5, 10, 20, 25, 30, 35],
    bootstrap=[True, False],
    class_weight=[None, 'balanced'],
    criterion=['entropy', 'gini'],
    max_depth=[None, 5, 10, 15, 20, 30, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
)

# Number of random parameter combinations to evaluate.
n_iter_search = 10

# Each sampled combination is scored by ROC AUC with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring='roc_auc',
    cv=5,
)
random_search.fit(x_train, y_train)
random_search.best_estimator_

##Report function
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexable by candidate position (the layout of a
        search object's ``cv_results_``).
    n_top : int, default 3
        Highest rank to print; ranks 1..n_top are reported in order.
        Candidates tied on a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds this rank.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(
                mean_score, std_score))
            chosen_params = results['params'][idx]
            print("Parameters: {0}".format(chosen_params))
            print("")
            
# Show the five best-ranked parameter combinations from the search.
report(random_search.cv_results_, 5)

# Select the best values from results above, they will vary slightly with each run.
# `min_impurity_split` is intentionally not passed: it was deprecated in
# scikit-learn 0.19 and removed in 1.0, so supplying it raises TypeError on
# current releases.
rf = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=50,
    max_features=10,
    max_leaf_nodes=None,
    min_samples_leaf=10,
    min_samples_split=20,
    min_weight_fraction_leaf=0.0,
    n_estimators=300,
    n_jobs=1,
    oob_score=False,
    random_state=None,
    verbose=0,
    warm_start=False,
)

rf.fit(x_train, y_train)

## Feature importance
# Pair each training column with its importance in the fitted forest and list
# the most important features first.
feat_imp_df = pd.DataFrame({'features': x_train.columns, 'importance': rf.feature_importances_})
feat_imp_df.sort_values('importance', ascending=False)




#### Data set ice breaker

1. Check whether any numeric column is really categorical, i.e. the numbers are just symbolic labels rather than a real ordinal relationship between the categories they represent (remember to treat such columns as categorical).

2. Check if any categorical column contains ordinal data and can be converted to numbers ( use appropriate string operations and convert to numbers )

3. Check whether any column was read as categorical only because of a few stray character values but actually contains numeric data (convert these to numeric).

4. Check if data contains missing values , and figure out how they can be imputed ( this can be different for different column, for some business logic might exist , for some simple imputation with median/avg will suffice )

5. See if any columns represent perfectly duplicate information, use any one of them 

6. If any column contains general text data , create tfidf features 

7. Missing value in a categorical column should be treated as any other category 

_______________________________

Once you have taken care of all this , check if the general patterns in the data are as per standard patterns seen in the business ( As a quick data sanity check ).

For example, low credit scores are associated with a higher default rate (so the mean credit score should be lower for defaulters, etc.).

If there is an anomaly , check with business process experts if there can be data integrity issues with what you got .

For numeric columns , density plots are a quick way of sanity check in terms of usual values occurrence frequency and presence of extreme values .

For categorical columns , this can be done with bar charts 

-------------------

Many problems are not of simply predictive nature. For example , question can be what factors affect a certain outcome and how  ( without much emphasis on building a predictive model) ?

You can run a quick random forest anyway, choose the top variables from the variable importance plot, and visualise their behaviour w.r.t. the outcome.

_______________________________

If you need to build a predictive model, start by finding out what the baseline performance is. This could come from a couple of sources:

1. Minimum business expectation in terms of performance (Actionable/Viable) ...... accuracy/mae etc 

2. Performance of earlier best model in production 

Above will help you in discarding solutions which do not fit the minimum performance criterion and keep on moving towards more complex algos or data inclusions . 

It'll be good if you can get a tentative idea of the ceiling for performance (not always possible, though). If you get this, it'll help you realise when to stop.

#### practise Dataframe