In [1]:
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the cleaned 2018 crime records. low_memory=False turns off pandas'
# chunked parsing, which can otherwise produce mixed-type columns.
crime_csv_path = '../data/processed/crime_2018_cleaned.csv'
crime_2018_cleaned = pd.read_csv(crime_csv_path, low_memory=False)

In [3]:
# Target vector: the 'Arrest' column. Predictor matrix: every other column,
# converted to a plain ndarray (column names are not carried over).
y = crime_2018_cleaned['Arrest']
X = np.array(crime_2018_cleaned.drop(columns=['Arrest']))

In [4]:
# Class balance of the target: per-label row counts, followed by the ratio of
# label-0 rows to label-1 rows.
arrest_counts = crime_2018_cleaned['Arrest'].value_counts()
print(arrest_counts)
print(arrest_counts[0] / arrest_counts[1])

0    214170
1     53393
Name: Arrest, dtype: int64
4.011199970033525


In [5]:
# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fitted on the full matrix before the
# train / cross-validation / test split, so held-out rows influence the
# per-column min and max. Consider fitting on the training portion only.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [6]:
# Two-stage hold-out: 20% of all rows become the test set, then 20% of the
# remaining rows become the cross-validation set. Both splits use seed 1.
holdout_options = {'test_size': 0.2, 'random_state': 1}
X_1, X_test, y_1, y_test = train_test_split(X, y, **holdout_options)
X_tr, X_cv, y_tr, y_cv = train_test_split(X_1, y_1, **holdout_options)

In [None]:
# Grid-search the inverse regularisation strength C of an L1-penalised,
# class-weighted logistic regression.
#
# For every candidate C the model is fitted on (X_tr, y_tr) and scored by
# ROC AUC on both the training split and the cross-validation split.
# Results:
#   roc_tr, roc_cv  - per-candidate AUC lists, aligned with tuned_parameters
#   C_best / C1     - candidate with the highest cross-validation AUC
#                     (the first one wins on ties)
#   max_auc_score / auc1 - that highest cross-validation AUC
roc_tr = []
roc_cv = []
tuned_parameters = [10**-4, 10**-2, 10**0, 10**2, 10**4]
for c_value in tuned_parameters:
    # The 'l1' penalty is rejected by the default 'lbfgs' solver, so a solver
    # that supports it is named explicitly. random_state makes the liblinear
    # fit repeatable across runs.
    lr = LogisticRegression(C=c_value, penalty='l1', solver='liblinear',
                            class_weight='balanced', random_state=1)
    lr.fit(X_tr, y_tr)

    # Probability of the positive class on the cross-validation split.
    pred_cv = lr.predict_proba(X_cv)[:, 1]
    roc_cv.append(roc_auc_score(y_cv, pred_cv))

    # Same on the training split, kept to compare against roc_cv.
    pred_tr = lr.predict_proba(X_tr)[:, 1]
    roc_tr.append(roc_auc_score(y_tr, pred_tr))

# Pick the winner from the scores already computed instead of re-scoring
# inside the loop; argmax returns the first maximum.
best_idx = int(np.argmax(roc_cv))
C_best = tuned_parameters[best_idx]
max_auc_score = roc_cv[best_idx]

print(C_best)
print(max_auc_score)
C1 = C_best
auc1 = max_auc_score



In [None]:
# 5-fold stratified cross-validation of a two-step pipeline:
#   1. fit an Extra Trees model and record its feature importances,
#   2. keep the features whose importance is >= IMPORTANCE_THRESHOLD and
#      train an L1-penalised, class-weighted logistic regression on them.
# Per-fold balanced accuracy, F1 and ROC AUC are collected and averaged, and
# the fold-averaged importance of every feature is printed.
IMPORTANCE_THRESHOLD = 0.05

# X is a bare ndarray, so the column names are recovered from the source frame
# using the same column drop that produced X.
feature_names = list(crime_2018_cleaned.drop('Arrest', axis=1).columns)

fi = []
Balanced_Acc = []
F1 = []
auc = []
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for trainIndex, testIndex in skf.split(X, y):
    xTrain, xTest = X[trainIndex], X[testIndex]
    # The splitter yields row positions, so index the Series positionally
    # rather than by label.
    yTrain, yTest = y.iloc[trainIndex], y.iloc[testIndex]

    # Seeded so the recorded importances and the selected columns are
    # reproducible.
    model = ExtraTreesClassifier(n_estimators=50, random_state=1)
    model.fit(xTrain, yTrain)
    fi.append(model.feature_importances_)

    # prefit=True reuses the forest fitted above, so the selection matches the
    # importances stored in `fi` and the forest is not trained a second time.
    sfm = SelectFromModel(model, threshold=IMPORTANCE_THRESHOLD, prefit=True)
    xTrain = sfm.transform(xTrain)
    xTest = sfm.transform(xTest)

    # The 'l1' penalty needs a solver that supports it (the default 'lbfgs'
    # does not).
    # NOTE(review): C is fixed at 1 here although the grid-search cell stores
    # its pick in C1. Confirm which value is intended.
    clf = LogisticRegression(C=1, penalty='l1', solver='liblinear',
                             class_weight='balanced', random_state=1)
    clf.fit(xTrain, yTrain)
    yPred = clf.predict(xTest)
    # ROC AUC is a ranking metric, so it is scored on the positive-class
    # probability rather than on the thresholded labels.
    yScore = clf.predict_proba(xTest)[:, 1]

    Balanced_Acc.append(metrics.balanced_accuracy_score(yTest, yPred))
    F1.append(metrics.f1_score(yTest, yPred))
    auc.append(roc_auc_score(yTest, yScore))

# Display the relative importance of each attribute, averaged over the folds.
importance = pd.Series(np.mean(fi, axis=0))
selected_features_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
print(selected_features_importance.sort_values(by='Importance'))
print("Balanced Accuracy: ", np.mean(Balanced_Acc))
print("F1: ", np.mean(F1))
print("AUC: ", np.mean(auc))
