## Paso 0: Preparación del entorno

#### Importar librerías

In [1]:
#print "IMPORTING LIBRARIES..."
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
import re
import requests
from scipy import stats
from requests.auth import HTTPBasicAuth

%matplotlib inline

In [2]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

#### Descargar ficheros

In [3]:
#DOWNLOADING FILES FROM DROPBOX ON FIRST RUN
import urllib2
import os.path
import time
import random

while not os.path.exists('dev.csv') or not os.path.exists('oot0.csv'):
    time.sleep (3*random.random()); #Sleeping less than 3 seconds before going to Dropbox - avoid too many students at once.
    if not os.path.exists('dev.csv'):
        print "DOWLOADING FILE dev.csv FROM DROPBOX BECAUSE LOCAL FILE DOES NOT EXIST!"
        csvfile = urllib2.urlopen("https://dl.dropboxusercontent.com/u/28535341/dev.csv")
        output = open('dev.csv','wb')
        output.write(csvfile.read())
        output.close()
    if not os.path.exists('oot0.csv'):
        print "DOWLOADING FILE oot0.csv FROM DROPBOX BECAUSE LOCAL FILE DOES NOT EXIST!"
        csvfile = urllib2.urlopen("https://dl.dropboxusercontent.com/u/28535341/oot0.csv")
        output = open('oot0.csv','wb')
        output.write(csvfile.read())
        output.close()  
#DOWNLOADING FILES FROM DROPBOX ON FIRST RUN - END

#### Cargar ficheros

In [4]:
print "LOADING DATASETS..."
df = pd.read_csv("../dev.csv") #DEV-SAMPLE
dfo = pd.read_csv("../oot0.csv") #OUT-OF-TIME SAMPLE

LOADING DATASETS...


In [None]:
# Summary statistics of the development sample (df), shown as the cell output.
df.describe()

In [None]:
# Summary statistics of the out-of-time sample (dfo), shown as the cell output.
dfo.describe()

#### Identificar tipos de datos

In [5]:
print "IDENTIFYING TYPES..."
in_model = []
list_ib = set()  #input binary
list_icn = set() #input categorical nominal
list_ico = set() #input categorical ordinal
list_if = set()  #input numerical continuos (input float)
list_inputs = set()
output_var = 'ob_target' # result used later
for var_name in df.columns:
    if re.search('^i',var_name) and var_name<>'id':
        list_inputs.add(var_name)
        #print var_name,"is input"   
    if re.search('^ib_',var_name):
        list_ib.add(var_name)
        #print var_name,"is input binary"
    elif re.search('^icn_',var_name):
        list_icn.add(var_name)
        #print var_name,"is input categorical nominal"
    elif re.search('^ico_',var_name):
        list_ico.add(var_name)
        #print var_name,"is input categorical ordinal"
    elif re.search('^if_',var_name):
        list_if.add(var_name)
        #print var_name,"is input numerical continuos (input float)"
    elif re.search('^ob_',var_name):
        output_var = var_name
    else:
        print "ERROR: unable to identify the type of:", var_name

IDENTIFYING TYPES...
ERROR: unable to identify the type of: id


## Paso 1: Transformaciones

In [None]:
# Progress banner marking the start of the transformation step.
print "STEP 1: DOING MY TRANSFORMATIONS..."


#### Transforming NaN values

In [6]:
def variablesNaN(data, features):
    """Flag which of `features` contain at least one missing value in `data`.

    Returns a one-row DataFrame (row label 'NaN') with one column per feature:
    the cell is True when the feature has missing values and is left as NaN
    otherwise, so `.dropna(axis=1)` keeps only the features with missing data.
    """
    flags = pd.DataFrame(columns = features, index = ['NaN'])
    for column in features:
        has_missing = data[column].isnull().any()
        if has_missing:
            flags[column] = True

    return flags

# Dev
df_nan = variablesNaN(df, list_inputs)
df_nan.dropna(axis=1).columns

# OoT
dfo_nan = variablesNaN(dfo, list_inputs)
dfo_nan = dfo_nan.dropna(axis=1).columns
dfo_nan

Index([u'ico_var_33', u'ico_var_37', u'ib_var_21', u'ib_var_20', u'ib_var_18',
       u'ib_var_19', u'ib_var_15', u'ib_var_16', u'ib_var_17', u'ico_var_26',
       u'if_var_78', u'icn_var_22', u'icn_var_24'],
      dtype='object')

In [7]:
# Impute the missing values of the out-of-time sample, one strategy per type.
# (Alternatives tried earlier for some types: constant -1, mean, mode.)
for var in list_inputs:
    if not dfo[var].isnull().any():
        continue
    if var in list_ib:
        # Binary flags: -1 acts as an explicit "missing" category.
        dfo[var] = dfo[var].fillna(-1)
    if var in list_ico:
        dfo[var] = dfo[var].fillna(dfo[var].median())
    if var in list_icn:
        dfo[var] = dfo[var].fillna(dfo[var].median())
    if var in list_if:
        # Compute the mode on the observed values only: recent SciPy versions
        # propagate NaN when the input contains NaN, which would make the
        # fillna below a silent no-op.
        observed = dfo[var].dropna()
        if len(observed) > 0:
            mode_var = stats.mode(observed)
            # np.ravel copes with both the array and the scalar result shapes
            # returned by different SciPy versions.
            dfo[var] = dfo[var].fillna(np.ravel(mode_var[0])[0])
        

## Paso 2: Selección de variables

In [23]:
def giniOOT (data, Y_pred):
    """Upload out-of-time predictions to the scoring server and return its Gini.

    data   -- DataFrame with an 'id' column; a 'pred' column is added/overwritten
              in place with `Y_pred` (side effect kept from the original).
    Y_pred -- predicted scores, one per row of `data`.

    Writes the submission to "group_Z_sub0.csv", posts it, parses the
    "...; GINI = <value>" answer and then sleeps 15 seconds before returning.
    Raises ValueError when the server answer does not have that format.
    """
    import os

    data['pred'] = Y_pred
    data_tosend = data[['id','pred']]

    i=0
    filename = "group_Z_sub"+str(i)+".csv"
    data_tosend.to_csv(filename, sep=',')

    url = 'http://mgadi.pythonanywhere.com/api/v1.0/uploadpredictions'

    # NOTE(review): credentials should not live in the notebook. They can now be
    # supplied through GINI_API_USER / GINI_API_PASSWORD; the historical values
    # remain as fallbacks so existing runs keep working - rotate and remove them.
    api_user = os.environ.get('GINI_API_USER', 'josegonzalez')
    api_password = os.environ.get('GINI_API_PASSWORD', 'levante')

    # Close the upload handle deterministically; this function is called once
    # per evaluated individual, so leaked handles would pile up.
    with open(filename, 'rb') as upload:
        files = {'file': (filename, upload)}
        rsub = requests.post(url, files=files, auth=HTTPBasicAuth(api_user, api_password))
    resp_str = str(rsub.text)

    # Expected answer looks like "KS2 = 0.43; GINI = 0.56".
    try:
        ginioot = resp_str.split(";")[1]
        ginivalue = float(ginioot.split("=")[1])
    except (IndexError, ValueError):
        raise ValueError("Unexpected response from prediction server: %r" % resp_str)
    time.sleep(15)
    return ginivalue


#### Genetic Algorithm

In [None]:
# Progress banner marking the start of the genetic-algorithm feature selection.
print "GENETIC ALGORITHM FOR FEATURE SELECTION:"

from deap import creator, base, tools, algorithms #GENETIC ALGORITHM LIBRARY - requirement: pip install deap
import random
from sklearn import metrics
import time
from math import sqrt

#####
#SETTING UP THE GENETIC ALGORITHM and CALCULATING STARTING POOL (STARTING CANDIDATE POPULATION)
#####
# Single-objective fitness with a positive weight, i.e. the value is maximised.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a plain list of genes that carries a FitnessMax fitness.
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
# Each gene is a random 0/1 flag ...
toolbox.register("attr_bool", random.randint, 0, 1)
# ... and an individual has one gene per candidate input variable.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(list_inputs))
# A population is a list of such individuals; its size is given at call time.
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evalOneMax(individual):
    """Return the number of active genes of `individual` as a 1-tuple fitness."""
    active_genes = sum(individual)
    return (active_genes,)

# Genetic operators used by the main loop.
# NOTE(review): "evaluate" is evalOneMax, so the fitness seen by the tournament
# selection is the count of 1-genes, not the Gini - confirm this is intended.
toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

NPOPSIZE = 50 #RANDOM STARTING POOL SIZE
# Random starting pool of NPOPSIZE individuals.
population = toolbox.population(n=NPOPSIZE)


#####
#ASSESSING GINI ON THE STARTING POOL
#####

def calculateGini(actual, pred, cmpcol = 0, sortcol = 1):
    """Gini coefficient of the scores `pred` against the outcomes `actual`.

    Observations are ordered by decreasing prediction (ties keep their original
    order); the result is the mean cumulative share of `actual` captured along
    that ordering minus the share expected from a random ordering. The value is
    not normalised by the Gini of a perfect model.

    actual, pred    -- equal-length sequences of numbers.
    cmpcol, sortcol -- unused; kept so existing callers keep working.

    Returns a float; it is NaN when `actual` sums to zero (0/0 division).
    """
    assert( len(actual) == len(pred) )
    # Columns: outcome, score, original position (used as tie-breaker).
    # The builtin float replaces the np.float alias that NumPy has removed.
    table = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=float)
    # lexsort uses the LAST key as primary: descending score, then position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    totalLosses = table[:,0].sum()
    giniSum = table[:,0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

# Every individual evaluated so far, keyed by "<gini>;<DNA string>".
dic_gini={}

# Names of the variables of the best individual; filled in after the main loop.
df_var = set()

# Freeze the gene -> variable mapping: gene k of an individual is input_names[k].
input_names = list(list_inputs)

for i in range(len(population)):
    individual = population[i]

    # TRANSLATING DNA INTO LIST OF VARIABLES
    # Walk over every gene of the individual. The previous version iterated over
    # the population size instead, silently ignoring the genes beyond that index.
    var_model = [input_names[k] for k in range(len(individual)) if individual[k]==1]
    if not var_model:
        # An individual with no active gene cannot be fitted; skip it.
        continue

    # ASSESSING GINI INDEX FOR EACH INDIVIDUAL IN THE INITIAL POOL
    X_train=df[var_model]
    Y_train=df[output_var]
    Xo_train=dfo[var_model]

    ######
    # CHANGE_HERE - START: MODELLING TECHNIQUE USED TO SCORE AN INDIVIDUAL.
    #####
    rfc = RandomForestClassifier(n_estimators=100, min_weight_fraction_leaf=0.00000000001, class_weight={0: 1, 1: 8},
                                        criterion='entropy')
    model = rfc.fit(X_train, Y_train)
    Y_predict = model.predict_proba(X_train)[:,1]
    ######
    # CHANGE_HERE - END
    #####

    ######
    # CHANGE_HERE - START: SELECTION METRIC = SQRT(DEV_GINI*OOT_GINI)
    #####

    # OOT GINI (one upload to the scoring server per individual)
    Yo_predict = model.predict_proba(Xo_train)[:,1]
    gini_oot = giniOOT (dfo, Yo_predict)

    # DEV GINI (measured on the training data itself)
    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
    auc = metrics.auc(fpr, tpr)
    gini_dev = abs(2*auc-1)

    # CALC - a negative OOT gini would make sqrt raise; score it as 0 instead.
    gini_power = sqrt(max(gini_dev*gini_oot, 0.0))

    ######
    # CHANGE_HERE - END
    #####

    # Store the score against THIS individual. The previous version indexed the
    # population with the stale inner-loop counter, so every score was attached
    # to the last individual of the pool.
    gini=str(gini_power)+";"+str(individual).replace('[','').replace(', ','').replace(']','')
    dic_gini[gini]=individual

# Rank by the numeric score (ties broken by the key text), not by string order.
list_gini=sorted(dic_gini.keys(), key=lambda k: (float(k.split(';')[0]), k), reverse=True)


#####
#GENETIC ALGORITHM MAIN LOOP - START
# - ITERATING MANY TIMES UNTIL NO IMPROVEMENT HAPPENS IN ORDER TO FIND THE OPTIMAL SET OF CHARACTERISTICS (VARIABLES)
#####
sum_current_gini=0.0
sum_current_gini_1=0.0
sum_current_gini_2=0.0
first=0
OK = 1
a=0

while OK:  #REPEAT UNTIL IT DO NOT IMPROVE, AT LEAST A LITLE, THE GINI IN 2 GENERATIONS
    a=a+1
    print 'loop ', a
    OK=0

    ####
    # GENERATING OFFSPRING - START
    ####
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1) #CROSS-X PROBABILITY = 50%, MUTATION PROBABILITY=10%
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population =toolbox.select(offspring, k=len(population))
    ####
    # GENERATING OFFSPRING - END
    ####

    sum_current_gini_2=sum_current_gini_1
    sum_current_gini_1=sum_current_gini
    sum_current_gini=0.0

    #####
    #ASSESSING GINI ON THE OFFSPRING - START
    #####
    for j in range(np.shape(population)[0]): 
        if population[j] not in dic_gini.values(): 
            var_model = [] 
            for i in range(np.shape(population)[0]): 
                if (population[j])[i]==1:
                    var_model.append(list(list_inputs)[i])
            
            X_train=df[var_model]
            Y_train=df[output_var]
            Xo_train=dfo[var_model]
            
            ######
            # CHANGE_HERE - START: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
            #####
            rfc = RandomForestClassifier(n_estimators=100, min_weight_fraction_leaf=0.00000000001, class_weight={0: 1, 1: 8},
                                        criterion='entropy')
            model = rfc.fit(X_train, Y_train)
            Y_predict = model.predict_proba(X_train)[:,1]

            #lr = sm.Logit(Y_train, X_train)
            #model=lr.fit()
            #Y_predict=model.predict(X_train)
            
            ######
            # CHANGE_HERE - END: YOU ARE VERY LIKELY USING A DIFFERENT TECHNIQUE BY NOW. SO CHANGE TO YOURS.
            #####            
                       
            
            ######
            # CHANGE_HERE - START: HERE IT USES THE DEVELOPMENT GINI TO SELECT VARIABLES, YOU SHOULD A DIFFERENT GINI. 
            # EITHER THE OOT GINI OR THE SQRT(DEV_GINI*OOT_GINI)
            #####
    
            # OOT GINI
            Yo_predict = model.predict_proba(Xo_train)[:,1]
            gini_oot = giniOOT (dfo, Yo_predict)
    
            # DEV GINI
            fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
            auc = metrics.auc(fpr, tpr)
            gini_dev = abs(2*auc-1)
            #gini_power = abs(2*auc-1)
    
            # CALC
            gini_power = sqrt(gini_dev*gini_oot)
               
            ######
            # CHANGE_HERE - END: HERE IT USES THE DEVELOPMENT GINI TO SELECT VARIABLES, YOU SHOULD A DIFFERENT GINI. 
            # EITHER THE OOT GINI OR THE SQRT(DEV_GINI*OOT_GINI)
            #####                       
           
            gini=str(gini_power)+";"+str(population[j]).replace('[','').replace(', ','').replace(']','')
            dic_gini[gini]=population[j]  
    #####
    #ASSESSING GINI ON THE OFFSPRING - END
    #####

    #####
    #SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING - START
    #####           
    list_gini=sorted(dic_gini.keys(),reverse=True)
    population=[]
    for i in list_gini[:NPOPSIZE]:
        population.append(dic_gini[i])
        gini=float(i.split(';')[0])
        sum_current_gini+=gini
    #####
    #SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING - END
    #####           
      
    #HAS IT IMPROVED AT LEAST A LITLE THE GINI IN THE LAST 2 GENERATIONS
    print 'sum_current_gini=', sum_current_gini, 'sum_current_gini_1=', sum_current_gini_1, 'sum_current_gini_2=', sum_current_gini_2
    if(sum_current_gini>sum_current_gini_1+0.0001 or sum_current_gini>sum_current_gini_2+0.0001):
        OK=1
#####
#GENETIC ALGORITHM MAIN LOOP - END
#####

gini_max=list_gini[0]        
gini=float(gini_max.split(';')[0])
features=gini_max.split(';')[1]


####
# PRINTING OUT THE LIST OF FEATURES
#####
f=0
for i in range(len(features)):
    if features[i]=='1':
        f+=1
        df_var.add(list(list_inputs)[i])
        print 'feature ', f, ':', list(list_inputs)[i]
print 'gini: ', gini


#### Seleccionar variables para el modelo

Selecciono las variables elegidas por el algoritmo genético (`df_var`), es decir, las del individuo con mejor Gini.

In [28]:
print "SELECTING CHARACTERISTICS TO ENTER INTO THE MODEL..."

# Earlier alternative: use every input, or a hand-picked list.
#in_model = list_inputs #['ib_var_1','icn_var_22','ico_var_25','if_var_65']

# Use the variables collected in df_var by the genetic-algorithm cell.
in_model = df_var

# Display the selection as the cell output.
in_model

SELECTING CHARACTERISTICS TO ENTER INTO THE MODEL...


['ico_var_49',
 'if_var_70',
 'ico_var_33',
 'ico_var_37',
 'ico_var_36',
 'ico_var_35',
 'ico_var_34',
 'ib_var_21',
 'ib_var_20',
 'ico_var_30',
 'if_var_67',
 'ico_var_62',
 'ico_var_38',
 'ico_var_51',
 'ico_var_53',
 'ico_var_55',
 'ico_var_54',
 'ico_var_58',
 'ico_var_61',
 'if_var_79',
 'ib_var_19',
 'ico_var_45',
 'ib_var_15',
 'ib_var_10',
 'ib_var_11',
 'ib_var_12',
 'ib_var_13',
 'if_var_74',
 'ico_var_25',
 'ico_var_26',
 'if_var_78',
 'ico_var_43',
 'ico_var_40',
 'ico_var_41',
 'ico_var_46',
 'ib_var_8',
 'ib_var_9',
 'ib_var_6',
 'ib_var_7',
 'ib_var_2',
 'ib_var_3',
 'ib_var_1',
 'ico_var_60',
 'ico_var_59',
 'if_var_66',
 'icn_var_23',
 'icn_var_22',
 'ico_var_48',
 'icn_var_24']

## Paso 3: Desarrollo del modelo

In [29]:
print "STEP 3: DEVELOPING THE MODEL..."

# Filtrado de variables

X = df[list(set(in_model))]
y = df[output_var]
Xo = dfo[list(set(in_model))]


STEP 3: DEVELOPING THE MODEL...


#### Modelo 1

In [30]:
# NOTE(review): cutoff is not used anywhere in the visible cells - confirm it is still needed.
cutoff = 0.8

# Final model: same random-forest settings as in the genetic-algorithm
# evaluation, but with 7000 trees instead of 100. Positive class weighted 8:1.
# NOTE(review): no random_state is set, so results vary between runs.
clf = RandomForestClassifier(n_estimators=7000, 
                             min_weight_fraction_leaf=0.00000000001,
                             class_weight={0: 1, 1: 8},
                             #class_weight='balanced',
                             criterion='entropy')
# Fit on the development sample built in the previous step.
model = clf.fit(X,y)


#### Modelo 2

#### Predicción

In [31]:
try:
    y_pred  = model.predict_proba(X)
    y_pred = y_pred[:,1]
    yo_pred = model.predict_proba(Xo)
    yo_pred = yo_pred[:,1]    
    
except np.linalg.linalg.LinAlgError as err:
    if 'Singular matrix' in err.message:
        print "MODEL-INVALID (Singular Matrix)"
    else:
        raise


## Paso 4: Enviar resultados

In [32]:
print "STEP 4: ASSESSING THE MODEL..."
# CALCULATING GINI PERFORMANCE ON DEVELOPMENT SAMPLE
from sklearn.metrics import roc_auc_score
gini_score = 2*roc_auc_score(df[output_var], y_pred)-1
print "GINI DEVELOPMENT=", gini_score

print "STEP 5: SUBMITTING THE RESULTS..."
dfo['pred'] = yo_pred
dfo_tosend = dfo[list(['id','pred'])]

i=3
filename = "group_Z_sub"+str(i)+".csv"
dfo_tosend.to_csv(filename, sep=',')

url = 'http://mgadi.pythonanywhere.com/api/v1.0/uploadpredictions'

files = {'file': (filename, open(filename, 'rb'))}
rsub = requests.post(url, files=files, auth=HTTPBasicAuth('josegonzalez', 'xx'))
resp_str = str(rsub.text)
print "RESULT SUBMISSION: ", resp_str


STEP 4: ASSESSING THE MODEL...
GINI DEVELOPMENT= 1.0
STEP 5: SUBMITTING THE RESULTS...
RESULT SUBMISSION:  KS2 = 0.432624031509; GINI = 0.566824691655
