<a href="https://colab.research.google.com/github/jblancoperez/fiap-desafio/blob/main/C%C3%B3pia_de_00_analise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sys
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import plot_confusion_matrix



from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR




#!{sys.executable} -m pip install -U pandas-profiling[notebook]
#!jupyter nbextension enable --py widgetsnbextension

#!{sys.executable} -m pip install -U lafrom lazypredict.Supervised import LazyClassifier, LazyRegressor

def cm_to_inch(value):
    return value/2.54
#arquivo = pd.read_csv('https://raw.githubusercontent.com/jblancoperez/fiap-desafio/main/solicitacoescredito.csv')

In [None]:
arquivo = pd.read_csv('./solicitacoescredito.csv')
aprovadosAnalista=arquivo.loc[arquivo['status'].isin(['AprovadoAnalista','AprovadoComite'])]
aprovadosAnalista.head()








In [19]:
#Feature selection class to eliminate multicollinearity

#Baseado em  https://www.youtube.com/watch?v=ioXKxulmwVQ&feature=youtu.be
class MultiCollinearityEliminator():
    
    #Class Constructor
    def __init__(self, df, target, threshold):
        self.df = df
        self.target = target
        self.threshold = threshold

    #Method to create and return the feature correlation matrix dataframe
    def createCorrMatrix(self, include_target = False):
        #Checking we should include the target in the correlation matrix
        if (include_target == False):
            df_temp = self.df.drop([self.target], axis =1)
            
            #Setting method to Pearson to prevent issues in case the default method for df.corr() gets changed
            #Setting min_period to 30 for the sample size to be statistically significant (normal) according to 
            #central limit theorem
            corrMatrix = df_temp.corr(method='pearson', min_periods=30).abs()
        #Target is included for creating the series of feature to target correlation - Please refer the notes under the 
        #print statement to understand why we create the series of feature to target correlation
        elif (include_target == True):
            corrMatrix = self.df.corr(method='pearson', min_periods=30).abs()
        return corrMatrix

    #Method to create and return the feature to target correlation matrix dataframe
    def createCorrMatrixWithTarget(self):
        #After obtaining the list of correlated features, this method will help to view which variables 
        #(in the list of correlated features) are least correlated with the target
        #This way, out the list of correlated features, we can ensure to elimate the feature that is 
        #least correlated with the target
        #This not only helps to sustain the predictive power of the model but also helps in reducing model complexity
        
        #Obtaining the correlation matrix of the dataframe (along with the target)
        corrMatrix = self.createCorrMatrix(include_target = True)                           
        #Creating the required dataframe, then dropping the target row 
        #and sorting by the value of correlation with target (in asceding order)
        corrWithTarget = pd.DataFrame(corrMatrix.loc[:,self.target]).drop([self.target], axis = 0).sort_values(by = self.target)                    
        #print(corrWithTarget, '\n')
        return corrWithTarget

    #Method to create and return the list of correlated features
    def createCorrelatedFeaturesList(self):
        #Obtaining the correlation matrix of the dataframe (without the target)
        corrMatrix = self.createCorrMatrix(include_target = False)                          
        colCorr = []
        #Iterating through the columns of the correlation matrix dataframe
        for column in corrMatrix.columns:
            #Iterating through the values (row wise) of the correlation matrix dataframe
            for idx, row in corrMatrix.iterrows():                                            
                if(row[column]>self.threshold) and (row[column]<1):
                    #Adding the features that are not already in the list of correlated features
                    if (idx not in colCorr):
                        colCorr.append(idx)
                    if (column not in colCorr):
                        colCorr.append(column)
        #print(colCorr, '\n')
        return colCorr

    #Method to eliminate the least important features from the list of correlated features
    def deleteFeatures(self, colCorr):
        #Obtaining the feature to target correlation matrix dataframe
        corrWithTarget = self.createCorrMatrixWithTarget()                                  
        for idx, row in corrWithTarget.iterrows():
            #print(idx, '\n')
            if (idx in colCorr):
                self.df = self.df.drop(idx, axis =1)
                break
        return self.df

    #Method to run automatically eliminate multicollinearity
    def autoEliminateMulticollinearity(self):
        #Obtaining the list of correlated features
        colCorr = self.createCorrelatedFeaturesList()                                       
        while colCorr != []:
            #Obtaining the dataframe after deleting the feature (from the list of correlated features) 
            #that is least correlated with the taregt
            self.df = self.deleteFeatures(colCorr)
            #Obtaining the list of correlated features
            colCorr = self.createCorrelatedFeaturesList()                                     
        return self.df

In [17]:
#Remove MultiCollinearity in input
arquivo = pd.read_csv('./solicitacoescredito.csv')
df=arquivo.loc[arquivo['status'].isin(['AprovadoAnalista','AprovadoComite'])]

df=df.drop(['numero_solicitacao','razaoSocial','nomeFantasia','cnpjSemTraco','dataAprovadoEmComite','dataAprovadoNivelAnalista'],axis=1)


invalidDates = [
    '0019-02-06T03:06:00',
    '0001-01-01T03:06:00',
    '0001-01-01T06:12:00',
    '0019-02-06T03:06:00',
    '0219-12-31T03:06:00'
]

for d in invalidDates:
    df['periodoBalanco']=df['periodoBalanco'].replace(d,np.nan)
df['periodoBalanco']=pd.to_datetime(df['periodoBalanco'],errors='coerce')
df['primeiraCompra']=pd.to_datetime(df['primeiraCompra'],errors='coerce')



df['eprimeira']=df['primeiraCompra'].isna()
df=df.drop(['primeiraCompra'],axis=1)

ftRemover = MultiCollinearityEliminator(df,'valorAprovado',0.7)
df=ftRemover.autoEliminateMulticollinearity()
df=df.drop(['periodoBalanco'],axis=1)
df.info()
df=pd.get_dummies(df, columns=["definicaoRisco","empresa_MeEppMei","restricoes","intervaloFundacao","status"], prefix=["definicaoRisco","mei","restricoes","intervaloFundacao","status"])
#Create dummy columns
df=df.fillna(method='bfill',axis=1)
y = df['valorAprovado']
X= df.drop(['valorAprovado'], axis=1)
X = StandardScaler().fit_transform(X)


                             valorAprovado
percentualProtestos               0.008198
eprimeira                         0.014801
dashboardCorrelacao               0.030369
valorSolicitado                   0.048894
duplicatasAReceber                0.049847
anoFundacao                       0.052694
passivoCirculante                 0.056105
margemBrutaAcumulada              0.059794
diferencaPercentualRisco          0.060300
percentualRisco                   0.060300
maiorAtraso                       0.067067
totalAtivo                        0.070653
scorePontualidade                 0.081615
ativoCirculante                   0.090800
prazoMedioRecebimentoVendas       0.099644
totalPatrimonioLiquido            0.101533
endividamento                     0.141683
margemBruta                       0.173502
periodoDemonstrativoEmMeses       0.182527
faturamentoBruto                  0.188339
custos                            0.199530
estoque                           0.205265
limiteEmpre

In [18]:
names = ["Nearest Neighbors", 
         "Linear SVM", 
         "RBF SVM",
         "Gamma SVR"
         #"Gaussian Process",
         "Decision Tree", 
         "Random Forest", 
         "Neural Net", 
         "AdaBoost",
         #"Naive Bayes", 
         #"QDA"
        ]

classifiers = [
    KNeighborsRegressor(3),
    SVR(kernel="linear", C=0.025),
    SVR(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeRegressor(max_depth=5),
    RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1),
    MLPRegressor(alpha=1, max_iter=1000),
    AdaBoostRegressor(),
    #QuadraticDiscriminantAnalysis()
    ]
l=len(classifiers)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)




from sklearn.model_selection import cross_val_score


for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = cross_val_score(clf,X,y,scoring = "neg_mean_squared_error")
    
    from sklearn.metrics import mean_absolute_percentage_error

    print (name)
    print ("mean_squared_error",score)
    score = cross_val_score(clf,X,y,scoring = "neg_mean_absolute_percentage_error")

    print ("mean_absolute_percentage_error",score)
    y_predict=clf.predict(X_test)
    saida = { "test": y_test, "predict" : y_predict}
    newDf= pd.DataFrame(saida)
    newDf.head()
    
    

#plt.xlabel('cake size and toppings')
#plt.ylabel('cake price')
#    predictions = clf.predict(y_test)
#    v1,v2=[],[]
#    for i,prediction in enumerate(predictions):
#        print(f'predicted value : {prediction[0]:.02f} vs target value: {y_test[i][0]}')
#    v1.append(prediction[0])
 #   v2.append(y_test[i][0])
#print(f'R-squared : {model.score(x1_test,y_test)}')
#ax.plot(v1,color='g',linestyle='--')
#ax.plot(v2,color='r',linestyle='--')
#plt.grid(True,linestyle='-',linewidth='0.5')
#plt.show()
#plt.close(f)
    
    


  
    
   




Nearest Neighbors
mean_squared_error [-1.61531909e+11 -9.59256252e+10 -8.59395890e+10 -1.77799597e+11
 -2.62944657e+11]
mean_absolute_percentage_error [-4.12025197e+00 -3.31671967e+18 -3.62572654e+01 -1.17200677e+18
 -1.10883134e+00]
Linear SVM
mean_squared_error [-1.20622699e+11 -1.48622649e+11 -1.80134224e+11 -5.31904438e+11
 -6.13619665e+11]
mean_absolute_percentage_error [-1.38547075e+00 -2.38644983e+17 -2.68139979e+01 -2.09064866e+17
 -1.10742738e+00]
RBF SVM
mean_squared_error [-1.20644613e+11 -1.48631071e+11 -1.80245871e+11 -5.31957074e+11
 -6.13681622e+11]
mean_absolute_percentage_error [-1.39825926e+00 -2.37972404e+17 -2.65675406e+01 -2.08235090e+17
 -1.11341931e+00]
Gamma SVRDecision Tree
mean_squared_error [-4.53917982e+10 -3.42286087e+10 -8.25279025e+09 -3.18694606e+10
 -5.47756642e+10]
mean_absolute_percentage_error [-6.71145870e-01 -4.16623485e+18 -2.77055685e+01 -1.24807120e+18
 -6.34080050e-01]
Random Forest
mean_squared_error [-7.71164882e+10 -8.01582655e+10 -8.3051939



Neural Net
mean_squared_error [-1.43419924e+11 -1.22217246e+11 -1.28118770e+11 -3.74217614e+11
 -4.35819943e+11]




mean_absolute_percentage_error [-4.69532840e+00 -1.94183659e+18 -1.53264952e+01 -1.04897980e+18
 -2.69348171e+00]
AdaBoost
mean_squared_error [-8.52233879e+10 -2.99015269e+10 -3.57981448e+10 -5.75115231e+10
 -1.00916712e+11]
mean_absolute_percentage_error [-1.10618151e+01 -4.21244150e+18 -4.52878580e+02 -1.99408552e+18
 -5.72637403e+00]


In [None]:

from sklearn.metrics import SCORERS
print(SCORERS.keys())

In [None]:
newDf.head()