In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import StackingClassifier

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import joblib
from pickle import dump
import pickle
import os
import shutil
import numpy as np

Import Classifiers

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [5]:
import pyodbc
import re

Create Dataset and vectorize

In [6]:
class DataSetByConnection:
    def __init__(self, server='ec2-52-44-139-108.compute-1.amazonaws.com', db='ddhhqf3lrh36s1', us='', pw='', sql = None):
        driver= '{PostgreSQL ODBC Driver(UNICODE)}'
        self.Connection = pyodbc.connect('DRIVER='+driver+';SERVER='+server+';PORT=5432;DATABASE='+db+';UID='+us+';PWD='+ pw+';sslmode=require;')
        self.dbName = db
        self.sql = sql
        
    def GetDataset(self):
        if(self.sql==None):
            self.sql = "SELECT court_abreviation, decision_date, class, court_entry, court_decisor, court_session, court_sumary FROM public.jurisprudences;"
        
        return pd.read_sql_query(self.sql, self.Connection)

In [7]:
class DFPrepare:
    def __init__(self, server='', db='', us='', pw='', sql = None, yColumName='CourtSumary'):
        self.server = server
        self.db = db
        self.us = us
        self.pw = pw
        self.sql = sql
        self.yColumName = yColumName
        
    def create_clean_df(self):
        dtc = DataSetByConnection(self.server, self.db, self.us, self.pw, sql=self.sql)
        df = dtc.GetDataset()
        print('\r\nQuery executada')
        
        df.columns = ['CourtAbreviation', 'Date', 'Class', 'CourtEntry', 'CourtDecisor', 'CourtSession', 'CourtSumary']
        
        df['Year'] = pd.to_datetime(df['Date'], format='%Y%m%d').dt.year
        
        df.drop(['Date'], axis='columns', inplace=True)

        
        df = self.vectorize(df)
        
        return df
    
    def vectorize(self, df):
        categorical_cols = [a for a in df.columns if a != 'Year']
        
        encoders = []
        encoderNames = []
        for x in categorical_cols:
            le = None
            le = LabelEncoder()
                
            le = le.fit(df[x])
            df[x] = le.transform(df[x])
            
            encoderNames.append(x)
            encoders.append(le)
        
        self.saveLe(encoders, encoderNames)
        
        return df
    
    def saveLe(self, encoders, encoderNames):
        foldername = "label_encoders/"+self.yColumName+"/"
        if not os.path.exists(foldername):
            os.makedirs(foldername)
        
        for encoder, name in zip(encoders, encoderNames):
            filename = foldername+name+'.pkl'
            #pickle.load(filename)
            with open(filename, 'wb') as file:
                dump(encoder, file)

Training dataset with "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes"

In [8]:
class DataTraining:
    def __init__(self, yColumName='CourtSumary', test_size=0.33, random_state=1000, server='', db='', us='', pw='', plot=False, sql = None):
        self.yColumName = yColumName
        self.test_size = test_size
        self.random_state = random_state
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_teste = None
        self.server = server
        self.db = db
        self.us = us
        self.pw = pw
        self.plot = plot
        self.sql = sql
        
    def InitializeDatasets(self):
        dPrepare = DFPrepare( 
                                server=self.server, 
                                db=self.db, 
                                us=self.us, 
                                pw=self.pw, 
                                sql=self.sql, 
                                yColumName=self.yColumName)

        df = dPrepare.create_clean_df()
        
        x_cols = [a for a in df.columns if a != self.yColumName]
        
        print(self.yColumName)
        
        x = df[x_cols]
        y = df[self.yColumName]

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=self.test_size, random_state=self.random_state)
    
    
    def training(self):
        print('\r\nInicializando classificacao: '+self.yColumName)
        print('\r\nInicializando dataset')
        self.InitializeDatasets()
        
        names = ["Nearest Neighbors", #"Linear SVM", "RBF SVM", #"Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

        classifiers = [
            KNeighborsClassifier(),
            #SVC(kernel="linear", C=0.025),
            #SVC(gamma=2, C=1),
            #GaussianProcessClassifier(1.0 * RBF(1.0)),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(),
            MLPClassifier(alpha=1, max_iter=1000),
            AdaBoostClassifier(),
            GaussianNB()]
        
        if(self.plot):
            figure = plt.figure(figsize=(27, 9))
        
        print('\r\nClassficando dados tratados\r\n')
        for name, clf in zip(names, classifiers):
            self.Classify(name, clf)
    
    def Classify(self, name, classifier):
        print('\r\nIniciado treinamento: '+name+'\r\n')
        classifier.fit(self.x_train, self.y_train)
        
        print('\r\nVerificando Score: '+name+'\r\n')
        score = classifier.score(self.x_test, self.y_test)
        
        print('\r\nSalvando modelo: '+name+'\r\n')
        self.save(name, score*100, classifier)
        
        title = name + ' score: ' + str(score*100) + '%'
        
        if(self.plot):
            print('\r\nIniciando plot de graficos: '+name+'\r\n')
            self.plotG(title, classifier)
        else:
            print(title)
        
    def plotG(self, title, classifier):
        fig, ax = plt.subplots()
        ax.title.set_text(title)
        ax.scatter(self.y_test, classifier.predict(self.x_test), edgecolors=(0, 0, 0))
        ax.plot([self.y_test.min(), self.y_test.max()], [self.y_test.min(), self.y_test.max()], 'k--', lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.show()
    
    def save(self, name, score, classifier):
        # save the model to disk
        foldername = "models_save/"+self.yColumName+"/"
        if not os.path.exists(foldername):
            os.makedirs(foldername)
        #else:
            #shutil.rmtree(foldername)
           # os.makedirs(foldername)
            
        filename = foldername+name+'_'+str(int(score))+'_finalized_model.sav'
        joblib.dump(classifier, filename)

### Variaveis para utilizar

In [13]:
Test_size= 0.25
Server='ec2-52-44-139-108.compute-1.amazonaws.com'
Db='ddhhqf3lrh36s1'
Us='gskutxpzrujbls'
Pw='74ae60662e5109135465d0bd124188a54cfbebe2efd4077885692c6efd1b324d'
Plot=False

## Classificando natureza a partir do banco

In [14]:
YColumName= 'CourtSumary'

In [15]:
dt = DataTraining(
                    test_size= Test_size, 
                    server=Server, 
                    db=Db, 
                    us=Us, 
                    pw=Pw, 
                    plot=Plot,
                    yColumName= YColumName
                )

In [16]:
dt.training()


Inicializando classificacao: CourtSumary

Inicializando dataset

Query executada
CourtSumary

Classficando dados tratados


Iniciado treinamento: Nearest Neighbors


Verificando Score: Nearest Neighbors


Salvando modelo: Nearest Neighbors

Nearest Neighbors score: 61.45086461408689%

Iniciado treinamento: Decision Tree


Verificando Score: Decision Tree


Salvando modelo: Decision Tree

Decision Tree score: 63.39097427245888%

Iniciado treinamento: Random Forest


Verificando Score: Random Forest


Salvando modelo: Random Forest

Random Forest score: 65.75284690004217%

Iniciado treinamento: Neural Net


Verificando Score: Neural Net


Salvando modelo: Neural Net

Neural Net score: 61.87262758329819%

Iniciado treinamento: AdaBoost


Verificando Score: AdaBoost


Salvando modelo: AdaBoost

AdaBoost score: 59.46857865879376%

Iniciado treinamento: Naive Bayes


Verificando Score: Naive Bayes


Salvando modelo: Naive Bayes

Naive Bayes score: 60.2277520033741%
