**Dependencies**

In [204]:
# !pip3 install pandas --upgrade
# !pip3 install swifter
# !pip3 install xgboost
# !pip3 install tqdm
# !pip3 install category_encoders
# !pip3 install joblib
# !pip3 install scikit-plot 
# !pip3 install catboost
# !pip3 install RegscorePy

## FUNCTIONS

In [207]:
################################ ALL IMPORTS ################################
import warnings
warnings.filterwarnings('ignore')

# FOR INIT
import pandas as pd
import numpy as np
import swifter
import math

# FOR Visualization
import matplotlib.pyplot as plt

# For Date Engineering
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import itertools
import holidays

#For Text Engineering
import spacy
from collections import Counter
from string import punctuation
from textblob import TextBlob

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)
import nltk
nltk.download('wordnet')

from gensim import corpora, models

# For numeric Engineering
import time

# For Target Encoding
from category_encoders import TargetEncoder

# For feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression
from sklearn.ensemble import ExtraTreesClassifier

# POST PROCESS AND MACHINE LEARNING
from sklearn.preprocessing import PowerTransformer, LabelEncoder, MinMaxScaler, LabelBinarizer
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error,r2_score
from xgboost import XGBClassifier,XGBRegressor
from tqdm import tqdm_notebook as tqdm
import scikitplot as skplt
import joblib

# For debugging
import pdb

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thegeorgejoseph/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**INPUT**

In [210]:
def importFile(path,nrows=None):

    print('#### RUNNING WAIT ####')
    
    # IF THE EXTENSION IS CSV
    def importCsv(path):
        
        print('We have a csv file')
        try:
            df = pd.read_csv(path,low_memory=False,nrows=nrows)
            if df.shape[1] == 1:
                df = pd.read_csv(path,low_memory=False,sep=';',nrows=nrows)                
            print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
            return df       
            
        except FileNotFoundError:
            print('File not found, Check the name, path, spelling mistakes')
            error = True
            return None    
            
        except UnicodeDecodeError:
            try:
                enc = 'unicode_escape'
                df = pd.read_csv(path,encoding=enc,low_memory=False,nrows=nrows)
                print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
                return df
                
            except UnicodeDecodeError:
                try:
                    enc = 'ISO-8859-1'
                    df = pd.read_csv(path,encoding=enc,low_memory=False,nrows=nrows)
                    print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
                    return df
                except:
                    pass
          
        except:
            try:
                df= pd.read_csv(path,nrows=nrows)
                separators= ["~","!","@","#","$","%","^","&","*",":","|","/",";"]     # all possible separators
                if len(df.columns)<=3 :                                               # if separator was "," we would have more than 1 columns 
                    cols = df.columns[0]
                    possibleSep = []   
                    for i in separators:                                    # checking all the separators present in column names
                        if i in cols:
                            possibleSep.append(i)
                        
                    for j in possibleSep:                                   # iterate through possible seprators till we get the correct one
                        df_sep = pd.read_csv(path,sep=j,nrows=nrows)
                        if len(df_sep.columns)>3:
                            print('This file has {} columns and {} rows'.format(df_sep.shape[1],df_sep.shape[0]))
                            return df_sep
            except:
                try:
                    if len(pd.read_csv(path,sep=None).columns,nrows=nrows)>3  :                   # for tab ie "\" tsv files
                        df = pd.read_csv(path,sep=None,nrows=nrows)
                        print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
                        return df
                except:        
                    pass

    # IF THE EXTENSION IS JSON
    def importJSON(path):
        try:
            print('We have a JSON file')
            df = pd.read_json(path)
            print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
            return df    
        except Exception:
            try:
                df = pd.read_json(path,lines=True)
                print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
                return df
        
            except ValueError:
                print('File not found, Check the name, path, spelling mistakes')
                error = True
                return None

    def Excel_handler(dx):
        # to handel cases when some blank rows or other information above the data table gets assumed to be column name 
        if (len([col for col in dx.columns if 'Unnamed' in col]) > 0.5*dx.shape[1]  ):#Checking for unnamed columns 
            colNew = dx.loc[0].values.tolist()           # Getting the values in the first row of the dataframe into a list
            dx.columns = colNew                          #Making values stored in colNew as the new column names
            dx = dx.drop(labels=[0])                     #dropping the row whose values we made as the column names
            dx.reset_index(drop=True, inplace=True)      #resetting index to the normal pattern 0,1,2,3...
        else:
            return dx
            
        new_column_names=dx.columns.values.tolist() # Following three lines of code are for counting the number of null values in our new set of column names
        new_column_names=pd.DataFrame(new_column_names)
        null_value_sum=new_column_names.isnull().sum()[0]
        if null_value_sum<0.5*dx.shape[1]: # if count of null values are less than a certain ratio of total no of columns
            return dx
        while(null_value_sum>=0.5*dx.shape[1]): 
            colNew = dx.loc[0].values.tolist()
            dx.columns = colNew
            dx = dx.drop(labels=[0])
            dx.reset_index(drop=True, inplace=True)
            new_column_names=dx.columns.values.tolist() 
            new_column_names=pd.DataFrame(new_column_names)
            null_value_sum=new_column_names.isnull().sum()[0]
        return dx 

    # IF THE EXTENSION IS XL
    def importExcel(path):
        try:
            print('We have an Excel file')
            df = pd.read_excel(path, sheet_name=None,nrows=nrows)
            if len(df.keys())==1 :                               # checking if number of sheets is 1
                df = Excel_handler(df[list(df.keys())[0]])
                print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
                return df
            else:                                                 # when more than 1 sheets, asking the user for data sheet name/number
                print("Following are the sheets in the Excel file:")
                for c in range(len(df.keys())):
                    print(str(c)+".",list(df.keys())[c])
                sheet = input("Type the sheet name:  ")
                keys = [x.lower() for x in list(df.keys())]
                try:
                    index = keys.index(sheet.lower())
                    df = Excel_handler(df[list(df.keys())[index]])
                    print('This sheet {} has {} columns and {} rows'.format(sheet,df.shape[1],df.shape[0]))
                    return df
                except:
                    print('Sheet not found, Check the name, path, spelling mistakes')
                    error = True
                    return None
        except FileNotFoundError:
            print('File not found, Check the name, path, spelling mistakes')
            error = True
            return None


    def importTable(path):
        try:
            print('We have General Table File')
            df = pd.read_table(path,nrows=nrows)
            if df.shape[1] == 1:
                df = pd.read_table(path,sep=',',nrows=nrows)
            print('This file has {} columns and {} rows'.format(df.shape[1],df.shape[0]))
            return df
        except FileNotFoundError:
            print('File not found, Check the name, path, spelling mistakes')
            error = True
            return None
            
            
    try:
        ext = path.split('.')[1].lower()    
        if ext == 'csv' or ext == 'tsv':
            df = importCsv(path)
            return df
        elif ext == 'json':
            df = importJSON(path)
            return df
        elif 'xl' in ext:
            df = importExcel(path)
            return df
        elif ext == 'data':
            df = importTable(path)
            return df
        else:
            print('File format not supported\n')
    except:
        print('Extension NOT FOUND!')

**GETTING TARGET**

In [213]:
def getTarget(columns):
  
    print('\nEnter \'quit\' to quit')
    target = input('What would you like to predict? : ')
    if target == 'quit':
        return None                    
    elif target in columns:
        print('Target Spotted!')
        return target
    else:
        print('Target {} Not found in the data'.format(target))
        return None

**KEY**

In [216]:
def getKey(columns):
    print('\nEnter \'quit\' to quit')
    key = input('Enter the Key/Identification Column : ')
    if key == 'quit':
        return None,False              
    elif key in columns.values:
        print('Key Spotted!')
        return key
    else:
        print('Key {} Not found in the data'.format(key))
        print('Preview can\'t be shown!!')
        return None

### Find Key if user didn't specify key from first column alone

In [219]:
def findKey(column):
    if 'id' in column.lower():
        dec = input("Is the column \'{}\' an identification column? If yes, enter y : ".format(column))
        if dec == 'y':
            print('Identification column obtained')
            return column
        else:
            print('Identification column not obtained/found')
            return None

**USER SPECIFIED**

In [222]:
def removeUserSpecifiedIDs(df,successiveTarget=False):
    removed_cols = set()
    not_found_cols = set()
    if not successiveTarget:
        print('Would you like to remove any other ID,zip Code,Phone Numbers,UNIQUE lists, ')
        print('Or columns that have only one unique entry? If yes, enter the column names below ')
    else:
        print('Do you think you have Successive Targets based on the current target? If yes, enter the column names below ')
    print('in this format separated by commas: col1,col2,col3')
    cols = input()
    if not cols:
        print('No Columns removed')
        return df
    else:
        try:
            columns = cols.split(',')
            for column in columns:
                if column in df.columns:
                    df.drop(column,axis=1,inplace=True)
                    removed_cols.add(column)
                else:
                    not_found_cols.add(column)
            if removed_cols:
                print('\n{} columns are removed as entered by the user'.format(len(removed_cols)))
            if not_found_cols:
                print('\n{}'.format(not_found_cols))
                print('These columns were not found, hence not removed')
            return df
        except:
            print('Invalid Entry of columns! No Columns removed')
            return df

**Identify Date Columns**

In [225]:
######################## ----------------- DATE IDENTIFICATION --------------------- ######################

# Global List of all months
months = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']

def getDateColumns(df,withPossibilies=0):
    '''
    This method Identifies all columns with 'DATE' data by maximizing out the possibilities
    '''
    # First get all non-numerical Columns
    non_numeric_cols = df.select_dtypes('object')
    # This dictionary stores possibilities of a column containing 'DATES' based on the name of the column
    Possibility = {}
    for column in non_numeric_cols:
        if 'date' in column.lower():                  
            Possibility[column] = int(len(df)*0.1)
        else:
            Possibility[column] = 0
        for entry in df[column]:                                                    # ITERATE THROUGH EVERY ENTRY AND TRY SPLITTING THE VALUE AND INCREMENT/DECREMENT POSSIBILITY 
            try:                                                                      # USING EXCEPTION HANDLING
                if len(entry.split('/')) == 3 or len(entry.split('-')) == 3 or len(entry.split(':')) == 3:
                    Possibility[column] += 1
                    for month in months:
                        if month in entry.lower():
                            Possibility[column] += 1 
                else:
                    Possibility[column] -= 1
            except:
                Possibility[column] -= 1        
      # This contains the final DATE Columns
    DATE_COLUMNS = []
    for key,value in Possibility.items():             
        if value > 0.8 * len(df):                                                  # IF THE POSSIBILITY OF THE COLUMN IN GREATER THAN 1, THEN IT IS DEFINITELY A 'DATE COLUMN'
            DATE_COLUMNS.append(key)    
    if not withPossibilies:
        return DATE_COLUMNS
    else:
        return DATE_COLUMNS,Possibility

**Date Engineering**

In [227]:
def date_engineering(df):
    start = time.time()
    date_cols = df.columns
    print('\n\t Entering Date Engineering')
    df = df.swifter.apply(pd.to_datetime)
    df.fillna(pd.datetime.now(),inplace=True)
    
    # creating separate month and year columns
    for i in date_cols:
        df[str(i)+"_month"] = df[str(i)].dt.month.astype(int)
        df[str(i)+"_year"] = df[str(i)].dt.year.astype(int)
    
    # create difference columns
    if (len(date_cols)>1) :
        for i in itertools.combinations(date_cols,2):
            df[str(i[0])+" - "+str(i[1])]=(df[i[0]]-df[i[1]]).dt.days.astype(int)
    
    # create most recent
    for i in date_cols:
        df[str(i)+"-most_recent"] = (max(df[str(i)])-df[str(i)]).dt.days.astype(int)
    
    print('\n\t #### RUNNING WAIT ####')
    
    # See Near Holiday or not 
    def nearHol(currentDate, us_hols, currentYear):
        new_list = []
        append = new_list.append
        for date, occasion in us_hols:
            if(date.year == currentYear):
                append(date)
        flag = 1
        for i in new_list:
            a = (currentDate.date()-i).days

            if abs(a)<=5:flag =1;break
            else:flag = 0
                
        return 0 if flag == 0 else 1
        
    for col in date_cols:
#         print('LOOP')
        #creating a unique list of all years corresponding to a column to minimise mapping
        us_hols = holidays.US(years=df[str(col)+'_year'].unique(), expand= False) 
        #creating a new columns to check if a date falls near a holiday
        df[str(col)+'_Holiday'] = df.apply(lambda x: nearHol(x[col],us_hols.items(),x[str(col)+'_year']),axis=1) 
    
    end = time.time()
    print('\nDate Engineering Time Taken : {}'.format(end-start))
    print('\n\t #### DONE ####')
    return df.drop(date_cols,axis=1)

### Text Engineering

In [229]:
def findReviewColumns(df): #input main dataframe 
  
  rf = df.sample(n=150, random_state=1).dropna(axis=0) if len(df)>150 else df.dropna(axis=0)#use frac=0.25 to get 25% of the data
  
  #df.dropna(axis=0,inplace=True) #dropping all rows with null values

  

  #categorical_variables = []
  col_list =[]
  for col in rf.columns:
    if df[col].nunique() <100:
      col_list.append(col)           #define threshold for removing unique values #replace with variable threshold
      rf.drop(col, axis=1,inplace=True) #here df contains object columns, no null rows, no string-categorical,

  
  rf.reset_index(drop=True,inplace=True)
  for col in rf.columns:
        count1,count2,count3,count4 = 0,0,0,0
        for i in range(len(rf)):
            val = len(str(rf.at[i,col]).split())
            if val == 1:
                count1 = count1+1
            elif val == 2:
                count2 = count2+1
            elif val == 3:
                count3 = count3+1
            elif val == 4:
                count4 = count4+1
        print(col,"count of words is",count1,"-",count2,"-",count3,"-",count4,"-")
        
        if count1+count2+count3+count4 >=0.75*len(rf):
            col_list.append(col)
            print("dropping column",col)
            rf.drop(col, axis=1,inplace=True)
        
  
         

 
  start = time.time()
  print(rf.shape)
  nlp = spacy.load('en_core_web_sm', disable=['tagger','parser','textcat'])
  sf = pd.DataFrame()
  for col in rf.columns:
    sf[col] = rf[col].apply(nlp)


  end = time.time()
  print("Time taken to tokenize the DataFrame",end - start)

  #print("Tokenised Sampled DataFrame",sf)
  #print("Sampled DataFrame",rf)
  #print("Actual Dataframe",df)

  start = time.time()
  #testf = sf.sample(frac=0.10,random_state=44)
  
  #code to eliminate columns of name, city, address
  for col in sf.columns:
    entity_list =[]
    tokens = nlp(''.join(str(sf[col].tolist()))) #converting one column into tokens
    #print("the tokens of each column are:", tokens)
    token_len = sum(1 for x in tokens.ents)
    print("Length of token entities",token_len)                                    #create two lists that hold the value of actual token entities and matched token entities respectively
    if token_len>0:
      for ent in tokens.ents:
        if (ent.label_ == 'GPE') or (ent.label_ =='PERSON'):  #matching is done on the basis of whether the entity label is 
          entity_list.append(ent.text)          #countries, cities, state, person (includes fictional), nationalities, religious groups, buildings, airports, highways, bridges, companies, agencies, institutes, DATE etc.

      entity_counter = Counter(entity_list).elements()  #counts the match
      counter_length = sum(1 for x in entity_counter) 
      print("Length of matched entities",counter_length) #if there is at least a 50% match, we drop that column TLDR works better on large corpus
      if (counter_length >= 0.60*token_len):
        col_list.append(col)
    else:
      print("Length of token entities 0")
      print("Length of matched entities 0")
    counter_length = 0
    token_len = 0
  

  print("Columns that are going to be removed are ", col_list)   #list of columns that need to be removed
  ##########IMPORTANT LINE NEXT###############
  rf = df.copy() #unhide this to immediately work with the entire dataset and not just sampled dataset and vice-versa to work with sampled
  ##########DO NOT IGNORE ABOVE LINE##########
  for val in col_list:
    rf.drop(val, axis=1, inplace=True) 
  end = time.time()
  print("Time taken for completion of excess column removal:", end-start)

  if (len(rf.columns) ==0):
    print("No Remarks or Comments Found ")
    flag = 0
    return None, None
  else:
    flag = 1

  if (flag == 1):
    main_list = [] #holds all the review columns
    append = main_list.append
    for col in rf.columns:
      append(col)
    
    return main_list, col_list

In [231]:
def sentiment_analysis(rf):
  bf = pd.DataFrame()
  def getSubjectivity(text):
    try:
        return TextBlob(text).sentiment.subjectivity #returns subjectivity of the text
    except:
        return None
    
  def getPolarity(text):
    try:
        return TextBlob(text).sentiment.polarity  #returns polarity of the sentiment
    except:
        return None


  for col in rf.columns:      #creating a new DataFrame with new columns
    col_pname = "{}-{}".format(col,"Polarity")
    col_sname = "{}-{}".format(col,"Subjectivity")
    bf[col_pname] = rf[col].apply(getPolarity)
    bf[col_sname] = rf[col].apply(getSubjectivity)
   
  
  
  return bf

In [232]:
stemmer = SnowballStemmer('english')

In [233]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) #performs lemmatization
    
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:  #removes stopwords and tokens with len>3
            result.append(lemmatize_stemming(token))
    return result

In [234]:
def topicExtraction(df,validation=False,lda_model_tfidf=None):

  data_text = df.copy()
  data_text['index'] = data_text.index
  documents = data_text

  headline = list(documents.columns)[0] #review column

  processed_docs = documents[headline].map(preprocess) #preprocessing review column

  #print("Processed Docs are as follows",processed_docs[:10])

  dictionary = gensim.corpora.Dictionary(processed_docs) #converting into gensim dict
  dictionary.filter_extremes(no_below=10,no_above=0.25, keep_n=1000)   #taking most frequent tokens

  bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs] #document to bag of words
 
  if validation==False:
    #print("BOW Corpus", bow_corpus[:10])
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus] #generating the TF-IDF of the corpus 

    start = time.time()
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=1, workers=10) #multiprocessing Latent Dirilichtion Allocation Model
    end = time.time()
    print(end-start)
    for idx, topic in lda_model_tfidf.print_topics(-1): 
        print('Topic: {} Word: {}'.format(idx, topic)) #printing topics in the corpus


  ser = []
  append = ser.append
  print("Bag of Words Corpus length",len(bow_corpus))
  start = time.time()
  for i in range(len(bow_corpus)):
    for idx, topic in sorted(lda_model_tfidf[bow_corpus[i]], key= lambda tup: -1*tup[1]):
      append(idx)
      break
  end = time.time()
  asf = pd.DataFrame(ser)
  print("Time for append", end-start)  
  

  return asf, lda_model_tfidf
  




**Numeric Engineering**

In [235]:
# Numeric Engineering 1(To be tested)
#For converting allnumeric data in columns like currency remperature, numbers in numeric form etc.. into numeric form
def numeric_engineering(df):
    start = time.time()  
    
    def returnMoney(col):
        # Remove Commas from currencies
        try:
            return pd.to_numeric(col.str.replace(',',''))
        except:
            return col
        
    obj_columns= list(df.dtypes[df.dtypes == np.object].index)
    # print(f'object type columns are {obj_columns}') 
    print(f'\t\t stripping spaces, symbols, and lower casing all entries')
    df[obj_columns]=df[obj_columns].swifter.apply(lambda x: x.astype(str).str.strip(' %$€£¥').str.lower())
    print('done ...')
    print(f'\t\t Replacing empty and invalid strings')
    df[obj_columns]=df[obj_columns].replace(['-','n/a','na','nan','nil',np.inf,-np.inf],[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan])
    print('done ...')
    print(f'\t\t Replacing commas if present in Currencies')
    df[obj_columns]=df[obj_columns].swifter.apply(lambda x:returnMoney(x))
    print('done ...')
    obj_columns= list(df.dtypes[df.dtypes == np.object].index)
    df1 = df[obj_columns].copy()
    print(f'\t\t Finding Numeric Columns')
    df1 = df1.swifter.apply(lambda x : pd.to_numeric(x,errors='coerce'))
    df1.dropna(axis=1,thresh = 0.65*len(df),inplace=True)
    new_num_cols = df1.columns
    df[new_num_cols] = df[new_num_cols].swifter.apply(lambda x : pd.to_numeric(x,errors='coerce'))
    print('done ...')

    for i in df.columns :
        print(f'\t\t   {i} is of type {df[i].dtypes}')

    # # End of Testing codes
    end = time.time();print('Numeric Engineering time taken:',end - start);print('\n')
    return(df)

## Segregation

In [236]:
def Segregation(df):
    print('\n#### Entering Segregation ####')
    start = time.time()
    num = df._get_numeric_data().columns
    obj = list(set(df.columns)-set(num))
    
    nu = df[num].nunique()>5
    numeric = df[nu[nu == True].index]   
    cat_num = df[list(set(num) - set(numeric.columns))]
    numeric.fillna(numeric.mean(),inplace=True)
    cat_num.fillna('missing',inplace=True)
    
    unique = []
    discrete = []
    
    def func(column):
        l=column.value_counts(normalize=True)
        minor=l[l<=0.005].index
        if len(minor) > 0:
            print('\n{} contains {} categories that is/are less than 0.5 percent'.format(column.name, len(minor)))  
            if (column.nunique() - len(minor)) in range(1,61):
                discrete.append(column.name)
                column.replace(minor,'others',inplace=True)
            else:
                unique.append(column.name)
        else:
            discrete.append(column.name)
    
    df[obj].apply(func)
    
    if None in discrete:
        discrete = []
        df[discrete].fillna('missing',inplace=True)
    else:
        df[discrete].fillna('missing',inplace=True)
    
    print('\n Grouped Minor Levels and imputed')
    print('\n The useless columns are {}'.format(unique))
    end = time.time()
    print('Segregation time taken : {}'.format(end-start))
    return numeric,pd.concat([cat_num,df[discrete]],axis=1),unique

**Dataset Selection**

In [237]:
def DatasetSelection(X,Y):
  X1=X.copy()
  X2=X.copy()
  index=list(X.index)
  #Row then column
  X1.dropna(axis=0,thresh=0.5*len(X1.columns),inplace=True)#dropping the rows with many null values
  index1=list(X1.index)#storing the indices of the dataframe after the operation in index1
  X1.dropna(axis=1,thresh=0.5*len(X1),inplace=True)#dropping columns
  if len(X1.columns)==0:#in case if all columns get dropped then in result there should be no rows in the dataframe
    index1=[] #in this case list of row indices equal to null list
  Rowsdrop1=(list(set(index)-set(index1)))#storing the indices of the rows getting dropped above
  #column then row
  X2.dropna(axis=1,thresh=0.5*len(X2),inplace=True)#dropping the columns with many null values
  X2.dropna(axis=0,thresh=0.5*len(X2.columns),inplace=True)#dropping rows
  index2=list(X2.index)#storing its indices in a list
  if len(X2.columns)==0:
    index2=[]
  Rowsdrop2=(list(set(index)-set(index2)))#storing the indices of the rows getting dropped above
  if len(Rowsdrop1)<len(Rowsdrop2): #checking in which case is number of rows getting dropped is lesser
    Y.drop(Rowsdrop1,inplace=True)
    print("Columns are getting dropped first then columns")
    print("The columns getting dropped are {}".format(list(set(X.columns)-set(X1.columns))))
    print("Shape of the dataframe: {}".format(X1.shape))
    print("Shape of the target column {}".format(Y.shape))
    return X1,Y #returns resultant dataframe and target column
  else:
    Y.drop(Rowsdrop2,inplace=True)  
    print("Rows are getting dropped first then rows")
    print("The columns getting dropped are {}".format(list(set(X.columns)-set(X2.columns))))
    print("Shape of the dataframe: {}".format(X2.shape))
    print("Shape of the target column {}".format(Y.shape))
    return X2,Y

**TARGET ANALYSIS**

In [238]:
def targetAnalysis(df):
    print('\n### TARGET ANALYSIS ENTERED ###')
    Type = str(df.dtypes)
    # IF INT OR FLOAT IN TARGET, and IF NUMBER OF UNIQUE IS LESS, CLASSIFICATION, ELSE, REGRESSION
    print('Target has {} unique values'.format(df.nunique()))
    if ('int' in Type) or ('float' in Type):
        if df.nunique() < 5:
            return 'Classification'
        else:
            return 'Regression'
        
    else:
        if df.nunique() < 5:
            return 'Classification'
        else:
            return None

**Sample Equation**

In [239]:
def SampleEquation(X,Y,class_or_Reg):
    if class_or_Reg == 'Classification':# for classification
        from sklearn.linear_model import LogisticRegression
        model=LogisticRegression(max_iter=400)
        kb = SelectKBest( score_func=f_classif,k=8) #for selecting the 8 best features
        if len(X.columns)>8:#to limit the size of equation. restricting to be less than 9 variables 
            kb.fit_transform(X,Y) 
            new_features = []
            mask=kb.get_support() #This returns an array with true /false values with true for those columns which got selected
            for bool, feature in zip(mask,X.columns):#to extract column names from mask
                if bool:
                    new_features.append(feature)
            X=X[new_features]

        model.fit(X,Y)
        if Y.nunique()==2: #if there are only two classes
            for i in range(len(model.coef_)): # for dispaying the equation curresponding to all classes
                s=""
                for j in range(len(model.coef_[i])):
                    s=s+str(model.coef_[i][j])+"*"+X.columns[j]+" + "
                s=s+str(model.intercept_[i])

                print("Power term = "+s+"\n")
                print("Probability(Y=1) = exp(Power term)/(exp(Power term) + 1)\n")
        else:#multiclass classification
            for i in range(len(model.coef_)): # for dispaying the equation curresponding to all classes
                s=""
                for j in range(len(model.coef_[i])):
                    s=s+str((model.coef_[i][j]))+"*"+X.columns[j]+" + "
                s=s+str(model.intercept_[i])

                print("Prediction of class "+ str(model.classes_[i])+"\n\n")
                print("Power term= " + s)
                print("\nPrediction(class={}) = exp(Power term)/(exp(Power term) + 1)\n".format(model.classes_[i]))
    else:#regression problem
        from mlxtend.feature_selection import SequentialFeatureSelector as SFS
        from sklearn.linear_model import LinearRegression
        model=LinearRegression()
        if len(X.columns)>8:#Executing forward feature selection
            sfs = SFS(model,
               k_features=8,
               forward=True,
               floating=False,
               scoring = 'r2',
               cv = 0)
            sfs.fit(X,Y)
            X=X[list(sfs.k_feature_names_)] 
        model.fit(X,Y)
        coeff=model.coef_
        equation=""
        for i in range(len(coeff)):
            equation= equation+str(coeff[i])+"*"+X.columns[i]+" + "
        equation=equation+str(model.intercept_)

        print('Linear Equation is : {}'.format(equation))

## Feature Selection

In [240]:
def FeatureSelection(X,y,class_or_Reg):
    n_est = 10
    if class_or_Reg == 'Classification': 
        selector = XGBClassifier(n_estimators =n_est, max_depth= 6, n_jobs=-1)
        print('runnning classifier selector')
    else : 
        selector = XGBRegressor(n_estimators =n_est, max_depth= 6, n_jobs=-1)
        print('runnning regressor selector')
    
    for i in tqdm(range(10)):
        selector.fit(X, y) 
    
    # all columns container
    cols = pd.DataFrame(X.columns)

    # Getting importance scores of all the features
    k = selector.feature_importances_
    k = k.reshape(X.shape[1],1)
    k = pd.DataFrame(k)

    # threshold one(This thres is able to select only top best features which are very few)
    thresh1 = k.mean(); l = k>thresh1
    sheet1 = pd.concat([cols, k, l], axis =1)
    sheet1.columns = ['col_name','scores1','t/f']
    new_1 = sheet1.loc[(sheet1['t/f'] == False)] 

    # threshold two(The mean of the remaining features is used as a thres)
    thresh2 = new_1['scores1'].mean(); l2 = k>thresh2
    sheet2 = pd.concat([cols, k, l2], axis =1)
    sheet2.columns = ['col_name','scores2','t/f']
    new_2 = sheet2.loc[(sheet2['t/f'] == True)]

    # Final Score Sheet
    new_2 = new_2.sort_values('scores2', ascending=False)
    print('\nThe final score sheet of {} selected columns with importances:\n' .format(new_2.shape[0]))
    print(new_2)
    
    rejected_cols = set(X.columns) - set(new_2.col_name)
    print('\n{} columns are eliminated during Feature Selection which are:\n{}' .format(len(rejected_cols), rejected_cols))
    return list(rejected_cols),new_2.drop(['t/f'],axis=1)

## User Interact Visualization

In [241]:
def bivar_ploter(df1,targ,base_var,ax1):    
      l=[]
      for b in set(df1[targ]):l.append((df1[df1[targ]==b].groupby(base_var).count()[targ]).rename(b))
      c=pd.concat(l,axis=1)
      if(df1[targ].nunique()>5):
          a=list(c.sum(axis=0).sort_values(ascending=False)[:4].index)
          c=pd.concat([c[a],pd.Series(c[list(set(c.columns)-set(a))].sum(axis=1),name='Others')],axis=1)
      if(df1[base_var].dtype==np.object or df1[base_var].nunique()/len(df1)>0.1):
          if(df1[base_var].nunique()<10):a=c.plot(kind='bar',ax=ax1)
          else:a=c.loc[list(c.sum(axis=1).sort_values().index)[-10:]].plot(kind='bar',ax=ax1)
          ax1.set_title(base_var)
      else:
          a=c.plot(kind='line',alpha=0.5,ax=ax1)
      ax1.set_ylabel('Frequency')
      return a

In [242]:
def userInteractVisualization(df1,targ):
        B=list(df1.columns);B.remove(targ);l=[]
        x=df1.apply(lambda x:np.sum(x.value_counts(normalize=True).iloc[:min(10,x.nunique())])<0.10)
        if(df1[targ].nunique()>4 and df1[targ].dtype!=np.object):j=np.sum(df1.dtypes==np.object)-np.sum(x)
        else:j=len(df1.columns)-np.sum(x & df1.dtypes==np.object)-1
        nr=int((j/4)+0.99)
        print('\t Applying bivar_plotting to create Images ...') # For Testing
        start = time.time() 
        fig, axes = plt.subplots(ncols=4,nrows=nr,figsize=(20,6*nr));axes=axes.ravel();i=0
        if(df1[targ].nunique()>5 and df1[targ].dtype!=np.object):        
            for c in (df1.dtypes.loc[(df1.dtypes==np.object).values].index):
                #Plots for cat features done if top 10 unique_values account for >10% of data (else visulaisation is significant)
                if(np.sum(df1[c].value_counts(normalize=True).iloc[:min(10,df1[c].nunique())])<0.10):continue
                try:
                    bivar_ploter(df1,c,targ,axes[i]);i=i+1
                except:
                    pass
        else:    
            for c in B:
                #Plots for cat features done if top 10 unique_values account for >10% of data (else visulaisation is significant)
                if(np.sum(df1[c].value_counts(normalize=True).iloc[:min(10,df1[c].nunique())])<0.10 and df1[c].dtype==np.object):continue
                try:
                    bivar_ploter(df1,targ,c,axes[i]);i=i+1
                except:
                    pass
        for c in range(i,(4*nr)):axes[c].set_visible(False)
        print('\n Target analysis');fig.suptitle(targ);fig.tight_layout();fig.show()
        print(f'\t Done with Bivar plotting in time {time.time() - start} seconds ')

# CLASSIFICATION

In [243]:
import pandas as pd
import os 
import numpy as np
import random
from pprint import pprint
from itertools import combinations
import ast # ast.literal_eval(str(best))
from time import process_time 
import time
from decimal import Decimal

# Model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

#Hyperopt
import hyperopt
from hyperopt import *
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample

#sklearn library
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import matthews_corrcoef
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import classification_report
from sklearn.utils import compute_sample_weight

import xgboost as xgb
from scipy.stats import ks_2samp

class classification:
    
  #This funciton takes input of training and testing datasets and give out the best model's Name, model with best parameter(can be used directly to score data using 'predcit' function), accuracy on the test dataset and parameters (not usefful)
  ###############################################################################################################################
  def best_model_class(self,X_train ,X_test, y_train, y_test,priorList,q_s,MAX_EVALS=15,CV=5):
      df=pd.DataFrame()
      print("Q_S value passed is!!!!",q_s)
      class_weights = list(class_weight.compute_class_weight('balanced',
                                             np.unique(y_train),
                                             y_train))

      class_w= pd.Series(class_weights,index=np.unique(y_train))
      w_array = np.ones(y_train.shape[0], dtype = 'float')
      for i,val in enumerate(y_train):          #create a weight array to enter the booster
        w_array[i] = class_w[val]
     
      maxval = priorList.max()
      print(maxval)
      minval = priorList.min()
      print(minval)
      myval = math.ceil(maxval/minval)
      print(myval)
      
      print("PRIOR LIST IS",priorList)
      flag = 1
      check = 1
      if len(priorList) == 2:
        check =1  #binary classification problem
        for val in priorList:
          if val <= 0.25:
            flag = 0
            check =1  #binary classification problem
      elif len(priorList) >2:
        check =0 #multiclassification problem
        for val in priorList:
          if val <= 0.15:
            flag = 0
            
      if q_s ==True:  #QUICK RESULTS
        ind=0
        best = {}
        #XGBoost
        #######################################################################
        df.loc[ind,'Name']='XGBoost'
        if check == 1:
            df.loc[ind,'model']=xgb.XGBClassifier(n_estimators=100,eta= 0.1,max_depth=16,min_child_weight=2,gamma=5,subsample=0.1,scale_pos_weight=1,eval_metric='logloss')
        elif check ==0:
            df.loc[ind,'model']=xgb.XGBClassifier(n_estimators=100,eta= 0.1,max_depth=16,min_child_weight=2,gamma=5,subsample=0.1,objective="multi:softmax",scale_pos_weight=1,eval_metric='mlogloss',num_class=len(priorList))
            
        df.loc[ind,'param']=str(best)
        Start=time.time()
        eval_set = [(X_test, y_test)]
        if check ==1:
            df.loc[ind,'model'].fit(X_train, y_train, eval_metric="logloss", eval_set=eval_set,verbose=False)
        elif check==0:
            df.loc[ind,'model'].fit(X_train, y_train,sample_weight = w_array, eval_metric="mlogloss", eval_set=eval_set,verbose=False)
        
        xgb_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, xgb_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, xgb_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, xgb_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, xgb_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, xgb_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, xgb_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, xgb_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, xgb_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, xgb_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, xgb_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)       
        print("XGB val done")
        ind=ind+1
        ########################################################################################################
    
        ##Catboost
        ########################################################################################################
        df.loc[ind,'Name']='CatBoost'
        if check==1:
            df.loc[ind,'model']=cb.CatBoostClassifier(depth=10,iterations=1000,learning_rate=0.1,rsm=1.0,auto_class_weights="Balanced")
        elif check==0:
            df.loc[ind,'model']=cb.CatBoostClassifier(depth=10,iterations=1000,learning_rate=0.1,rsm=1.0,auto_class_weights="Balanced",loss_function='MultiClass')
        df.loc[ind,'param']=str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train,eval_set=eval_set,verbose=False)
        catboost_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, catboost_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, catboost_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, catboost_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, catboost_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, catboost_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, catboost_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_testbin, catboost_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, catboost_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, catboost_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, catboost_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("CAT val done")
        ind=ind+1
        ########################################################################################################
        
          
        ##LGBM
        ########################################################################################################
        df.loc[ind,'Name']='Light GBM'
        if check==1:
            df['model'][ind]=lgb.LGBMClassifier(boosting_type='gbdt',class_weight='balanced',learning_rate=0.1,n_estimators=100,random_state=1,subsample=1.0,num_leaves=31,max_depth=16,objective='binary')
        elif check==0:
            df['model'][ind]=lgb.LGBMClassifier(boosting_type='gbdt',class_weight='balanced',learning_rate=0.1,n_estimators=100,random_state=1,subsample=1.0,num_leaves=31,max_depth=16,objective='multiclass',num_class=len(priorList),metric='multi_logloss')
        df.loc[ind,'param']= str(best)
        Start=time.time()
        if check==1:
            df.loc[ind,'model'].fit(X_train, y_train,eval_metric="logloss", eval_set=eval_set,early_stopping_rounds=30,verbose=False)
        elif check==0:
            df.loc[ind,'model'].fit(X_train, y_train,eval_metric="multi_logloss", eval_set=eval_set,early_stopping_rounds=30,verbose=False)
        lightgbm_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, lightgbm_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, lightgbm_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, lightgbm_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, lightgbm_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, lightgbm_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, lightgbm_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, lightgbm_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, lightgbm_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, lightgbm_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, lightgbm_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("LGBM val done")
        ind=ind+1
        
        
        
        ##Random forest
        ########################################################################################################
        df.loc[ind,'Name']='Random Forest'
        df['model'][ind]=RandomForestClassifier(n_estimators=100,max_depth=16,class_weight='balanced')
        df.loc[ind,'param']= str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        randomforest_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, randomforest_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, randomforest_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, randomforest_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, randomforest_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, randomforest_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, randomforest_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, randomforest_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, randomforest_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, randomforest_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, randomforest_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("RF val done")
        ind=ind+1
      
        ########################################################################################################
        
        
        ##ExtraTreesClassifier(2) Finding out accuracy on the test dataset
        ########################################################################################################
        df.loc[ind,'Name']='Extra Trees Classifier'
        df['model'][ind]=ExtraTreesClassifier(n_estimators=100,max_depth=16,class_weight='balanced')
        df.loc[ind,'param']=str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        extra_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, extra_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, extra_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, extra_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, extra_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, extra_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, extra_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, extra_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, extra_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, extra_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, extra_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("ET val done")
        ind=ind+1
        #########################################################################################################
        
        #NaiveBayes
        ########################################################################################################
      
        if(flag == 1):
            best = {'priors': priorList}
            df.loc[ind,'Name']='Naive Bayes'
            df.loc[ind,'model']=GaussianNB(priors = priorList)
            df.loc[ind,'param']=str(best)
            Start=time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            naive_pred = df.loc[ind,'model'].predict(X_test)
            End=time.time()
            df.loc[ind,'accuracy']=accuracy_score(y_test, naive_pred)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, naive_pred))))
            df.loc[ind, 'Precision']=precision_score(y_test, naive_pred,average='weighted')
            df.loc[ind, 'Recall']=recall_score(y_test, naive_pred,average='weighted')
            df.loc[ind, 'F1']=f1_score(y_test, naive_pred,average='weighted')
            if check==1:
                df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, naive_pred,average='weighted')
            #elif check==0:
                #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, naive_pred,average='weighted',multi_class='ovo')
            df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, naive_pred)
            df.loc[ind, 'MCC']=matthews_corrcoef(y_test, naive_pred)
            #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, naive_pred)
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
            print("Naive Bayes done")
            ind=ind+1
        
        
        #Logistic Regression
        ##########################################################################################################
        
        df.loc[ind,'Name']='Logistic Regression'
        df.loc[ind,'model']=LogisticRegression(class_weight='balanced',solver='saga',penalty='l2',random_state=1,max_iter=1000,multi_class ='auto')
        df.loc[ind,'param']=""
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        log_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, log_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, log_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, log_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, log_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, log_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, log_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, log_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, log_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, log_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, log_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("LR val done")
        ind=ind+1
        
        
        
        
        #Neural net
        ########################################################################################################
      
        if(flag == 1):
            best={'hidden_layer_sizes':(50,),'solver':'sgd','learning_rate':'adaptive','max_iter':1000,'early_stopping':True}
            df.loc[ind,'Name']='Neural Net'
            df.loc[ind,'model']=MLPClassifier(**best)
            df.loc[ind,'param']=str(best)
            Start=time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            neural_pred = df.loc[ind,'model'].predict(X_test)
            End=time.time()
            df.loc[ind,'accuracy']=accuracy_score(y_test, neural_pred)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, neural_pred))))
            df.loc[ind, 'Precision']=precision_score(y_test, neural_pred,average='weighted')
            df.loc[ind, 'Recall']=recall_score(y_test, neural_pred,average='weighted')
            df.loc[ind, 'F1']=f1_score(y_test, neural_pred,average='weighted')
            if check==1:
                df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, neural_pred,average='weighted')
            #elif check==0:
                #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, neural_pred,average='weighted',multi_class='ovo')
            df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, neural_pred)
            df.loc[ind, 'MCC']=matthews_corrcoef(y_test, neural_pred)
            #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, neural_pred)
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
            print("NN done")
            ind=ind+1
            
        
        #SVC
        #########################################################################################################
            
        df.loc[ind,'Name']='Support Vector Machine'
        df.loc[ind,'model']= svm.SVC(kernel='linear',max_iter=1000,class_weight='balanced',probability=True,random_state=1)
        df.loc[ind,'param']= str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        support_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, support_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, support_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, support_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, support_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, support_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, support_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, support_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, support_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, support_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, support_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("SVC val done ")
        ind=ind+1
        
        
      elif q_s == False:
        ind=0 
        #XGBoost   
        #########################################################################################################################
        ##XGBoost(1) Finding Best hyperparamter using Bayesian Hyperparameter Optimization
        ########################################################################################################
        def objective(params):
              print(params)
              xg = xgb.XGBClassifier(**params)
              result=cross_val_score(xg,X=X_train,y=y_train,cv=CV,scoring='accuracy',error_score=np.nan,n_jobs=6)
              print("XGB train done")
              print(result.min()*100)
              return (1-result.min())
    
        sample_weight = compute_sample_weight('balanced', y_train)   
        Space = {
              'n_estimators': 100, #scope.int(hp.quniform('n_estimators', 50,500,50)),
              'eta': hp.uniform('eta', 0.01,0.2 ),
              'max_depth': 16, #scope.int(hp.quniform('max_depth',2,16,1 )),
              'min_child_weight':  scope.int(hp.quniform('min_child_weight',1,15,1 )),
              'colsample_bytree': hp.uniform('colsample_bytree', 0.2,1.0 ),
              'gamma': scope.int(hp.quniform('gamma', 0,15,1)),
              'subsample': hp.uniform('subsample',  0.2,1.0  ),
              # 'sample_weight':sample_weight
              }
        if check ==1:
            Space['eval_metric'] = 'logloss'
            if myval >2:
                Space['scale_pos_weight'] = hp.choice('scale_pos_weight',[1,myval-1,myval,myval+1])
            else:
                Space['scale_pos_weight'] = hp.choice('scale_pos_weight',[1,myval])
        elif check==0:
            Space['eval_metric'] = 'mlogloss'
            Space['objective'] = 'multi:softmax'
            Space['num_class'] = len(priorList)

      
        bayes_trials = Trials()
        print("Moving into HyperOp")
        best = fmin(fn=objective, space = Space, algo = hyperopt.tpe.suggest,max_evals=MAX_EVALS, trials = bayes_trials)
        print("HyperOP done for XGB")
        
        best['n_estimators']=100 #int(best['n_estimators'])
        best['max_depth']=20 #int(best['max_depth'])
        best['min_child_weight']=int(best['min_child_weight'])
        best['gamma'] = int(best['gamma'])
        if check==1:
             best['eval_metric']='logloss'
        elif check==0:
             best['eval_metric']='mlogloss'
             best['objective'] = 'multi:softmax'
      
        best['subsample'] = float(best['subsample'])
        if check ==1:
            if myval >2:
                wea = [1,myval-1,myval,myval+1]
                best['scale_pos_weight'] = wea[best['scale_pos_weight']]
            else:
                wea = [1,myval]
                best['scale_pos_weight'] = wea[best['scale_pos_weight']]
        # best['sample_weight']=sample_weight
        print('XGB done')
        ########################################################################################################
      

        ##XGBoost(2) Finding out accuracy on the test dataset
        ########################################################################################################
        df.loc[ind,'Name']='XGBoost'
        df.loc[ind,'model']=xgb.XGBClassifier(**best)
        df.loc[ind,'param']=str(best)
        Start=time.time()
        eval_set = [(X_test, y_test)]
        if check ==1:
            df.loc[ind,'model'].fit(X_train, y_train, eval_metric="logloss",early_stopping_rounds=30, eval_set=eval_set,verbose=False)
        elif check==0:
            df.loc[ind,'model'].fit(X_train, y_train,sample_weight = w_array,early_stopping_rounds=30, eval_metric="mlogloss", eval_set=eval_set,verbose=False)
        
        xgb_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, xgb_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, xgb_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, xgb_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, xgb_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, xgb_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, xgb_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, xgb_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, xgb_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, xgb_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, xgb_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)       
        print("XGB val done")
        ind=ind+1
        ########################################################################################################
        
        #Catboost
        #########################################################################################################################
        df.loc[ind,'Name']='CatBoost'
        if check==1:
            df.loc[ind,'model']=cb.CatBoostClassifier(depth=10,iterations=1000,learning_rate=0.1,rsm=1.0,auto_class_weights="Balanced")
        elif check==0:
            df.loc[ind,'model']=cb.CatBoostClassifier(depth=10,iterations=1000,learning_rate=0.1,rsm=1.0,auto_class_weights="Balanced",loss_function='MultiClass')

        df.loc[ind,'param']=str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train,eval_set=eval_set,verbose=False)
        catboost_pred = df.loc[ind,'model'].predict(X_test).tolist()
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, catboost_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, catboost_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, catboost_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, catboost_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, catboost_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, catboost_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, catboost_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, catboost_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, catboost_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, catboost_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("CAT val done")
        ind=ind+1
        ########################################################################################################
        
        
        #LightGBM(1) Finding Best hyperparamter using Bayesian Hyperparameter Optimization
        ########################################################################################################
        
        
        def objective(params):
            print('\n',params)
            lb = lgb.LGBMClassifier(**params)
            result = cross_val_score(lb,X=X_train,y=y_train,cv=CV,scoring='accuracy',error_score=np.nan,n_jobs=6)
            print("LGBM train done")
            print("\n",result.min()*100)
            return (1-result.min())
    
    
        Space = {
                'boosting_type': 'gbdt',
                'learning_rate': hp.uniform('learning_rate',0.01,0.2),
                'class_weight': 'balanced',
                'n_estimators': 100, #scope.int(hp.quniform('n_estimators',50,1250,75)),
                'random_state':1,
                'subsample': hp.uniform('subsample',  0.1,1.0  ),
                'num_leaves': scope.int(hp.quniform('num_leaves',29,43,1)),
                'max_depth': 16, # scope.int(hp.quniform('max_depth',2,16,1 )),
                'min_child_weight':  scope.int(hp.quniform('min_child_weight',1,16,1 ))
              }
        
        if check==1:
            Space['objective'] = 'binary'
        elif check==0:
            Space['objective'] = 'multiclass'
            Space['num_class'] = len(priorList)
            Space['metric'] = 'multi_logloss'
        
        bayes_trials = Trials()
        print("Moving into HyperOp")
        best = fmin(fn=objective, space = Space, algo = hyperopt.tpe.suggest,max_evals=MAX_EVALS, trials = bayes_trials)
        print("HyperOP done for LGBM")
      
        best['boosting_type'] = 'gbdt'
        best['learning_rate'] = float(best['learning_rate'])
        best['class_weight'] = 'balanced'
        best['n_estimators'] = 100 #int(best['n_estimators'])
        best['random_state'] = 1
        best['subsample'] = float(best['subsample'])
        best['num_leaves'] = int(best['num_leaves'])
        best['min_child_weight']=int(best['min_child_weight'])
        best['max_depth'] = 16 #int(best['max_depth'])
        if check==1:
            best['objective'] = 'binary'
        elif check==0:
            best['objective'] = 'multiclass'
            best['num_class'] = len(priorList)
            best['metric'] = 'multi_logloss'
      
        print("LGBM done")
        
        ########################################################################################################
        ##LGBM(2) Finding out accuracy on the test dataset
        ########################################################################################################
        eval_set = [(X_test, y_test)]
        df.loc[ind,'Name']='Light GBM'
        df['model'][ind]=lgb.LGBMClassifier(**best)
        df.loc[ind,'param']= str(best)
        Start=time.time()
        if check==1:
            df.loc[ind,'model'].fit(X_train, y_train,eval_metric="logloss", eval_set=eval_set,early_stopping_rounds=30,verbose=False)
        elif check==0:
            df.loc[ind,'model'].fit(X_train, y_train,eval_metric="multi_logloss", eval_set=eval_set,early_stopping_rounds=30,verbose=False)
        lightgbm_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, lightgbm_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, lightgbm_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, lightgbm_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, lightgbm_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, lightgbm_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, lightgbm_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, lightgbm_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, lightgbm_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, lightgbm_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, lightgbm_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("LGBM val done")
        ind=ind+1
    
      
      
        #Random Forest
        ########################################################################################################
        ##Random Forest(1) Finding Best hyperparamter using Bayesian Hyperparameter Optimization
        ########################################################################################################
        def objective(params):
            print(params)
            rf = RandomForestClassifier(**params)
            result=cross_val_score(rf,X=X_train,y=y_train,cv=CV,scoring='accuracy',error_score=np.nan,n_jobs=6)
            print("RF train done")
            print(result.min()*100)
            return (1-result.min())
              
        DSpace = {
                  'n_estimators': 100, # scope.int(hp.quniform('n_estimators', 100, 1200,50)),
                  "max_depth": 16, # scope.int(hp.quniform('max_depth',2,20,1)),
                  'max_features': hp.choice('max_features',['auto', 'sqrt','log2']),
                  'min_samples_split': scope.int(hp.quniform('min_samples_split',2,15,1)),
                  'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1,20,1)),
                  'oob_score':False,
                  'bootstrap':  hp.choice('bootstrap',[True, False]),
                  'class_weight':'balanced'
                  }                                                           
      
        bayes_trials = Trials()
        print("Moving into HyperOp")
        try:
            best = fmin(fn = objective, space = DSpace, algo = hyperopt.tpe.suggest,max_evals = MAX_EVALS, trials = bayes_trials)
            print("HyperOP done for RF")
        except:
            print("Hyperparameter tuning failed")
            best.clear()
            best['class_weight']='balanced'
        else: 
            best['n_estimators']=100 #int(best['n_estimators'])
            best['max_depth']= 16 #int(best['max_depth'])
            best['min_samples_split']=int(best['min_samples_split'])
            best['min_samples_leaf']=int(best['min_samples_leaf'])
            fea=['auto', 'sqrt','log2']
            best['max_features']=fea[best['max_features']]
            best['oob_score']= False
            boot=[True, False]
            best['bootstrap']=boot[best['bootstrap']]
            best['class_weight']='balanced'
            print("HyperOP done for RF")
        

        print("RF done")
        ########################################################################################################
      
      

        ##Random forest(2) Finding out accuracy on the test dataset
        ########################################################################################################
        et_dict = best.copy()
        df.loc[ind,'Name']='Random Forest'
        df['model'][ind]=RandomForestClassifier(**best)
        df.loc[ind,'param']= str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        randomforest_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, randomforest_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, randomforest_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, randomforest_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, randomforest_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, randomforest_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, randomforest_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, randomforest_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, randomforest_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, randomforest_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, randomforest_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("RF val done")
        ind=ind+1
      
        ########################################################################################################
        
        #ExtraTreesClassifier
        ########################################################################################################
        df.loc[ind,'Name']='Extra Trees Classifier'
        df['model'][ind]=ExtraTreesClassifier(**et_dict)
        df.loc[ind,'param']=str(et_dict)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        extra_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, extra_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, extra_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, extra_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, extra_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, extra_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, extra_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, extra_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, extra_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, extra_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, extra_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("ET val done")
        ind=ind+1
        #########################################################################################################
      
        #NaiveBayes
        ########################################################################################################
      
        if(flag == 1):
            best = {'priors': priorList}
            df.loc[ind,'Name']='Naive Bayes'
            df.loc[ind,'model']=GaussianNB(priors = priorList)
            df.loc[ind,'param']=str(best)
            Start=time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            naive_pred = df.loc[ind,'model'].predict(X_test)
            End=time.time()
            df.loc[ind,'accuracy']=accuracy_score(y_test, naive_pred)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, naive_pred))))
            df.loc[ind, 'Precision']=precision_score(y_test, naive_pred,average='weighted')
            df.loc[ind, 'Recall']=recall_score(y_test, naive_pred,average='weighted')
            df.loc[ind, 'F1']=f1_score(y_test, naive_pred,average='weighted')
            if check==1:
                df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, naive_pred,average='weighted')
            #elif check==0:
                #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, naive_pred,average='weighted',multi_class='ovo')
            df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, naive_pred)
            df.loc[ind, 'MCC']=matthews_corrcoef(y_test, naive_pred)
            #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, naive_pred)
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
            print("Naive Bayes done")
            ind=ind+1
      
    
        #Logistic regression
        ########################################################################################################
        df.loc[ind,'Name']='Logistic Regression'
        df.loc[ind,'model']=LogisticRegression(class_weight='balanced',solver='saga',penalty='l2',random_state=1,max_iter=1000,multi_class ='auto')
        df.loc[ind,'param']=""
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        log_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, log_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, log_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, log_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, log_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, log_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, log_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, log_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, log_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, log_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, log_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("LR val done")
        ind=ind+1
      
        
        #Neural net
        ########################################################################################################
      
        if(flag == 1):
                best={'hidden_layer_sizes':(50,),'solver':'sgd','learning_rate':'adaptive','max_iter':1000,'early_stopping':True}
                df.loc[ind,'Name']='Neural Net'
                df.loc[ind,'model']=MLPClassifier(**best)
                df.loc[ind,'param']=str(best)
                Start=time.time()
                df.loc[ind,'model'].fit(X_train, y_train)
                neural_pred = df.loc[ind,'model'].predict(X_test)
                End=time.time()
                df.loc[ind,'accuracy']=accuracy_score(y_test, neural_pred)*100
                df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, neural_pred))))
                df.loc[ind, 'Precision']=precision_score(y_test, neural_pred,average='weighted')
                df.loc[ind, 'Recall']=recall_score(y_test, neural_pred,average='weighted')
                df.loc[ind, 'F1']=f1_score(y_test, neural_pred,average='weighted')
                if check==1:
                    df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, neural_pred,average='weighted')
                #elif check==0:
                    #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, neural_pred,average='weighted',multi_class='ovo')
                df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, neural_pred)
                df.loc[ind, 'MCC']=matthews_corrcoef(y_test, neural_pred)
                #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, neural_pred)
                df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
                print("NN done")
                ind=ind+1
    
        #Support Vector Machine(linear)
        ########################################################################################################
            
        df.loc[ind,'Name']='Support Vector Machine'
        df.loc[ind,'model']= svm.SVC(kernel='linear',max_iter=1000,class_weight='balanced',probability=True,random_state=1)
        df.loc[ind,'param']= str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        support_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=accuracy_score(y_test, support_pred)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, support_pred))))
        df.loc[ind, 'Precision']=precision_score(y_test, support_pred,average='weighted')
        df.loc[ind, 'Recall']=recall_score(y_test, support_pred,average='weighted')
        df.loc[ind, 'F1']=f1_score(y_test, support_pred,average='weighted')
        if check==1:
            df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, support_pred,average='weighted')
        #elif check==0:
            #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, support_pred,average='weighted',multi_class='ovo')
        df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, support_pred)
        df.loc[ind, 'MCC']=matthews_corrcoef(y_test, support_pred)
        #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, support_pred)
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
        print("SVC val done ")
        ind=ind+1
      
      
      
      #Ensemble
      ########################################################################################################
      ##Ensemble(1) Finding all possible combination of above model and find out the best combination based on testing data accuracy
      ########################################################################################################
      lev=len(np.unique(y_test))
      arr1=np.empty((len(y_test),lev,0))
      for i in range(0,len(df)):
          arr1=np.dstack((arr1,df.loc[i,'model'].predict_proba(X_test)))

      max_f1=0
      max_seq=0
      for i in range(2,len(df)+1):
          comb=list(combinations(enumerate(np.rollaxis(arr1,axis=2,start=0)), i))
          for j in range(0,len(comb)):
              m=np.empty((len(y_test),lev,0))
              for x in range(0,len(comb[j])):
                  m=np.dstack((m,comb[j][x][1]))
              arr=np.mean(m,axis=2)
              clas=np.argmax(arr,axis=1)
              f1=f1_score(y_test, clas,average='weighted')*100
              seq=np.array(comb[j])[:,0]
              if f1>max_f1:
                  max_f1=f1
                  max_seq=seq
                
      print("this is what you are printing",max_seq) 
      ########################################################################################################

      ##Ensemble(2) List of the best combination from the above method
      ########################################################################################################  
      name=''
      df_en=pd.DataFrame(index = range(1000), columns=['Name','model'])
      for i in range(0,len(max_seq)):
          df_en.at[i,'Name']= df.at[max_seq[i],'Name']
          val = df.at[max_seq[i],'model']
          df_en['model'][i] = val
          name=name+df['Name'][max_seq[i]]+'+'
      
      df_en.dropna(axis=0,inplace=True)
      ########################################################################################################
      
      
      ##Ensemble(3) Making an esemble model of the best combination
      ########################################################################################################
      df.loc[ind,'Name']=('Ensemble '+name)[:-1]
      df.loc[ind,'model']=VotingClassifier(df_en.values, voting='soft',n_jobs=-1)
      df.loc[ind,'param']="Default"
      Start=time.time()
      df.loc[ind,'model'].fit(X_train, y_train)
      ensemble_pred = df.loc[ind,'model'].predict(X_test)
      End=time.time()
      df.loc[ind,'accuracy']=accuracy_score(y_test, ensemble_pred)*100
      df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(accuracy_score(y_test, ensemble_pred))))
      df.loc[ind, 'Precision']=precision_score(y_test, ensemble_pred,average='weighted')
      df.loc[ind, 'Recall']=recall_score(y_test, ensemble_pred,average='weighted')
      df.loc[ind, 'F1']=f1_score(y_test, ensemble_pred,average='weighted')
      if check==1:
          df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, ensemble_pred,average='weighted')
      #elif check==0:
          #df.loc[ind, 'ROC_AUC_score']=roc_auc_score(y_test, ensemble_pred,average='weighted',multi_class='ovo')
      df.loc[ind, 'Kappa']=cohen_kappa_score(y_test, ensemble_pred)
      df.loc[ind, 'MCC']=matthews_corrcoef(y_test, ensemble_pred)
      #df.loc[ind, 'KS_statistic'],df.loc[ind, 'KS_p-value']=ks_2samp(y_test, ensemble_pred)
      df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
      ind=ind+1

      ##Best Model
      ########################################################################################################
      best_info=df.sort_values('F1',ignore_index=True,ascending=False).loc[0,:]
      best_name=best_info['Name']
      best_mod=best_info['model']
      best_acc=best_info['accuracy']
      best_param=best_info['param']
      ########################################################################################################    
    
      joblib.dump(best_info,'best_info')
        
      return best_name,best_mod, best_acc, best_param,df

# REGRESSION

In [244]:
import pandas as pd
import os 
import numpy as np
import random
from pprint import pprint
from itertools import combinations
import ast # ast.literal_eval(str(best))
from time import process_time 
import time
from math import sqrt
from decimal import Decimal

# Model
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor

#Hyperopt
import hyperopt
from hyperopt import *
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample



#sklearn library
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import classification_report

from RegscorePy import aic,bic

  ###############################################################################################################################
class Regression:

  #This funciton takes input of training and testing datasets and give out the best model's Name, model with best parameter(can be used directly to score data using 'predcit' function), accuracy on the test dataset and parameters (not usefful)
  ###############################################################################################################################
  def best_model_reg(self,X_train , X_test, y_train, y_test,q_s,MAX_EVALS=15,CV=5):
      df=pd.DataFrame()
      print("The value of Q_S is ",q_s)
      
      
      
        
      if q_s ==True:  #QUICK RESULTS
        ind=0
        best = {}
        #XGBoost
        #######################################################################
        df.loc[ind,'Name']='XGBoost'
        df.loc[ind,'model']=xgb.XGBRegressor(n_estimators=100,eta=0.1,max_depth=16,min_child_weight=2,gamma=5,subsample=0.1,objective="reg:squarederror",eval_metric='rmse')
        df.loc[ind,'param']=str(best)
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        xgb_reg_prob1 = df.loc[ind,'model'].predict(X_test).tolist()
        print(type(xgb_reg_prob1))
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, xgb_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, xgb_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, xgb_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, xgb_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, xgb_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, xgb_reg_prob1,X_train.shape[1])
        #print("aic done")
        df.loc[ind,'BIC']=bic.bic(y_test, xgb_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
      
        print("XGB Validation done")
        ind=ind+1
        ########################################################################################################
    
        ##Catboost
        ########################################################################################################
        df.loc[ind,'Name']='CatBoost'
        df.loc[ind,'model']=cb.CatBoostRegressor(depth=10,iterations=1000,learning_rate=0.1,rsm=1.0,silent=True)
        df.loc[ind,'param']=str(best)
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        cat_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, cat_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, cat_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, cat_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, cat_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, cat_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, cat_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, cat_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
        print("CAT Validation done")
        ind=ind+1
        ########################################################################################################
        
          
        ##LGBM
        ########################################################################################################
        eval_set = [(X_test, y_test)]
        df.loc[ind,'Name']='Light GBM'
        df['model'][ind]=lgb.LGBMRegressor(boosting_type='gbdt',learning_rate=0.1,n_estimators=100,random_state=1,subsample=1.0,num_leaves=31,max_depth=16)
        df.loc[ind,'param']= str(best)
        Start=time.time()
        df.loc[ind,'model'].fit(X_train, y_train,verbose=False)
        lightgbm_pred = df.loc[ind,'model'].predict(X_test)
        End=time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, cat_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, cat_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, cat_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, cat_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, cat_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, cat_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, cat_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)   
        print("LGBM val done")
        ind=ind+1
        ########################################################################################################
        
        
        ##Random forest
        ########################################################################################################
        df.loc[ind,'Name']='Random Forest'
        df['model'][ind]=RandomForestRegressor(n_estimators=100,max_depth=20)
        df.loc[ind,'param']=str(best)
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        random_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, random_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, random_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, random_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, random_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, random_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, random_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, random_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)       
        print("RF Validation done")
        ind=ind+1
      
        ########################################################################################################
        
        
        ##ExtraTreesClassifier(2) Finding out accuracy on the test dataset
        ########################################################################################################
        df.loc[ind,'Name']='ExtraTrees Regressor'
        df['model'][ind]=ExtraTreesRegressor(n_estimators=100,max_depth=20)
        df.loc[ind,'param']=str(best)
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        extra_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, extra_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, extra_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, extra_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, extra_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, extra_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, extra_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, extra_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
        print("ET Validation done")
        ind=ind+1
        #########################################################################################################
        
        
        #Linear Regression
        ##########################################################################################################
      
        df.loc[ind,'Name']='Linear Regression'
        df.loc[ind,'model']=LinearRegression()
        df.loc[ind,'param']=None
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        logr_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, logr_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, logr_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, logr_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, logr_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, logr_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, logr_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, logr_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)   
      
        print("linear reg done")
        ind=ind+1
        
        #Ridge Regression
        ##########################################################################################################
        
        df.loc[ind,'Name']='Ridge Regression'
        df.loc[ind,'model']=Ridge()
        df.loc[ind,'param']=None
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        ridge_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, ridge_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, ridge_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, ridge_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, ridge_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, ridge_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, ridge_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, ridge_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)     
        print("ridge reg done")
        ind=ind+1

        #Neural net
        ########################################################################################################
      
        best={'hidden_layer_sizes':(50,),'solver':'sgd','learning_rate':'adaptive','max_iter':1000,'early_stopping':True,'n_iter_no_change':30}
        df.loc[ind,'Name']='Neural Net'
        df.loc[ind,'model']=MLPRegressor(**best)
        df.loc[ind,'param']=str(best)
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        mlpc_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        try:
            df.loc[ind,'accuracy']=r2_score(y_test, mlpc_reg_prob1)*100
        except:
            print("Neural Net threw an error")
        else:
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, mlpc_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, mlpc_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, mlpc_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, mlpc_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, mlpc_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, mlpc_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)     

            print("neural net done") 
            ind=ind+1
        
        #SVC
        #########################################################################################################
            
        df.loc[ind,'Name']='Support Vector Machine'
        df.loc[ind,'model']=svm.SVR(kernel='linear',max_iter=1000)
        df.loc[ind,'param']=None
        Start = time.time()
        df.loc[ind,'model'].fit(X_train, y_train)
        svc_reg_prob1 = df.loc[ind,'model'].predict(X_test)
        End = time.time()
        df.loc[ind,'accuracy']=r2_score(y_test, svc_reg_prob1)*100
        df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, svc_reg_prob1))))
        df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, svc_reg_prob1))
        df.loc[ind,'MSE'] = mean_squared_error(y_test, svc_reg_prob1)
        df.loc[ind,'MAE']=mean_absolute_error(y_test, svc_reg_prob1)
        #df.loc[ind,'AIC']=aic.aic(y_test, svc_reg_prob1,X_train.shape[1])
        df.loc[ind,'BIC']=bic.bic(y_test, svc_reg_prob1,X_train.shape[1])
        df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
      
        print("SVC done")
        ind=ind+1
    
      elif q_s==False:
            ind = 0
            #XGBoost
            #########################################################################################################################
            ##XGBoost(1) Finding Best hyperparamter using Bayesian Hyperparameter Optimization
            ########################################################################################################
            def objective(params):
                  print(params)
                  xg = xgb.XGBRegressor(**params)
                  result=cross_val_score(xg,X=X_train,y=y_train,cv=CV,scoring='r2',error_score=np.nan,n_jobs=6)
                  print("XGB Training Done")
                  return (1-result.min())
      
      
            Space = {
                  'n_estimators': 100, # scope.int(hp.quniform('n_estimators', 50,1250,75)),
                  'eta': hp.uniform('eta', 0.01,0.2 ),
                  'max_depth': 20, # scope.int(hp.quniform('max_depth',2,16,1 )),
                  'min_child_weight':  scope.int(hp.quniform('min_child_weight',1,15,1 )),
                  'colsample_bytree': hp.uniform('colsample_bytree', 0.2,1.0 ),
                  'gamma': scope.int(hp.quniform('gamma', 0,15,1)),
                  'eval_metric': 'rmse',
                  'objective': 'reg:linear',
                  'subsample': hp.uniform('subsample',  0.2,1.0  )
              }
      
      
            bayes_trials = Trials()
            best = fmin(fn = objective, space = Space, algo = hyperopt.tpe.suggest,max_evals=MAX_EVALS, trials = bayes_trials)
            print("XGB hyperop done")
      
      
            best['n_estimators']=100 #int(best['n_estimators'])
            best['max_depth']=20 #int(best['max_depth'])
            best['gamma'] = int(best['gamma'])
            best['subsample']= float(best['subsample'])
            best['min_child_weight']=int(best['min_child_weight'])
            best['objective']='reg:squarederror'
            best['eval_metric']='rmse'
            ########################################################################################################
      

            ##XGBoost(2) Finding out accuracy on the test dataset
            ########################################################################################################
            eval_set = [(X_test, y_test)]
            df.loc[ind,'Name']='XGBoost'
            df.loc[ind,'model']=xgb.XGBRegressor(**best)
            df.loc[ind,'param']=str(best)
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train,eval_metric="rmse", eval_set=eval_set,early_stopping_rounds=30,verbose=False)
            xgb_reg_prob1 = df.loc[ind,'model'].predict(X_test).tolist()
            print(type(xgb_reg_prob1))
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, xgb_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, xgb_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, xgb_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, xgb_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, xgb_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, xgb_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, xgb_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)
      
            print("XGB Validation done")
            ind=ind+1
            ########################################################################################################
      
      
            #Catboost
            #########################################################################################################################
            ##Catboost 
            ########################################################################################################
            df.loc[ind,'Name']='CatBoost'
            df.loc[ind,'model']=cb.CatBoostRegressor(depth=10,iterations=1000,learning_rate=0.1,rsm=1.0,silent=True)
            df.loc[ind,'param']=str(best)
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            cat_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, cat_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, cat_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, cat_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, cat_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, cat_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, cat_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, cat_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
            print("CAT Validation done")
            ind=ind+1
            ########################################################################################################
            
            
            #LightGBM
            ########################################################################################################
            ##LightGBM(1) Finding Best hyperparamter using Bayesian Hyperparameter Optimization
            ########################################################################################################
            
            def objective(params):
                  print(params)
                  xg = lgb.LGBMRegressor(**params)
                  result=cross_val_score(xg,X=X_train,y=y_train,cv=CV,scoring='r2',error_score=np.nan,n_jobs=6)
                  print("XGB Training Done")
                  return (1-result.min())
                
            
            Space = {
                'boosting_type': 'gbdt',
                'learning_rate': hp.uniform('learning_rate',0.01,0.2),
                'n_estimators': 100, # scope.int(hp.quniform('n_estimators',50,1250,75)),
                'random_state':1,
                'subsample': hp.uniform('subsample',  0.1,1.0  ),
                'num_leaves': scope.int(hp.quniform('num_leaves',29,43,1)),
                'max_depth': 16, #scope.int(hp.quniform('max_depth',2,16,1 )),
                'min_child_weight':  scope.int(hp.quniform('min_child_weight',1,16,1 ))
              }
            
            bayes_trials = Trials()
            print("Moving into HyperOp")
            best = fmin(fn=objective, space = Space, algo = hyperopt.tpe.suggest,max_evals=MAX_EVALS, trials = bayes_trials)
            print("HyperOP done for LGBM")
            
            best['boosting_type'] = 'gbdt'
            best['learning_rate'] = float(best['learning_rate'])
            best['n_estimators'] = 100 #int(best['n_estimators'])
            best['random_state'] = 1
            best['subsample'] = float(best['subsample'])
            best['num_leaves'] = int(best['num_leaves'])
            best['min_child_weight']=int(best['min_child_weight'])
            best['max_depth'] = 16 #int(best['max_depth'])
      
            print("LGBM done")
      
            ##LightGBM(2) Finding out accuracy on the test dataset
            ########################################################################################################
            eval_set = [(X_test, y_test)]
            df.loc[ind,'Name']='Light GBM'
            df['model'][ind]=lgb.LGBMRegressor(**best)
            df.loc[ind,'param']= str(best)
            Start=time.time()
            df.loc[ind,'model'].fit(X_train, y_train,eval_metric="logloss", eval_set=eval_set,early_stopping_rounds=30,verbose=False)
            lightgbm_pred = df.loc[ind,'model'].predict(X_test)
            End=time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, cat_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, cat_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, cat_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, cat_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, cat_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, cat_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, cat_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)   
            print("LGBM val done")
            ind=ind+1 
            
      
            #Random forest
            ######################################################################################################################### 
            ##Random forest(1) Finding Best hyperparamter using Bayesian Hyperparameter Optimization
            ########################################################################################################
            def objective(params):
                  print(params)
                  rf = RandomForestRegressor(**params)
                  result=cross_val_score(rf,X=X_train,y=y_train,cv=CV,scoring='r2',error_score=np.nan,n_jobs=6)
                  print("Random Forest Training done")
                  return (1-result.min())
              
            Space = {
                      'n_estimators': 100, #scope.int(hp.quniform('n_estimators', 100,1200,50)),
                      "max_depth": 20, # scope.int(hp.quniform('max_depth',2,30,1)),
                      'max_features': hp.choice('max_features',['auto', 'sqrt','log2']),
                      'min_samples_split': scope.int(hp.quniform('min_samples_split',2,15,1)),
                      'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1,20,1)),
                      'oob_score':False,
                      'bootstrap':  hp.choice('bootstrap',[True, False])
                  }                                                           
      
            bayes_trials = Trials()
            try:
                best = fmin(fn = objective, space = Space, algo = hyperopt.tpe.suggest,max_evals = MAX_EVALS, trials = bayes_trials)
                print("HyperOP done for RF")
            except:
                print("Hyperparameter tuning failed")
                best.clear()
                best['oob_score']=False
            else: 
                best['n_estimators']=100 #int(best['n_estimators'])
                best['max_depth']= 20 #int(best['max_depth'])
                best['min_samples_split']=int(best['min_samples_split'])
                best['min_samples_leaf']=int(best['min_samples_leaf'])
                fea=['auto', 'sqrt','log2']
                best['max_features']=fea[best['max_features']]
                best['oob_score']= False
                boot=[True, False]
                best['bootstrap']=boot[best['bootstrap']]
                print("RF Hyperop done")
      
            print("RF done")
            ########################################################################################################
      

            ##Random forest(2) Finding out accuracy on the test dataset
            ########################################################################################################
            et_dict = best.copy()
            df.loc[ind,'Name']='Random Forest'
            df['model'][ind]=RandomForestRegressor(**best)
            df.loc[ind,'param']=str(best)
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            random_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, random_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, random_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, random_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, random_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, random_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, random_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, random_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)       
            print("RF Validation done")
            ind=ind+1
      
            ########################################################################################################
    
    
            #ExtraTrees Regression
            ######################################################################################################################### 
            df.loc[ind,'Name']='ExtraTrees Regressor'
            df['model'][ind]=ExtraTreesRegressor(**et_dict)
            df.loc[ind,'param']=str(best)
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            extra_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, extra_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, extra_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, extra_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, extra_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, extra_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, extra_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, extra_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
            print("ETextra_reg_prob1 Validation done")
            ind=ind+1
      
            ########################################################################################################


            #Ridge Regression
            ########################################################################################################
            df.loc[ind,'Name']='Ridge Regression'
            df.loc[ind,'model']=Ridge()
            df.loc[ind,'param']=None
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            ridge_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, ridge_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, ridge_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, ridge_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, ridge_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, ridge_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, ridge_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, ridge_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)     
            print("ridge reg done")
            ind=ind+1

            #Linear regression
            ########################################################################################################
            df.loc[ind,'Name']='Linear Regression'
            df.loc[ind,'model']=LinearRegression()
            df.loc[ind,'param']=None
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            logr_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, logr_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, logr_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, logr_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, logr_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, logr_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, logr_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, logr_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)   
      
            print("linear reg done")
            ind=ind+1
      
            #Neural net
            ########################################################################################################
            best={'hidden_layer_sizes':(50,),'solver':'sgd','learning_rate':'adaptive','max_iter':1000,'early_stopping':True,'n_iter_no_change':30}
            df.loc[ind,'Name']='Neural Net'
            df.loc[ind,'model']=MLPRegressor(**best)
            df.loc[ind,'param']=str(best)
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            mlpc_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            try:
                df.loc[ind,'accuracy']=r2_score(y_test, mlpc_reg_prob1)*100
            except:
                print("Neural Net threw an error")
            else:
                df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, mlpc_reg_prob1))))
                df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, mlpc_reg_prob1))
                df.loc[ind,'MSE'] = mean_squared_error(y_test, mlpc_reg_prob1)
                df.loc[ind,'MAE']=mean_absolute_error(y_test, mlpc_reg_prob1)
                #df.loc[ind,'AIC']=aic.aic(y_test, mlpc_reg_prob1,X_train.shape[1])
                df.loc[ind,'BIC']=bic.bic(y_test, mlpc_reg_prob1,X_train.shape[1])
                df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)     

                print("neural net done") 
                ind=ind+1
        
      
            #Support Vector Machine
            ########################################################################################################     
            df.loc[ind,'Name']='Support Vector Machine'
            df.loc[ind,'model']=svm.SVR(kernel='linear',max_iter=1000)
            df.loc[ind,'param']=None
            Start = time.time()
            df.loc[ind,'model'].fit(X_train, y_train)
            svc_reg_prob1 = df.loc[ind,'model'].predict(X_test)
            End = time.time()
            df.loc[ind,'accuracy']=r2_score(y_test, svc_reg_prob1)*100
            df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, svc_reg_prob1))))
            df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, svc_reg_prob1))
            df.loc[ind,'MSE'] = mean_squared_error(y_test, svc_reg_prob1)
            df.loc[ind,'MAE']=mean_absolute_error(y_test, svc_reg_prob1)
            #df.loc[ind,'AIC']=aic.aic(y_test, svc_reg_prob1,X_train.shape[1])
            df.loc[ind,'BIC']=bic.bic(y_test, svc_reg_prob1,X_train.shape[1])
            df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
      
            print("SVC done")
            ind=ind+1 
      
      
      
      #Ensemble
      ########################################################################################################
      ##Ensemble(1) Finding all possible combination of above model and find out the best combination based on testing data accuracy
      ########################################################################################################
      arr1=np.empty((len(y_test),0))
      for i in range(0,len(df)):
          arr1=np.hstack((arr1,np.reshape(df.loc[i,'model'].predict(X_test),(len(y_test),1))))

      min_rmse=100000
      max_seq=0
      for i in range(2,len(df)+1):
          comb=list(combinations(enumerate(arr1.T), i))
          for j in range(0,len(comb)):
              m=np.empty((len(y_test),0))
              for x in range(0,len(comb[j])):
                  m=np.hstack((m,np.reshape(comb[j][x][1],(len(y_test),1))))  
              arr=np.mean(m,axis=1)
              rmse= sqrt(mean_squared_error(y_test, arr))
              seq=np.array(comb[j])[:,0]
              if rmse<min_rmse:
                  min_rmse = rmse
                  max_seq=seq
      
      print("this is what you are printing",max_seq)
      
      ##############################################################################

      ##Ensemble(2) List of the best combination from the above method
      ########################################################################################################
      name=''
      df_en=pd.DataFrame(index = range(1000), columns=['Name','model'])
      for i in range(0,len(max_seq)):
        df_en.at[i,'Name']= df.at[max_seq[i],'Name']
        val = df.at[max_seq[i],'model']
        df_en['model'][i] = val
        name=name+df['Name'][max_seq[i]]+'+'
      
    
      df_en.dropna(axis=0,inplace=True)
      ########################################################################################################
      
      
      ##Ensemble(3) Making an esemble model of the best combination
      ########################################################################################################
      df.loc[ind,'Name']=('Ensemble '+name)[:-1]
      df.loc[ind,'model']=VotingRegressor(df_en.values,n_jobs=-1)
      df.loc[ind,'param']="Defualt"
      Start = time.time()
      df.loc[ind,'model'].fit(X_train, y_train)
      ensemble_pred = df.loc[ind,'model'].predict(X_test)
      End = time.time()
      df.loc[ind,'accuracy']=r2_score(y_test, ensemble_pred)*100
      df.loc[ind,'Accuracy%']="{:.2%}".format(Decimal(str(r2_score(y_test, ensemble_pred))))
      df.loc[ind,'RMSE']=sqrt(mean_squared_error(y_test, ensemble_pred))
      df.loc[ind,'MSE']=mean_squared_error(y_test, ensemble_pred)
      df.loc[ind,'MAE']=mean_absolute_error(y_test, ensemble_pred)
      #df.loc[ind,'AIC']=aic.aic(y_test, ensemble_pred,X_train.shape[1])
      df.loc[ind,'BIC']=bic.bic(y_test, ensemble_pred,X_train.shape[1])
      df.loc[ind,'Total time(mins)']= ((End-Start) / 60.0)      
      ind=ind+1
      
      best_info=df.sort_values('RMSE',ignore_index=True,ascending=True).loc[0,:]
      best_name=best_info['Name']
      best_mod=best_info['model']
      best_acc=best_info['accuracy']
      best_param=best_info['param']
    
      joblib.dump(best_info,'best_info')
      
      return best_name,best_mod, best_acc, best_param,df

### Visualization

In [245]:
def Visualization(X,Y,class_or_Reg):
    import pydotplus
    if class_or_Reg == 'Classification':
        from sklearn import tree
        from sklearn.tree import DecisionTreeClassifier
        import matplotlib.pyplot as plt
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        Y = le.fit_transform(Y)#encoding the target variable
        Yt=pd.DataFrame(Y)
        clf = DecisionTreeClassifier(max_depth = 3, min_samples_split=2, min_samples_leaf=0.01, random_state = 1)
        clf.fit(X, Y)
        class_names=list(le.inverse_transform(sorted(Yt[Yt.columns[0]].unique())))
        for i in range(len(class_names)):
            class_names[i]=str(class_names[i])
        print("value=[n1,n2,n3...] where n1,n2,n3 are the number of samples of the classes in the order     \nvalue="+str(le.inverse_transform(sorted(Yt[Yt.columns[0]].unique()))))
        tree.plot_tree(clf,
                     feature_names =X.columns, #the list of all column names
                     class_names=class_names, #list of the class names
                     filled = True,
                     impurity=False,
                     rounded=True,
                     fontsize=10);
        dot_data = tree.export_graphviz(clf, out_file=None, 
                            feature_names=X.columns,  
                            filled=True, impurity=False, rounded=True,  
                            special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_png('CART.png')
        graph.write_svg("CART.svg")

    else:
        from sklearn.tree import DecisionTreeRegressor
        from sklearn import tree
        import matplotlib.pyplot as plt
        clf = DecisionTreeRegressor(max_depth = 3, min_samples_split=2, min_samples_leaf=0.01, random_state = 0)
        clf.fit(X, Y)
        tree.plot_tree(clf,
                   feature_names =X.columns, 
                      filled = True,
                     impurity=False,
                     rounded=True,
                     fontsize=10);
        dot_data = tree.export_graphviz(clf, out_file=None, 
                             feature_names=X.columns,  
                                filled=True, impurity=False, rounded=True,  
                                special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_png('CART.png')
        graph.write_svg("CART.svg")

### Quicker/Slower Results for setting max evals

In [246]:
def quick_slow():
    inp = input('Do you want quick results or slower results? If quick enter y : ').lower()
    return True if inp == 'y' else False

### Get User Input

In [247]:
def getUserInput(df):
    if isinstance(df,pd.DataFrame):
        print('\nDataFrame Succesfully imported\n')
            
        print(df.columns)

        # Get Target from user
        target = getTarget(df.columns)

        if not target:
            # Quit the whole process
            print('\nQuitting Process\n')
            return None
        else: 
            # Get Key Column
            key = getKey(df.columns)
            if not key:
                key = findKey(df.columns[0])
            if key:
                df.drop(key,axis=1,inplace=True)           
                
            # Remove User Specified ID Columns
            df = removeUserSpecifiedIDs(df,True)
                
            # Remove Successive Targets
            df = removeUserSpecifiedIDs(df)
            
            # Quick/Slow results for max evals
            quick = quick_slow()
            if quick:print('DEFAULT MODELS WILL BE RUNNING')
            else:print('HYPERPARAMETER OPTIMISATION WILL BE RUN ON MODELS')
        
        info = {'target':target,'key':key,'cols':df.drop([target],axis=1).columns.to_list(),'q_s':quick}
        
        return info
    else:
        return None

### Remove Classes with 0.05% Occurence

In [248]:
def removeLowClass(targetColumn):
    vc = targetColumn.value_counts(normalize=True)<0.005
    classes = vc[vc==True].index
    if len(classes)!=0:
        print('Classes {} will be removed from the data, as they have less than 0.5% occurence'.format(classes))
        print('and create class imbalance which might affect model performances!')
        return targetColumn.drop(classes).index
    else:
        print('No Class was found less than 0.5% occurence!')
        return None

### Feature Selection Plot

In [249]:
def featureSelectionPlot(feat_df):
    import seaborn as sns
    f = 20
    plt.figure(figsize=(8,8))
    plt.title('Feature Importance Plot',fontsize=f)
    sns.barplot(x='scores2',y='col_name',data=feat_df,palette="Blues_d")
    plt.xlabel('Importance',fontsize=f)
    plt.ylabel('Feature',fontsize=f)
    plt.xticks(fontsize=12,rotation=90)
    plt.yticks(fontsize=12)
    plt.show()

**INIT DATA PREPROCESSING**

In [250]:
def INIT(df,info):
    target = info['target']
    key = info['key']
    cols = info['cols']
    cols.append(target)
    if key:
        cols.append(key)
    df = df[cols]
    # Print columns with missing data in the descending order
    MISSING = pd.DataFrame(((df.isnull().sum().sort_values(ascending=False)*100/len(df)).round(2)),columns=['Missing in %'])[:10]
    print(MISSING)
    
    ############ TARGET NUMERIC ENGINEERING ###########
    print('\n ### Entering Numeric Engineering of Target### \n')
    df = numeric_engineering(df)
    ############ TARGET NUMERIC ENGINEERING ###########

    # Find Classification or Regression
    class_or_Reg = None #INIT
    class_or_Reg = targetAnalysis(df[target])
    if not class_or_Reg:
        print('\nExecution stops as We cant deal with such a target')
        return None,None
    
    if class_or_Reg == 'Classification':
        # Remove Classes with less than 0.05% occurence
        if df[target].nunique() >2:  #removalof low class not done if binary classification
            lowClassRows = removeLowClass(df[target])
            if lowClassRows:
                if len(lowClassRows)!=0:
                    df.drop(lowClassRows,inplace=True)

                if df[target].nunique() == 1:
                    print('For Classification at least 2 classes are expected! Hence Execution Stops!')
                    return None,None
    
    print('{} column needs {}'.format(target,class_or_Reg))
    ues = time.time()
#     if key:userInteractVisualization(df.drop(key,axis=1),target)
#     else:userInteractVisualization(df,target)
    uee = time.time()
    print('Bi/Uni Variate Plotter time taken : {}'.format(uee-ues))

    # Remove all rows with Target Column Empty
    beforeIndex = df.index
    df.dropna(subset=[target],inplace=True)
    afterIndex = df.index
    rowsRemoved = list(set(beforeIndex)-set(afterIndex))
    print('\n {} rows were removed since target had these missing'.format(len(rowsRemoved)))
    del beforeIndex,afterIndex

    ############# TRAIN VALIDATION SPLIT ###########
    if class_or_Reg == 'Classification':                        
        LE = LabelEncoder()                        
        df[target] = LE.fit_transform(df[target])
        try:                      
            train,validation = train_test_split(df,test_size=0.2,random_state=1,stratify=df[target])    
        except:
            train,validation = train_test_split(df,test_size=0.2,random_state=1)               
    else:
        LE = None
        df[target].clip(lower=df[target].quantile(0.1),upper=df[target].quantile(0.9),inplace=True) 
        try:                       
            train,validation = train_test_split(df,test_size=0.2,random_state=1,stratify=df[target])
        except:
            train,validation = train_test_split(df,test_size=0.2,random_state=1)
    ############# TRAIN VALIDATION SPLIT ###########

    X = train.drop(target,axis=1)
    y = train[target]
    if key:
        X.drop(key,axis=1,inplace=True)
    del train
    del df

    # Remove columns and rows with more than 50% missing values
    print('\nRemoving Rows and Columns with more than 50% missing\n')
    X,y = DatasetSelection(X,y)
    
    ######## DATE ENGINEERING #######
    print('\n#### DATE ENGINEERING RUNNING WAIT ####')
    date_cols = getDateColumns(X.sample(1500) if len(X) > 1500 else X)
    print('Date Columns found are {}'.format(date_cols))
    if date_cols:         
        print('Respective columns will undergo date engineering and will be imputed in the function itself')
        print('\n#### DATE ENGINEERING RUNNING WAIT ####')
        try:
            DATE_DF = date_engineering(X[date_cols])
            print(DATE_DF.shape)
            DATE_DF.index = X.index
            X.drop(date_cols,axis=1,inplace=True)
            before = len(DATE_DF.columns)
            DATE_DF.dropna(axis=1,thresh=len(DATE_DF)*0.4,inplace=True)
            after = len(DATE_DF.columns)
            print('{} columns removed due to highly missing'.format(before-after))
            del before,after
            DATE_DF = DATE_DF.fillna(DATE_DF.mean()) 
        except:
            print('#### DATE ENGINEERING HAD ERRORS ####')
            X.drop(date_cols,axis=1,inplace=True)
            DATE_DF = pd.DataFrame(None)
    else:
        DATE_DF = pd.DataFrame(None)
    print(' #### DONE ####') 
    ######## DATE ENGINEERING #######
    
    ######## COLUMN SEGREGATION ########
    print('\n ### Entering Segregation Zone ### \n')
    # Feature Reduction and Segregation of discrete columns

    num_df, disc_df, useless_cols = Segregation(X)
    disc_df = disc_df.astype('category')
    disc_cat = {}
    for column in disc_df:
        disc_cat[column] = disc_df[column].cat.categories

    ############# OUTLIER REMOVAL ###########
    print('\n#### OUTLIER WINSORIZING ####')
    num_df.clip(lower=num_df.quantile(0.1),upper=num_df.quantile(0.9),inplace=True,axis=1)
    print(' #### DONE ####')
    ############# OUTLIER REMOVAL ###########
    
    ######## TEXT ENGINEERING #######
    start1 = time.time()
    start = time.time()
    some_list, remove_list = findReviewColumns(X[useless_cols])  #list1 contains list of usable comment columns, list2 contains list of unusable comment columns
    end = time.time()
    print("Extracting Review Columns time",end-start)
    if (some_list is None):
      TEXT_DF = pd.DataFrame(None)
      lda_models = pd.DataFrame(None)
      print("No review/comment columns found")
    else:
        try:
            print('Respective columns will undergo text engineering and will be imputed in the function itself')
            print('\n#### TEXT ENGINEERING RUNNING WAIT ####')  
            print("The review/comment columns found are", some_list)
            start = time.time()
            sentiment_frame = sentiment_analysis(X[some_list])
            sentiment_frame.fillna(value=0.0,inplace=True)
            print(sentiment_frame)
            #TEXT_DF = pd.concat([df, sentiment_frame], axis=1, sort=False)
            TEXT_DF = sentiment_frame.copy()
            TEXT_DF.reset_index(drop=True,inplace=True)
            end = time.time()
            print("Sentiment time",end-start)
            start = time.time()
            new_frame = X[some_list].copy()
            new_frame.fillna(value="None",inplace=True)
            lda_models = pd.DataFrame(index= range(5),columns=['Model'])
            ind = 0
            
            for col in new_frame.columns:
                topic_frame, lda_model = topicExtraction(new_frame[[col]])
                topic_frame.rename(columns={0:str(col)+"_Topic"},inplace=True)
                print(topic_frame)
                topic_frame.reset_index(drop=True, inplace=True)
                TEXT_DF = pd.concat([TEXT_DF, topic_frame], axis=1, sort=False)
                lda_models['Model'][ind] = lda_model
                ind = ind+1
            end = time.time()
            print("Topic time", end-start)
            X.drop(some_list,axis=1,inplace=True)
            X.drop(remove_list,axis=1, inplace=True)
            lda_models.dropna(axis=0,inplace=True)
        except:
            print('#### TEXT ENGINEERING HAD ERRORS ####')
            X.drop(some_list,inplace=True)
            if(remove_list):
                X.drop(remove_list,axis=1,inplace=True)
            TEXT_DF = pd.DataFrame(None)
            lda_models = pd.DataFrame(None)
    
    end2= time.time()
    
    print("total text analytics time taken =", end2-start1) 
    print("Text Engineering Result", TEXT_DF)
    
    #TEXT_DF holds the columns obtained from Text Engineering and
    #X contains the columns after Text imputation
    
    ########################### TEXT ENGINEERING #############################

    ############# PEARSON CORRELATION ############
    print('\n #### PEARSON CORRELATION ####')
    corr = num_df.corr(method='pearson')                                              
    corr = corr[(corr >= 0.85)]
    for column in corr.columns:                                                   
        corr.loc[column][column] = np.nan    
    corr.dropna(axis=1,how='all',inplace=True)                                    
    corr.dropna(axis=0,how='all',inplace=True)
    removed_cols = []
    if corr.shape!=(0,0):                                                                                                                    

        while corr.shape != (0,0):                                                  
            corr_dict = {}                                                            
            for column in corr.columns:                                               
                corr_dict[corr[column].max()] = column
            try:
                val = max(corr_dict)
                corr.drop(corr_dict[val],inplace=True)
                corr.drop(corr_dict[val],axis=1,inplace=True)
                corr.dropna(axis=1,how='all',inplace=True)
                corr.dropna(axis=0,how='all',inplace=True)
                removed_cols.append(corr_dict[val])
                del corr_dict[val]
            except ValueError:                                                        
                break
    num_df.drop(removed_cols,axis=1,inplace=True)
    print('\n{} columns removed which were highly correlated'.format(len(removed_cols)))
    print('The columns removed are {}'.format(removed_cols))
    print(' #### DONE ####') 
    ############# PEARSON CORRELATION ############
  
    num_df.reset_index(drop=True, inplace=True)
    disc_df.reset_index(drop=True, inplace=True)
    DATE_DF.reset_index(drop=True, inplace=True)
    TEXT_DF.reset_index(drop=True, inplace=True)
    concat_list = [num_df,disc_df,DATE_DF,TEXT_DF]
    X = pd.concat(concat_list,axis=1)
    
    print('\n #### TRANSFORMATIONS ####')
    TE = TargetEncoder(cols=disc_df.columns)
    print('\n #### TARGET ENCODING ####')
    te_start = time.time()
    X = TE.fit_transform(X,y)
    te_end = time.time()
    print('Target Encoding Time taken : {}'.format(te_end-te_start))
    print('\n #### FEATURE SELECTION ####')
    fe_s = time.time()
    rem,feat_df = FeatureSelection(X,y,class_or_Reg)
    fe_e = time.time()
    print('Feature Selection Time taken : {}'.format(fe_e-fe_s))
    X.drop(rem,axis=1,inplace=True)
    fe_s = time.time()
    try:
        featureSelectionPlot(feat_df[:15])
    except:
        print('\nFEATURE SELECTION PLOT DID NOT RUN SUCCESSFULLY!')
    fe_e = time.time()
    print('Feature Selection Plot Time taken : {}'.format(fe_e-fe_s))
    print('\n #### DECISION TREE VISUALIZATION ####')
    try:
        Visualization(X,y,class_or_Reg)
    except:
        print('#### VISUALIZATION DID NOT RUN AND HAD ERRORS ####')
    
    TrainingColumns = X.columns

    print('\n #### Printing Sample Equation of the DATA ####')
    try:
        SampleEquation(X,y,class_or_Reg)
    except:
        print('SAMPLE EQUATION HAD AN ERROR!!!')
    print(' #### DONE ####')

    print('\n #### NORMALIZATION ####')
    MM = MinMaxScaler()
    GG = MinMaxScaler()
    X = MM.fit_transform(X)
    print(' #### DONE ####')
    print('\n #### POWER TRANSFORMATIONS ####')
    PT = PowerTransformer(standardize=True)
    X = pd.DataFrame(PT.fit_transform(X))
    X.columns = TrainingColumns
    X = pd.DataFrame(GG.fit_transform(X))
    X.columns = TrainingColumns
    print(' #### DONE ####')
    

    print('\n #### SAVING MODEL INFORMATION ####')
    init_info = {'NumericColumns':num_df.columns,'NumericMean':num_df.mean().to_dict(),'DiscreteColumns':disc_df.columns,
                'DateColumns':date_cols,'DateFinalColumns':DATE_DF.columns,'DateMean':DATE_DF.mean().to_dict(),
                'TargetEncoder':TE,'MinMaxScaler':MM,'PowerTransformer':PT,'TargetLabelEncoder':LE,'Target':target,
                 'TrainingColumns':TrainingColumns,
                'ML':class_or_Reg,'KEY':key,'X_train':X,'y_train':y,'disc_cat':disc_cat,'q_s':info['q_s'],'some_list':some_list,'remove_list':remove_list,'lda_models':lda_models}
    print(' #### DONE ####')
    return init_info,validation                                                  

**VALIDATION TRAINING AND SCORING**

In [251]:
def validate(df,init_info):
    print('\n\t #### VALIDATION AND SCORING ZONE ####')
    
    X_train = init_info['X_train']
    y_train = init_info['y_train']
    
    if init_info['ML'] == 'Classification':  
        priorList = y_train.value_counts(normalize=True).values
    else:
        priorList = None

    X_test = df.drop(init_info['Target'],axis=1)
    y_test = df[init_info['Target']]

    if init_info['KEY']:
        k_test = df[init_info['KEY']]
        k_test.index = X_test.index
    else:
        k_test = X_test.index
        k_test.name = 'S.No'
    
    date_cols = init_info['DateColumns']
    if date_cols:
        DATE_DF = date_engineering(X_test[date_cols])
        DATE_DF = DATE_DF[init_info['DateFinalColumns']]
        DATE_DF.fillna(init_info['DateMean'],inplace=True)
    else:
        DATE_DF = pd.DataFrame()
    
    if len(init_info['NumericColumns'])!=0:
        num_df = X_test[init_info['NumericColumns']]
        num_df = num_df.swifter.apply(lambda x : pd.to_numeric(x,errors='coerce'))
        num_df.fillna(init_info['NumericMean'],inplace=True)
    else:
        num_df = pd.DataFrame()
        
    if len(init_info['DiscreteColumns'])!=0:
        disc_df = X_test[init_info['DiscreteColumns']]
        disc_cat = init_info['disc_cat']
        for col in disc_df.columns:
            disc_df[col] = disc_df[col].apply(lambda x: x if x in disc_cat[col] else 'others')
        disc_df.fillna('missing',inplace=True)
    else:
        disc_df = pd.DataFrame()
        
    if init_info['remove_list'] is not None:
        X_test.drop(columns=init_info['remove_list'],axis=1,inplace=True)
    
    some_list = init_info['some_list']
    lda_models = init_info['lda_models']
    
    if some_list:
        print("The review/comment columns found are", some_list)
        start = time.time()
        sentiment_frame = sentiment_analysis(X_test[some_list])
        sentiment_frame.fillna(value=0.0,inplace=True)
        print(sentiment_frame)
        TEXT_DF = sentiment_frame.copy()
        TEXT_DF.reset_index(drop=True,inplace=True)
        end = time.time()
        print("Sentiment time",end-start)
        start = time.time()
        new_frame = X_test[some_list].copy()
        new_frame.fillna(value="None",inplace=True)
        ind = 0
        for col in new_frame.columns:
            topic_frame, _ = topicExtraction(new_frame[[col]],True,lda_models['Model'][ind])
            topic_frame.rename(columns={0:str(col)+"_Topic"},inplace=True)
            print(topic_frame)
            topic_frame.reset_index(drop=True, inplace=True)
            TEXT_DF = pd.concat([TEXT_DF, topic_frame], axis=1, sort=False)
            ind = ind+1
        X_test.drop(some_list,axis=1,inplace=True)
    else:
        TEXT_DF = pd.DataFrame()
        
    
    print('\n #### TRANSFORMATION AND PREDICTION ####')
    num_df.reset_index(drop=True, inplace=True)
    disc_df.reset_index(drop=True, inplace=True)
    DATE_DF.reset_index(drop=True, inplace=True)
    TEXT_DF.reset_index(drop=True, inplace=True)
    X_test = pd.concat([num_df,disc_df,DATE_DF,TEXT_DF],axis=1)
    X_test = init_info['TargetEncoder'].transform(X_test)
    X_test = X_test[init_info['TrainingColumns']]
    X_test = X_test.fillna(X_test.mode())
    mm = init_info['MinMaxScaler']
    GG = MinMaxScaler()
    X_test.clip(mm.data_min_,mm.data_max_,inplace=True,axis=1) #Clip the data with training min and max, important
    X_test = mm.transform(X_test)
    X_test = pd.DataFrame(init_info['PowerTransformer'].transform(X_test),columns=init_info['TrainingColumns'])
    X_test = pd.DataFrame(GG.fit_transform(X_test),columns=init_info['TrainingColumns'])
    
    
    print('\n #### PRINTING THE LIST OF COLUMNS AND ITS TYPES THAT ENTER THE MODEL TRAINING ####')
    print('#### PRINTING X_test ####')
    print(X_test.columns)
    print(X_test.dtypes)
    print('\n')
    print(X_test.head(20))
    print('\n\n')
    print('#### PRINTING X_train ####')
    print(X_train.columns)
    print(X_train.dtypes)
    print('\n')
    print(X_train.head(20))
    print('\n\n')
    print(X_test.columns)
    print(X_test.dtypes)
    start = time.time()
    
    X_train.fillna(0,inplace=True)
    X_test.fillna(0,inplace=True)
    y_train.fillna(0,inplace=True)
    y_test.fillna(0,inplace=True)
    ############# MODEL TRAINING #############
    mod,model_info = model_training(X_train,y_train,X_test,y_test,init_info['ML'],priorList,init_info['q_s'])
    ############# MODEL TRAINING #############
    end = time.time()
    print('\nTotal Model Training Time taken : {}'.format(end-start))
    init_info['model']=mod
    init_info['model_info'] = model_info
    del init_info['X_train'],init_info['y_train']                  # This removes the data from dict to avoid storage
    joblib.dump(init_info,'ALL INFORMATION')
    print('MODEL SAVED')
    ############# PREDICTION/SCORING #############

#     xg = XGBClassifier()
#     xg = XGBRegressor()
#     xg.fit(X_train,y_train)
#     y_pred = xg.predict(X_test)  
#     mod = joblib.load('model')
    y_pred = mod.predict(X_test)  
    
    regplotdf=pd.DataFrame()
    regplotdf['y_test']=y_test
    regplotdf['y_pred']=y_pred
    
    if init_info['ML'] == 'Classification':
#         y_probas = xg.predict_proba(X_test)
        y_probas = mod.predict_proba(X_test)
        y_pred = init_info['TargetLabelEncoder'].inverse_transform(y_pred)
        y_test = pd.Series(init_info['TargetLabelEncoder'].inverse_transform(y_test))
        y_probs_cols = ['Class ' + str(x) +' Probabilities' for x in y_test.unique()]
        y_probas = pd.DataFrame(y_probas,columns=y_probs_cols)        
        
        from sklearn.metrics import classification_report
        print(classification_report(y_test,y_pred))

        skplt.metrics.plot_confusion_matrix(y_test, y_pred)
        if len(priorList) ==2:
            skplt.metrics.plot_lift_curve(y_test, y_probas)
            skplt.metrics.plot_cumulative_gain(y_test, y_probas)
        skplt.metrics.plot_roc(y_test, y_probas)
        
    else:
        import seaborn as sns
        import math
        
        #residual plot
        fig1 = sns.residplot('y_test','y_pred',regplotdf)
        plt.xlabel("Predicted Values")
        plt.ylabel("Residuals")
        plt.title("Residual Plot") 
        plt.show(fig1)

        #lm plot
        fig2 = sns.lmplot('y_pred','y_test',regplotdf,fit_reg =True)
        plt.xlabel("Predicted Values")
        plt.ylabel("Actual Values")
        plt.title("Predicted vs Actual") 
        plt.show(fig2)
        
        # decile plot function
        def decileplot(regplotdf):
            div=math.floor(len(regplotdf)/10)
            sorted_df= pd.DataFrame(regplotdf.sort_values('y_test',ascending=False))
            sorted_df['decile']=0
            for i in range(1,11):
                sorted_df.iloc[div*(i-1):div*i,2]= i
            sorted_df = sorted_df[sorted_df.decile != 0]
            df_mean=pd.DataFrame()
            df_mean[['Decile','Actualvalue_mean','Predictedvalue_mean']]=sorted_df.groupby('decile', as_index=False)[['y_test','y_pred']].mean()
            fig, ax1 = plt.subplots(figsize=(10, 7))
            plt.xticks(df_mean['Decile'])
            tidy = pd.melt(df_mean, id_vars='Decile', value_vars= ['Actualvalue_mean','Predictedvalue_mean'],value_name='Mean values per decile')
            sns.lineplot(x='Decile', y='Mean values per decile', hue='variable', data=tidy, ax=ax1)
            print(df_mean)

        # decile plot   
        decileplot(regplotdf)
        
#         return None
#         y_pred = xg.predict(X_test)
#         y_pred = mod.predict(X_test)

        y_probas = pd.Series()
        fig3 = plt.figure()
        plt.plot(y_pred, figure =fig3)
        plt.plot(np.ones(len(y_pred))*y_pred.mean(), figure=fig3)
        plt.show()
        
    ############ PREDICTION/SCORING #############

    preview_length = 100 if len(X_test)>100 else len(X_test)
    preview = pd.DataFrame({k_test.name:k_test.tolist(),
                            'Actual Values':y_test.tolist(),
                            'Predicted Values':y_pred.tolist()})
    if init_info['ML'] == 'Classification':
        preview = pd.concat([preview,y_probas],axis=1)
    preview = preview[:preview_length]
    preview.to_csv('preview.csv',sep=',',index=False)
    print('\nFile Saved as preview.csv')
    print('\nCode executed Successfully')
    print('\n############# END ###########')

## Input for Model Selection

In [252]:
def data_model_select(X_train,y_train):
  if len(X_train) <= 10000:
    input_X_train = X_train
    input_y_train = y_train
  elif len(X_train) > 10000 & len(X_train) <= 100000:
    input_X_train = X_train.sample(frac=0.8, random_state=1)
    input_y_train = y_train.sample(frac=0.8, random_state=1)
  elif len(X_train) > 100000 & len(X_train) < 1000000:
    input_X_train = X_train.sample(frac=0.7, random_state=1)
    input_y_train = y_train.sample(frac=0.7, random_state=1)
  else:
    input_X_train = X_train.sample(frac=0.5, random_state=1)
    input_y_train = y_train.sample(frac=0.5, random_state=1)
  return input_X_train,input_y_train

## Model Training

In [253]:
def model_training(X_train,y_train,X_test,y_test,class_or_Reg,priorList,q_s):
  # Selecting best model
  if class_or_Reg == 'Classification':
    Classification=classification()
    input_X_train,input_y_train=data_model_select(X_train,y_train)
    name,mod,acc,par,model_info = Classification.best_model_class(input_X_train, X_test, input_y_train.values, y_test.values,priorList,q_s)
  else:#Regression
    regression=Regression()
    input_X_train,input_y_train=data_model_select(X_train,y_train)
    name,mod,acc,par,model_info = regression.best_model_reg(input_X_train, X_test, input_y_train, y_test,q_s)
  print('Accuracy :',acc)
  # Saving the model
  filename = 'finalized_model.sav' 
  joblib.dump(mod, filename)
  return mod,model_info

**Input Data File**

In [254]:
############## CHANGE THE FILE NAME HERE ##############
path = input('Enter the path here : ')
error=False
if path:
    df = importFile(path,nrows=20)
    info = getUserInput(df)
    if not info:
        error = True
else:
    df = None
    print('\nQuitting Process\n')
    error = True
############## CHANGE THE FILE NAME HERE ##############


Quitting Process



**RUN THIS**

In [255]:
te = time.time()
try:
    if info:
        ################## TRAINING INIT ##################
        df = importFile(path,nrows=None)
        tts = time.time()
        if isinstance(df,pd.DataFrame):
            init_info,validation = INIT(df,info)
        else:
            init_info,validation = None,None
        tte = time.time()
        print('\n TOTAL TRAINING DATA CLEANING AND PLOTS : {}'.format(tte-tts))
        ################## TRAINING INIT ##################    

        if isinstance(validation,pd.DataFrame):
            ################## VALIDATION AND PREDICTION ##################
            validate(validation,init_info)
            ################## VALIDATION AND PREDICTION ##################
            print('\n\t #### CODE EXECUTED SUCCESSFULLY ####')
            print('\n\t #### END ####')
        else:
            print('\n\t #### CODE DID NOT RUN COMPLETELY ####')
except KeyboardInterrupt:
    print('QUITTING!')
ee = time.time()
print('\n#### TOTAL TIME TAKEN : {} ####'.format(ee-te))

#### RUNNING WAIT ####
Extension NOT FOUND!

 TOTAL TRAINING DATA CLEANING AND PLOTS : 0.019687891006469727

	 #### CODE DID NOT RUN COMPLETELY ####

#### TOTAL TIME TAKEN : 0.020446062088012695 ####


**Preview**

In [256]:
# View Preview Here
pd.read_csv('preview.csv')

Unnamed: 0,customerID,Actual Values,Predicted Values
0,6181-axxyf,1859.10,1784.182490
1,1268-asbga,1375.15,1368.936724
2,4159-naaix,5976.64,5921.268021
3,1219-nnddo,663.55,695.593345
4,9625-qstye,952.30,1040.160835
...,...,...,...
95,9541-pwtwo,4233.95,4232.534491
96,7663-yjhsn,5976.64,5976.746524
97,9445-zueqe,2151.60,2290.599542
98,8515-octjs,692.10,613.531973


# Model Information

In [257]:
model_info=pd.DataFrame()
model_info=joblib.load('ALL INFORMATION')['model_info'].drop(['model','param','accuracy'],axis=1)
model_info.sort_values('Accuracy%',ascending=False,inplace=True)
model_info

Unnamed: 0,Name,Accuracy%,RMSE,MSE,MAE,BIC,Total time(mins)
9,Ensemble CatBoost+Light GBM+Random Forest+Extr...,99.89%,66.651371,4442.405,44.204292,11868.068888,0.123249
4,ExtraTrees Regressor,99.88%,70.567262,4979.738,47.272046,12028.722129,0.006087
1,CatBoost,99.87%,73.1602,5352.415,47.527618,12130.265982,0.102405
2,Light GBM,99.87%,73.1602,5352.415,47.527618,12130.265982,0.001877
3,Random Forest,99.87%,71.267552,5079.064,46.725249,12056.509868,0.011212
0,XGBoost,99.84%,79.844724,6375.18,55.252573,12376.300315,0.010407
8,Support Vector Machine,9.29%,1916.584966,3673298.0,1733.518491,21319.801318,0.00357
5,Ridge Regression,89.98%,636.971291,405732.4,531.011,18219.967489,6.4e-05
6,Linear Regression,89.98%,636.950272,405705.6,530.79377,18219.874629,6e-05
7,Neural Net,-0.00%,2012.299318,4049349.0,1728.911211,21456.936174,0.038652


In [258]:
# validate(validation,init_info)

In [259]:
# import joblib

In [260]:
# X = joblib.load('X')
# y = joblib.load('y')

In [261]:
# date_cols = getDateColumns(X)

In [262]:
# date_cols

In [263]:
# date_engineering(X[date_cols])

In [264]:
# init_info,validation = INIT(df,info)