# Proton Scoring/Future Prediction

### Dependencies

In [1]:
# !pip3 install swifter
# !pip3 install xgboost
# !pip3 install tqdm
# !pip3 install category_encoders
# !pip3 install joblib
# !pip3 install scikit-plot
# !pip3 install catboost
# !pip3 install RegscorePy
# !pip3 install -U spacy
# !pip3 install gensim
# !pip3 install xlrd
# !pip3 install lightgbm
# !pip3 install hyperopt
# !pip3 install holidays
# !pip3 install textblob
# !python -m spacy download en_core_web_sm
# Download en_core_web_sm for text analytics

## Imports

In [2]:
from userInputs import importFile,dataHandler,duplicateHandler
from engineerings import *
from all_other_functions import getDF
from score import *
import joblib
import warnings
warnings.filterwarnings('ignore')

## Main Function

In [3]:
def main():
    # IMPORT MODEL    
    model_info = joblib.load('model_info')
    
    # IMPORT SCORING FILE
    dfPath = input('Enter the Scoring File path : ').strip()
    df,_ = importFile(None,None,dfPath)
    df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    df = duplicateHandler(df)
    df,update = dataHandler(df,None) # If first few rows contains unnecessary info
    df = duplicateHandler(df) #calling again if dataHandler drops columns
    print(df.columns)

    # Filter DataFrame based on columns
    df = getDF(df,model_info)
    if not isinstance(df,pd.DataFrame): # If Columns don't match, 
        print('QUITTING!') # QUIT by printing what columns don't match/are not found
        return 0
    
    # Numeric Engineering of DATA
    print('\n#### Entering Numeric Engineering ####\n')
    df = numeric_engineering(df)
    
    # SCORE
    outputFile = score(df,model_info)    
    
    print('Done!')
    return 0

if __name__ == '__main__':
    main()

Enter the Scoring File path : datasets/amlcopy.csv
#### RUNNING WAIT ####
extension is csv
We have a csv file
This file has 323 columns and 28240 rows
Index(['CONSTITUENTSYSTEMID', 'ASKAMOUNT1', 'ASKAMOUNT2', 'ASKAMOUNT3',
       'ASKAMOUNT4', 'ASKAMOUNT5', 'ASKAMOUNT6', 'ASKAMOUNT7', 'ASKAMOUNT8',
       'ASKAMOUNT9',
       ...
       'PDL_DEC_6025', 'CDOLB8', 'CRECL9', 'CTRNA9', 'CDOLL10', 'CRECL10',
       'CRECL12', 'CTRNA12', 'CDOLL10_AVGGIFT', 'CDOLL8_AVGGIFT'],
      dtype='object', length=323)
Columns Match!

#### Entering Numeric Engineering ####

		 stripping spaces, symbols, and lower casing all entries
done ...
		 Replacing empty and invalid strings
done ...
		 Replacing commas if present in Currencies
done ...
		 Finding Numeric Columns
done ...
		   CONSTITUENTSYSTEMID is of type object
		   ASKAMOUNT1 is of type float64
		   ASKAMOUNT2 is of type float64
		   ASKAMOUNT3 is of type float64
		   ASKAMOUNT4 is of type float64
		   ASKAMOUNT5 is of type float64
		   ASKAMOU


	 #### RUNNING WAIT ####

Visualizing Coloumns Generated
 {'CMP_A_FIRST_I_DATE': ['CMP_A_FIRST_I_DATE_month', 'CMP_A_FIRST_I_DATE_year', 'CMP_A_FIRST_I_DATE-today', 'CMP_A_FIRST_I_DATE_nearestHoliday'], 'CMP_A_LAST_I_DATE': ['CMP_A_LAST_I_DATE_month', 'CMP_A_LAST_I_DATE_year', 'CMP_A_LAST_I_DATE-today', 'CMP_A_LAST_I_DATE_nearestHoliday']}

The Following columns were generated to get days between dates of two seperate date columns
 ['CMP_A_FIRST_I_DATE-CMP_A_LAST_I_DATE']

Date Engineering Time Taken : 2.028787136077881

	 #### DONE ####


Pandas Apply:   0%|          | 0/138 [00:00<?, ?it/s]

Shape after Pearsons Correlation (28240, 110)
num_df - (28240, 110)
disc_df - (28240, 62)
DATE_DF - (28240, 9)
TEXT_DF - (0, 0)
LAT_LONG_DF - (0, 0)
EMAIL_DF - (0, 0)
URL_DF - (0, 0)
Applying Target Encoding...
Target Encoding completed
Applying Scaling and Transformations on Validation Set...
Scaling and Transformation completed

This is final shape of X_test : (28240, 110)

 #### PRINTING THE LIST OF COLUMNS AND ITS TYPES THAT ENTER THE MODEL TRAINING ####
#### PRINTING X_test ####
Index(['WEB_TOTAL_NUM_I', 'TSP_INDVEXACTAGEPRSN1', 'TSP_MT_INSURANCESWITCHER',
       'TSP_MT_SATELLITEBUNDLE', 'TSP_MT_ENVDONORS', 'TSP_POP_DENSITY',
       'TSP_POP_PCT1824', 'TSP_HH_PCT2PLUSPRSNHHNONFAM',
       'TSP_WRK_PCTTRANSTOWRKBICYCLE', 'TSP_HU_PCT1UNITDETACHED',
       ...
       'PDOLA15_RATIOCAT', 'WEB_FIRST_PMT_AMT', 'CDOLB5_AVGGIFT',
       'DRM_LAST_CAMPAIGN', 'CDOLB7_AVGGIFT', 'ALL_LAST_PMT_AMT',
       'WEB_LAST_PMT_AMT', 'DRM_LAST_PMT_AMT', 'CDOLA7_RATIOCAT',
       'CDOLL5_AVGGIFT'],
  

0 is present in 60.131% of the Scoring File
1 is present in 39.869% of the Scoring File


File Saved as score.csv

Code executed Successfully

############# END ###########
Done!


## Preview of Predictions

In [4]:
# joblib.load('model_info')

In [5]:
pd.read_csv('score.csv').head(20)

Unnamed: 0,S.No,Predicted Values,Class 0 Probabilities,Class 1 Probabilities
0,00000142-1B60-44B2-8E0C-2482C9A7B50E,0,0.51,0.49
1,0000027E-13B0-4F6F-8419-3FC02C702BF8,0,0.625,0.375
2,0000027E-13B0-4F6F-8419-3FC02C702BF8,0,0.64,0.36
3,00000897-C016-4D02-9393-35F93D7EB431,0,0.605,0.395
4,00000A9B-C1E8-4A64-99C9-59199271DD04,0,0.694,0.306
5,0000138B-08A0-45D7-A3FE-F741A9F40AC8,1,0.416,0.584
6,0000138B-08A0-45D7-A3FE-F741A9F40AC8,1,0.37,0.63
7,0000138B-08A0-45D7-A3FE-F741A9F40AC8,1,0.446,0.554
8,0000138B-08A0-45D7-A3FE-F741A9F40AC8,1,0.432,0.568
9,00003C14-12B4-4F02-B683-65F60181D4DA,0,0.611,0.389
