In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectPercentile as SP
from yellowbrick.model_selection import ValidationCurve, LearningCurve
from sklearn.neighbors import KNeighborsClassifier
import scipy.stats as stat
import pylab 
from sklearn.pipeline import Pipeline
import math
import time
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

#import warnings
#import ipdb
#warnings.filterwarnings("error")
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer


def log_transform(x):
    return np.log(x + 1)

def reciprocal_transform(x):
    return 1/(x+1)

log_transformer = FunctionTransformer(log_transform)
recriprocal_transformer = FunctionTransformer(reciprocal_transform)

scalers = [   
    ["StandardScaler", StandardScaler()   ],
    ["MinMaxScaler", MinMaxScaler()   ],
    ["MaxAbsScaler", MaxAbsScaler()   ],
    ["RobustScaler", RobustScaler(quantile_range=(25, 75))   ],
    ["Yeo-johnson", PowerTransformer(method="yeo-johnson")   ],
    ["Box-cox", Pipeline(steps=[('s', MinMaxScaler(feature_range=(1, 2))),('p',  PowerTransformer(method='box-cox'))])   ],
   ##["Box-cox", MinMaxScaler().fit_transform(PowerTransformer(method="box-cox")   )],   
    #["Box-cox", PowerTransformer(method="box-cox")   ],   
    ["QuantileTransformer (uniform)", QuantileTransformer(output_distribution="uniform")   ],
    ["QuantileTransformer (gaussian)", QuantileTransformer(output_distribution="normal")   ],
    ["Normalizer", Normalizer()   ],
    ["Logarithmic", Pipeline(steps=[('s', MinMaxScaler()),('p',log_transformer) ])   ],
    ["Recriprocal", Pipeline(steps=[('s', MinMaxScaler()),('p',recriprocal_transformer) ])   ]
]

def filterFeaturesByMutualInformation(data, target, percentile): 
    selector = SP(mutual_info_classif, percentile=percentile)
    m = selector.fit(data, target)
    filtered_features = selector.transform(data)

    columns = np.asarray(data.columns.values)
    support = np.asarray(selector.get_support())
    columns_with_support = columns[support]
    filtered_features = pd.DataFrame(filtered_features, columns = columns_with_support)
    
    #print("shape before mutual_info_classif: {}".format(data.shape))
    #print("shape after mutual_info_classif: {}".format(filtered_features.shape))

    return filtered_features

def buildModel(scalers, df, target, percentile):

    lr_results = []
    knn_results = []

    for scaler in scalers:         
        df_scaled = scaler[1].fit_transform(df)
        df_scaled = pd.DataFrame(df_scaled, columns = df.columns)

        train_features, test_features, train_labels, test_labels = train_test_split(df_scaled, target,
                                                                                    random_state=0, stratify=target)
        for solver in ['liblinear', 'newton-cg', 'lbfgs', 'sag','saga']:
            lrclf = LogisticRegression(solver=solver,n_jobs = -1)
            lrclf = lrclf.fit(train_features, train_labels)

            y_predicted = lrclf.predict(test_features)
            lr_score = metrics.accuracy_score(y_predicted, test_labels)
            lr_results.append(["LogisticRegression",percentile, solver ,scaler[0], lr_score])

        scores = []
        Training_Accuracy = []
        Testing_Accuracy = []
       
        for i in range(5, 150, 5):
            for distance in [[1, "manhattan_distance (l1)"], [2,  "euclidean_distance (l2)"]]:
                knn = neighbors.KNeighborsClassifier(n_neighbors=i, p = distance[0])
                knn = knn.fit(train_features, train_labels)            
                y_predicted = knn.predict(test_features)           
                knn_score = metrics.accuracy_score(y_predicted, test_labels)     
                knn_results.append(["KNN - {}".format(distance[1]), percentile, i, scaler[0],knn_score ])
      
    top3_logisticRegression = sorted(lr_results, key = lambda x: x[4], reverse = True)[:3]  
    top3_knn = sorted(knn_results, key = lambda x: x[4], reverse = True)[:3]  
        
    return top3_logisticRegression + top3_knn

def buildModelCrossValidation(scalers, df, target, percentile):
    cv_score_lr = []
    cv_score_knn = []

    for scaler in scalers:
        for solver in ['liblinear', 'newton-cg', 'lbfgs', 'sag','saga']:
            pipe = Pipeline([('scaler', scaler[1]), ('lr', LogisticRegression(solver=solver))])                    
            score = cross_val_score(pipe, 
                                    X=df, 
                                    y=target,
                                    scoring= 'accuracy',
                                    cv=5).mean()

            cv_score_lr.append(["LogisticRegression", percentile, solver, scaler[0], score])
        
        for i in range(5, 150, 5):
            pipe = Pipeline([('scaler', scaler[1]), ('knn', neighbors.KNeighborsClassifier(n_neighbors=i))])                    
            score = cross_val_score(pipe, 
                                X=df, 
                                y=target,
                                scoring= 'accuracy',
                                cv=5).mean()
            cv_score_knn.append(["KNN", percentile, i, scaler[0],  score])

    top3_logisticRegression = sorted(cv_score_lr, key = lambda x: x[4], reverse = True)[:3]  
    top3_knn = sorted(cv_score_knn, key = lambda x: x[4], reverse = True)[:3]  
        
    return top3_logisticRegression + top3_knn


In [2]:
dfData = pd.read_csv("classification/2018_Financial_Data.csv")
dfData

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2019 PRICE VAR [%],Class
0,CMCSA,9.450700e+10,0.1115,0.000000e+00,9.450700e+10,0.000000e+00,6.482200e+10,7.549800e+10,1.900900e+10,3.542000e+09,...,0.2570,0.0000,0.3426,0.0722,0.7309,0.0000,0.1308,Consumer Cyclical,32.794573,1
1,KMI,1.414400e+10,0.0320,7.288000e+09,6.856000e+09,0.000000e+00,6.010000e+08,3.062000e+09,3.794000e+09,1.917000e+09,...,0.0345,-0.0920,-0.0024,0.0076,-0.0137,0.0000,-0.1265,Energy,40.588068,1
2,INTC,7.084800e+10,0.1289,2.711100e+10,4.373700e+10,1.354300e+10,6.750000e+09,2.042100e+10,2.331600e+10,-1.260000e+08,...,0.1989,0.0387,0.0382,0.1014,-0.0169,0.0390,-0.0942,Technology,30.295514,1
3,MU,3.039100e+10,0.4955,1.250000e+10,1.789100e+10,2.141000e+09,8.130000e+08,2.897000e+09,1.499400e+10,3.420000e+08,...,0.4573,0.1511,0.2275,0.6395,-0.5841,0.1738,0.0942,Technology,64.213737,1
4,GE,1.216150e+11,0.0285,9.546100e+10,2.615400e+10,0.000000e+00,1.811100e+10,4.071100e+10,-1.455700e+10,5.059000e+09,...,-0.2781,-0.2892,-0.1575,-0.4487,-0.2297,0.0000,0.0308,Industrials,44.757840,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4387,YRIV,0.000000e+00,0.0000,0.000000e+00,0.000000e+00,0.000000e+00,3.755251e+06,3.755251e+06,-3.755251e+06,1.105849e+07,...,0.0000,0.0000,-0.0508,-0.1409,-0.0152,0.0000,-0.2602,Real Estate,-90.962099,0
4388,YTEN,5.560000e+05,-0.4110,0.000000e+00,5.560000e+05,4.759000e+06,5.071000e+06,9.830000e+06,-9.274000e+06,0.000000e+00,...,0.3445,0.0000,-0.2323,-0.8602,0.0000,0.0352,-0.0993,Basic Materials,-77.922077,0
4389,ZKIN,5.488438e+07,0.2210,3.659379e+07,1.829059e+07,1.652633e+06,7.020320e+06,8.672953e+06,9.617636e+06,1.239170e+06,...,0.1605,0.7706,0.2489,0.4074,-0.0968,0.2415,0.8987,Basic Materials,-17.834400,0
4390,ZOM,0.000000e+00,0.0000,0.000000e+00,0.000000e+00,1.031715e+07,4.521349e+06,1.664863e+07,-1.664863e+07,0.000000e+00,...,0.8980,0.0000,0.1568,-0.2200,0.0000,2.7499,0.1457,Industrials,-73.520000,0


In [3]:
dfData.describe()

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,2019 PRICE VAR [%],Class
count,4346.0,4253.0,4207.0,4328.0,4155.0,4226.0,4208.0,4357.0,4208.0,4321.0,...,4067.0,4268.0,4160.0,4178.0,4121.0,4128.0,4133.0,4144.0,4392.0,4392.0
mean,5119287000.0,3.455278,3144946000.0,2043954000.0,118017600.0,900502200.0,1435546000.0,654120700.0,100135000.0,558443200.0,...,0.006081,36.768524,0.183066,1.389013,0.26253,9.928446,0.091891,0.15361,20.803948,0.693534
std,20495040000.0,195.504906,15088130000.0,7682369000.0,933089100.0,3661116000.0,5529831000.0,2969341000.0,378002100.0,2639327000.0,...,0.239653,2347.079237,4.688013,35.123904,5.612666,363.717734,0.823281,0.839647,82.622147,0.461078
min,-68941000.0,-3.4615,-2669055000.0,-1818220000.0,-104200000.0,-140159400.0,-4280000000.0,-14557000000.0,-1408252000.0,-21772000000.0,...,-1.0,-1.0,-1.0,-0.9991,-32.2581,-1.0,-1.0,-1.0,-99.864779,0.0
25%,65014250.0,0.0,3415500.0,36189030.0,0.0,20562260.0,42236440.0,-5510000.0,0.0,-10008000.0,...,0.0,-0.048075,0.0,-0.0367,-0.1086,-0.08285,0.0,-0.00465,-7.477173,0.0
50%,498264000.0,0.0749,174118000.0,221947000.0,0.0,93904500.0,180625300.0,42038000.0,5693500.0,27307000.0,...,0.0,0.0102,0.0,0.03475,0.0261,0.0,0.0,0.0657,17.639393,1.0
75%,2457878000.0,0.1885,1297814000.0,976701500.0,14501500.0,411716200.0,679604000.0,286269000.0,58170750.0,223881000.0,...,0.04205,0.1859,0.08005,0.160575,0.1384,0.115425,0.0097,0.167625,39.625879,1.0
max,500343000000.0,12739.0,373396000000.0,126947000000.0,28837000000.0,106510000000.0,106510000000.0,70898000000.0,9168000000.0,72903000000.0,...,4.0791,153332.3333,293.473,1184.9938,313.3958,17646.8235,36.8981,43.7188,3756.716345,1.0


In [4]:
le = LabelEncoder()
dfData["Sector"] = le.fit_transform(dfData['Sector'])
dfData.head()

dfData = dfData.rename(columns={"Unnamed: 0": "Symbol"})

features = dfData.drop(['Symbol', 'Class'], axis=1)
features = features.loc[:, ~features.columns.str.endswith('PRICE VAR [%]')]
target = dfData["Class"]

features = features.fillna(features.mean())

top_quantiles = features.quantile(0.97)
outliers_top = (features > top_quantiles)

low_quantiles = features.quantile(0.03)
outliers_low = (features < low_quantiles)

features = features.mask(outliers_top, top_quantiles, axis=1)
features = features.mask(outliers_low, low_quantiles, axis=1)
features

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,5Y Dividend per Share Growth (per Share),3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector
0,3.341463e+10,0.11150,0.000000e+00,1.578372e+10,0.000000e+00,6.518087e+09,1.009406e+10,5.147420e+09,699270000.0,4.334020e+09,...,0.186500,0.234800,0.2570,0.0000,0.3426,0.0722,0.7309,0.000000,0.130800,2
1,1.414400e+10,0.03200,7.288000e+09,6.856000e+09,0.000000e+00,6.010000e+08,3.062000e+09,3.794000e+09,699270000.0,2.196000e+09,...,-0.142100,-0.278500,0.0345,-0.0920,-0.0024,0.0076,-0.0137,0.000000,-0.126500,4
2,3.341463e+10,0.12890,1.840840e+10,1.578372e+10,4.882647e+08,6.518087e+09,1.009406e+10,5.147420e+09,0.0,4.334020e+09,...,0.059200,0.077200,0.1989,0.0387,0.0382,0.1014,-0.0169,0.039000,-0.094200,9
3,3.039100e+10,0.49550,1.250000e+10,1.578372e+10,4.882647e+08,8.130000e+08,2.897000e+09,5.147420e+09,342000000.0,4.334020e+09,...,0.000000,0.000000,0.4573,0.1511,0.2275,0.6395,-0.5841,0.173800,0.094200,9
4,3.341463e+10,0.02850,1.840840e+10,1.578372e+10,0.000000e+00,6.518087e+09,1.009406e+10,-1.792975e+08,699270000.0,-2.255230e+08,...,-0.140800,-0.261900,-0.2781,-0.2892,-0.1575,-0.4487,-0.2297,0.000000,0.030800,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4387,0.000000e+00,0.00000,0.000000e+00,0.000000e+00,0.000000e+00,3.755251e+06,4.373218e+06,-3.755251e+06,11058486.0,-1.482451e+07,...,-0.010214,0.000000,0.0000,0.0000,-0.0508,-0.1409,-0.0152,0.000000,-0.218608,8
4388,5.560000e+05,-0.39647,0.000000e+00,5.560000e+05,4.759000e+06,5.071000e+06,9.830000e+06,-9.274000e+06,0.0,-9.170000e+06,...,0.000000,0.000000,0.3445,0.0000,-0.2323,-0.8551,0.0000,0.035200,-0.099300,0
4389,5.488438e+07,0.22100,3.659379e+07,1.829059e+07,1.652633e+06,7.020320e+06,8.672953e+06,9.617636e+06,1239170.0,8.416324e+06,...,-0.010214,0.000000,0.1605,0.7706,0.2489,0.4074,-0.0968,0.241500,0.796654,0
4390,0.000000e+00,0.00000,0.000000e+00,0.000000e+00,1.031715e+07,4.521349e+06,1.664863e+07,-1.664863e+07,0.0,-1.664769e+07,...,-0.010214,0.006081,0.8980,0.0000,0.1568,-0.2200,0.0000,0.770423,0.145700,7


In [5]:
start_time = time.time()
results = []

for percentile in [5,10,50]:
    filtered_features = filterFeaturesByMutualInformation(features, target, percentile)
    print("Percentile: {}, features: {} --> Processing".format(percentile,len(filtered_features.columns)))
    
    results += buildModel(scalers,filtered_features, target,percentile)    

end_time = time.time()   
seconds  = end_time- start_time   

print("{} seconds".format(seconds) )    
print("{} minutes".format(seconds/60) )    

results_df = pd.DataFrame(results, columns = ["Classifier","Percentile","K/Solver", "Scaler", "Score"])
results_df = results_df.sort_values(['Score'], ascending=[False])
results_df

Percentile: 5, features: 12 --> Processing
Percentile: 10, features: 23 --> Processing
Percentile: 50, features: 111 --> Processing
483.05267548561096 seconds
8.050877924760183 minutes


Unnamed: 0,Classifier,Percentile,K/Solver,Scaler,Score
15,KNN - euclidean_distance (l2),50,125,QuantileTransformer (uniform),0.741348
17,KNN - euclidean_distance (l2),50,100,QuantileTransformer (uniform),0.740437
16,KNN - euclidean_distance (l2),50,65,QuantileTransformer (uniform),0.740437
9,KNN - manhattan_distance (l1),10,105,RobustScaler,0.739526
1,LogisticRegression,5,newton-cg,Yeo-johnson,0.737705
10,KNN - manhattan_distance (l1),10,75,RobustScaler,0.737705
2,LogisticRegression,5,lbfgs,Yeo-johnson,0.737705
0,LogisticRegression,5,liblinear,Yeo-johnson,0.737705
6,LogisticRegression,10,liblinear,QuantileTransformer (gaussian),0.735883
7,LogisticRegression,10,newton-cg,QuantileTransformer (gaussian),0.735883


In [8]:
results_df['Score'].mean()

0.717961006401225

In [6]:
local_scalers = [   
    ["StandardScaler", StandardScaler()   ],   
    ["Normalizer", Normalizer()   ]   ,
    ["QuantileTransformer (gaussian)", QuantileTransformer(output_distribution="normal")   ]
]

print("Cross validation")

start_time = time.time()
results = []

for percentile in [5,10,50]:
    filtered_features = filterFeaturesByMutualInformation(features, target, percentile)
    print("Percentile: {}, features: {} --> Processing".format(percentile,len(filtered_features.columns)))
    
    results += buildModelCrossValidation(scalers,filtered_features, target, percentile)

end_time = time.time()   
seconds  = end_time- start_time   

print(end_time)
print("{} seconds".format(seconds) )    
print("{} minutes".format(seconds/60) )    


results_df = pd.DataFrame(results, columns = ["Classifier","Percentile","K/Solver", "Scaler", "Score"])
results_df = results_df.sort_values(['Score'], ascending=[False])
results_df

Cross validation
Percentile: 5, features: 12 --> Processing
Percentile: 10, features: 23 --> Processing
Percentile: 50, features: 111 --> Processing
1640884377.2304502
1387.5205721855164 seconds
23.125342869758605 minutes


Unnamed: 0,Classifier,Percentile,K/Solver,Scaler,Score
12,LogisticRegression,50,newton-cg,Normalizer,0.723137
14,LogisticRegression,50,sag,Normalizer,0.723137
13,LogisticRegression,50,lbfgs,Normalizer,0.723137
6,LogisticRegression,10,liblinear,StandardScaler,0.720859
7,LogisticRegression,10,lbfgs,StandardScaler,0.720859
8,LogisticRegression,10,newton-cg,StandardScaler,0.720631
15,KNN,50,140,Normalizer,0.718352
16,KNN,50,85,Normalizer,0.718124
17,KNN,50,45,Normalizer,0.717897
9,KNN,10,95,QuantileTransformer (uniform),0.717669


In [7]:
results_df['Score'].mean()

0.717961006401225