In [232]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectPercentile as SP
from yellowbrick.model_selection import ValidationCurve, LearningCurve
from sklearn.neighbors import KNeighborsClassifier
import scipy.stats as stat
import pylab 
from sklearn.pipeline import Pipeline
import math
import time
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

#import warnings
#import ipdb
#warnings.filterwarnings("error")
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer


def log_transform(x):
    return np.log(x + 1)

def reciprocal_transform(x):
    return 1/(x+1)

log_transformer = FunctionTransformer(log_transform)
recriprocal_transformer = FunctionTransformer(reciprocal_transform)

scalers = [   
    ["StandardScaler", StandardScaler()   ],
    ["MinMaxScaler", MinMaxScaler()   ],
    ["MaxAbsScaler", MaxAbsScaler()   ],
    ["RobustScaler", RobustScaler(quantile_range=(25, 75))   ],
    ["Yeo-johnson", PowerTransformer(method="yeo-johnson")   ],
    ["Box-cox", Pipeline(steps=[('s', MinMaxScaler(feature_range=(1, 2))),('p',  PowerTransformer(method='box-cox'))])   ],
   ##["Box-cox", MinMaxScaler().fit_transform(PowerTransformer(method="box-cox")   )],   
    #["Box-cox", PowerTransformer(method="box-cox")   ],   
    ["QuantileTransformer (uniform)", QuantileTransformer(output_distribution="uniform")   ],
    ["QuantileTransformer (gaussian)", QuantileTransformer(output_distribution="normal")   ],
    ["Normalizer", Normalizer()   ],
    ["Logarithmic", Pipeline(steps=[('s', MinMaxScaler()),('p',log_transformer) ])   ],
    ["Recriprocal", Pipeline(steps=[('s', MinMaxScaler()),('p',recriprocal_transformer) ])   ]
]

def filterFeaturesByMutualInformation(percentile): 
    selector = SP(mutual_info_classif, percentile=percentile)
    m = selector.fit(data, target)
    filtered_features = selector.transform(data)

    columns = np.asarray(data.columns.values)
    support = np.asarray(selector.get_support())
    columns_with_support = columns[support]
    filtered_features = pd.DataFrame(filtered_features, columns = columns_with_support)
    
    #print("shape before mutual_info_classif: {}".format(data.shape))
    #print("shape after mutual_info_classif: {}".format(filtered_features.shape))

    return filtered_features

def buildModel(scalers, df, target, percentile):

    lr_results = []
    knn_results = []

    for scaler in scalers:         
        df_scaled = scaler[1].fit_transform(df)
        df_scaled = pd.DataFrame(df_scaled, columns = df.columns)

        train_features, test_features, train_labels, test_labels = train_test_split(df_scaled, target,
                                                                                    random_state=0, stratify=target)
        for solver in ['liblinear', 'newton-cg', 'lbfgs', 'sag','saga']:
            lrclf = LogisticRegression(solver=solver,n_jobs = -1)
            lrclf = lrclf.fit(train_features, train_labels)

            y_predicted = lrclf.predict(test_features)
            lr_score = metrics.accuracy_score(y_predicted, test_labels)
            lr_results.append(["LogisticRegression",percentile, solver ,scaler[0], lr_score])

        scores = []
        Training_Accuracy = []
        Testing_Accuracy = []
       
        for i in range(5, 150, 5):
            for distance in [[1, "manhattan_distance (l1)"], [2,  "euclidean_distance (l2)"]]:
                knn = neighbors.KNeighborsClassifier(n_neighbors=i, p = distance[0])
                knn = knn.fit(train_features, train_labels)            
                y_predicted = knn.predict(test_features)           
                knn_score = metrics.accuracy_score(y_predicted, test_labels)     
                knn_results.append(["KNN - {}".format(distance[1]), percentile, i, scaler[0],knn_score ])
      
    top3_logisticRegression = sorted(lr_results, key = lambda x: x[4], reverse = True)[:3]  
    top3_knn = sorted(knn_results, key = lambda x: x[4], reverse = True)[:3]  
        
    return top3_logisticRegression + top3_knn

def buildModelCrossValidation(scalers, df, target, percentile):
    cv_score_lr = []
    cv_score_knn = []

    for scaler in scalers:
        for solver in ['liblinear', 'newton-cg', 'lbfgs', 'sag','saga']:
            pipe = Pipeline([('scaler', scaler[1]), ('lr', LogisticRegression(solver=solver))])                    
            score = cross_val_score(pipe, 
                                    X=df, 
                                    y=target,
                                    scoring= 'accuracy',
                                    cv=5).mean()

            cv_score_lr.append(["LogisticRegression", percentile, solver, scaler[0], score])
        
        for i in range(5, 150, 5):
            pipe = Pipeline([('scaler', scaler[1]), ('knn', neighbors.KNeighborsClassifier(n_neighbors=i))])                    
            score = cross_val_score(pipe, 
                                X=df, 
                                y=target,
                                scoring= 'accuracy',
                                cv=5).mean()
            cv_score_knn.append(["KNN", percentile, i, scaler[0],  score])

    top3_logisticRegression = sorted(cv_score_lr, key = lambda x: x[4], reverse = True)[:3]  
    top3_knn = sorted(cv_score_knn, key = lambda x: x[4], reverse = True)[:3]  
        
    return top3_logisticRegression + top3_knn


In [233]:
dfData = pd.read_csv("classification/2016_Financial_Data.csv")
dfData

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2017 PRICE VAR [%],Class
0,PG,6.529900e+10,-0.0770,3.290900e+10,3.239000e+10,0.000000e+00,1.894900e+10,1.894900e+10,1.344100e+10,5.790000e+08,...,-0.0427,-0.0528,-0.0182,-0.0627,0.0083,0.0000,-0.0809,Consumer Defensive,12.532463,1
1,VIPS,8.148496e+09,0.3156,6.190740e+09,1.957756e+09,2.251378e+08,6.880959e+08,1.567877e+09,3.898789e+08,1.226710e+07,...,0.9234,0.0128,0.1706,0.5109,-0.0139,0.3575,0.3172,Consumer Defensive,4.363319,1
2,KR,1.098300e+11,0.0126,8.549600e+10,2.433400e+10,0.000000e+00,1.794600e+10,2.075800e+10,3.576000e+09,4.820000e+08,...,0.3697,0.0844,0.1115,0.2797,0.0416,0.0000,0.0457,Consumer Defensive,-17.068252,0
3,RAD,2.077024e+10,-0.2171,1.577826e+10,4.991979e+09,0.000000e+00,4.581171e+09,4.621042e+09,3.709370e+08,1.861320e+08,...,0.6322,-0.0645,0.2848,8.5628,0.2581,0.0000,-0.3158,Consumer Defensive,-75.916870,0
4,GIS,1.656310e+10,-0.0605,1.073360e+10,5.829500e+09,0.000000e+00,3.118900e+09,3.270300e+09,2.559200e+09,3.038000e+08,...,-0.0187,-0.0825,-0.0055,-0.0105,-0.0828,0.0000,-0.0628,Consumer Defensive,-1.162942,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4792,TSRI,6.099828e+07,0.0626,5.103888e+07,9.959402e+06,0.000000e+00,9.120526e+06,9.120526e+06,8.388760e+05,0.000000e+00,...,-0.1191,0.0000,0.0027,0.0441,0.0000,0.0000,0.0267,Technology,0.655807,1
4793,TZOO,1.142630e+08,-0.0782,1.385500e+07,1.004080e+08,9.096000e+06,8.112600e+07,9.022200e+07,1.018600e+07,0.000000e+00,...,-0.1209,0.0000,-0.2194,-0.1115,-1.0000,-0.2553,-0.0964,Technology,-35.500002,0
4794,USATP,7.700000e+07,,5.500000e+07,2.200000e+07,,2.200000e+07,2.300000e+07,-1.000000e+06,1.000000e+06,...,,,,,,,,Technology,14.840183,1
4795,WSTG,1.646090e+08,-0.5692,1.372780e+08,2.733100e+07,0.000000e+00,1.871500e+07,1.871500e+07,8.616000e+06,-3.180000e+05,...,0.4233,0.1894,0.2085,0.0012,0.0000,0.0000,0.0361,Technology,-2.854095,0


In [234]:
dfData.describe()

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,2017 PRICE VAR [%],Class
count,4308.0,4148.0,4157.0,4305.0,4084.0,4184.0,4177.0,4332.0,4177.0,4266.0,...,3667.0,4157.0,4072.0,4075.0,3924.0,4001.0,4025.0,4041.0,4797.0,4797.0
mean,4802586000.0,1.520193,3060024000.0,1824267000.0,100748400.0,830865100.0,1287227000.0,523926000.0,94516150.0,430979200.0,...,0.005076,0.812488,0.18501,0.583518,0.759701,2.593773,0.149195,0.252307,365.026,0.670836
std,32610580000.0,52.397806,27053050000.0,7913180000.0,734702400.0,3465465000.0,4930970000.0,2548440000.0,435506700.0,2190013000.0,...,0.282685,21.477336,3.848082,14.082608,91.44378,54.310889,3.591208,3.349447,19401.49,0.469959
min,-381927000.0,-12.7693,-2639031000.0,-3564190000.0,-86100000.0,-52077920.0,-935700000.0,-6770000000.0,-1107000000.0,-7621000000.0,...,-1.0,-1.0,-1.0,-1.0,-3963.1312,-1.0,-2.6622,-2.099,-99.99941,0.0
25%,52122250.0,-0.047275,2699000.0,26598820.0,0.0,16967750.0,34313000.0,-5629444.0,0.0,-9890840.0,...,0.0,-0.0493,0.0,-0.0407,-0.12295,-0.0801,0.0,-0.0277,-6.730768,0.0
50%,405975000.0,0.0323,142214000.0,176462000.0,0.0,75840500.0,150905000.0,30433000.0,3275789.0,19939000.0,...,0.0,0.006,0.0,0.0401,0.02565,0.0,0.0,0.0487,10.90686,1.0
75%,2056847000.0,0.160525,1106300000.0,800387000.0,12377500.0,350553000.0,594136000.0,224254800.0,47175000.0,165505500.0,...,0.04005,0.1991,0.0285,0.1467,0.12255,0.1249,0.0,0.1792,31.55634,1.0
max,1822805000000.0,3234.4568,1548384000000.0,274421000000.0,16085000000.0,97041000000.0,97041000000.0,60024000000.0,15176000000.0,61372000000.0,...,5.4005,1340.6957,217.0417,636.1569,3710.6667,2893.7368,224.8182,122.8957,1321281.0,1.0


In [235]:
le = LabelEncoder()
dfData["Sector"] = le.fit_transform(dfData['Sector'])
dfData.head()

dfData = dfData.rename(columns={"Unnamed: 0": "Symbol"})

features = dfData.drop(['Symbol', 'Class'], axis=1)
features = features.loc[:, ~features.columns.str.endswith('PRICE VAR [%]')]
target = dfData["Class"]

features = features.fillna(features.mean())

top_quantiles = features.quantile(0.97)
outliers_top = (features > top_quantiles)

low_quantiles = features.quantile(0.03)
outliers_low = (features < low_quantiles)

features = features.mask(outliers_top, top_quantiles, axis=1)
features = features.mask(outliers_low, low_quantiles, axis=1)
features

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,5Y Dividend per Share Growth (per Share),3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector
0,2.790084e+10,-0.077000,1.591400e+10,1.176168e+10,0.000000e+00,5.053756e+09,7.986360e+09,3.792815e+09,5.790000e+08,3.231474e+09,...,0.061700,0.051200,-0.042700,-0.05280,-0.018200,-0.062700,0.008300,0.000000,-0.080900,3
1,8.148496e+09,0.315600,6.190740e+09,1.957756e+09,2.251378e+08,6.880959e+08,1.567877e+09,3.898789e+08,1.226710e+07,3.799345e+08,...,0.003764,0.000000,0.923400,0.01280,0.170600,0.510900,-0.013900,0.357500,0.317200,3
2,2.790084e+10,0.012600,1.591400e+10,1.176168e+10,0.000000e+00,5.053756e+09,7.986360e+09,3.576000e+09,4.820000e+08,3.084000e+09,...,0.207200,0.265000,0.369700,0.08440,0.111500,0.279700,0.041600,0.000000,0.045700,3
3,2.077024e+10,-0.217100,1.577826e+10,4.991979e+09,0.000000e+00,4.581171e+09,4.621042e+09,3.709370e+08,1.861320e+08,2.149770e+08,...,0.000000,0.000000,0.632200,-0.06450,0.284800,1.379172,0.258100,0.000000,-0.279184,3
4,1.656310e+10,-0.060500,1.073360e+10,5.829500e+09,0.000000e+00,3.118900e+09,3.270300e+09,2.559200e+09,3.038000e+08,2.452600e+09,...,0.097100,0.104800,-0.018700,-0.08250,-0.005500,-0.010500,-0.082800,0.000000,-0.062800,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4792,6.099828e+07,0.062600,5.103888e+07,9.959402e+06,0.000000e+00,9.120526e+06,9.120526e+06,8.388760e+05,0.000000e+00,7.882260e+05,...,0.000000,-0.532172,-0.119100,0.00000,0.002700,0.044100,0.000000,0.000000,0.026700,9
4793,1.142630e+08,-0.078200,1.385500e+07,1.004080e+08,9.096000e+06,8.112600e+07,9.022200e+07,1.018600e+07,0.000000e+00,1.062300e+07,...,0.000000,0.000000,-0.120900,0.00000,-0.219400,-0.111500,-0.852660,-0.255300,-0.096400,9
4794,7.700000e+07,1.520193,5.500000e+07,2.200000e+07,1.007484e+08,2.200000e+07,2.300000e+07,-1.000000e+06,1.000000e+06,-7.000000e+06,...,0.003764,0.005076,0.812488,0.18501,0.583518,0.759701,2.593773,0.149195,0.252307,9
4795,1.646090e+08,-0.503920,1.372780e+08,2.733100e+07,0.000000e+00,1.871500e+07,1.871500e+07,8.616000e+06,0.000000e+00,8.933000e+06,...,0.012200,0.015200,0.423300,0.18940,0.208500,0.001200,0.000000,0.000000,0.036100,9


In [236]:
start_time = time.time()
results = []

for percentile in [5,10,50]:
    filtered_features = filterFeaturesByMutualInformation(percentile)
    print("Percentile: {}, features: {} --> Processing".format(percentile,len(filtered_features.columns)))
    
    results += buildModel(scalers,filtered_features, target,percentile)    

end_time = time.time()   
seconds  = end_time- start_time   

print("{} seconds".format(seconds) )    
print("{} minutes".format(seconds/60) )    

results_df = pd.DataFrame(results, columns = ["Classifier","Percentile","K/Solver", "Scaler", "Score"])
results_df = results_df.sort_values(['Score'], ascending=[False])
results_df

Percentile: 5, features: 11 --> Processing
Percentile: 10, features: 22 --> Processing
Percentile: 50, features: 110 --> Processing
572.3180232048035 seconds
9.538633720080059 minutes


Unnamed: 0,Classifier,Percentile,K/Solver,Scaler,Score
15,KNN - manhattan_distance (l1),50,25,QuantileTransformer (uniform),0.695833
16,KNN - manhattan_distance (l1),50,25,Normalizer,0.694167
9,KNN - manhattan_distance (l1),10,35,QuantileTransformer (uniform),0.6925
17,KNN - manhattan_distance (l1),50,15,Yeo-johnson,0.691667
3,KNN - manhattan_distance (l1),5,55,RobustScaler,0.691667
4,KNN - manhattan_distance (l1),5,65,RobustScaler,0.69
5,KNN - euclidean_distance (l2),5,145,RobustScaler,0.69
10,KNN - euclidean_distance (l2),10,120,Yeo-johnson,0.689167
11,KNN - euclidean_distance (l2),10,125,Yeo-johnson,0.688333
0,LogisticRegression,5,liblinear,QuantileTransformer (gaussian),0.684167


In [237]:
local_scalers = [   
    ["StandardScaler", StandardScaler()   ],   
    ["Normalizer", Normalizer()   ]   ,
    ["QuantileTransformer (gaussian)", QuantileTransformer(output_distribution="normal")   ]
]

print("Cross validation")

start_time = time.time()
results = []

for percentile in [5,10,50]:
    filtered_features = filterFeaturesByMutualInformation(percentile)
    print("Percentile: {}, features: {} --> Processing".format(percentile,len(filtered_features.columns)))
    
    results += buildModelCrossValidation(scalers,filtered_features, target, percentile)

end_time = time.time()   
seconds  = end_time- start_time   

print(end_time)
print("{} seconds".format(seconds) )    
print("{} minutes".format(seconds/60) )    


results_df = pd.DataFrame(results, columns = ["Classifier","Percentile","K/Solver", "Scaler", "Score"])
results_df = results_df.sort_values(['Score'], ascending=[False])
results_df

Cross validation
Percentile: 5, features: 11 --> Processing
Percentile: 10, features: 22 --> Processing
Percentile: 50, features: 110 --> Processing
1640650296.0711951
1603.5989246368408 seconds
26.726648743947347 minutes


Unnamed: 0,Classifier,Percentile,K/Solver,Scaler,Score
12,LogisticRegression,50,liblinear,MinMaxScaler,0.672086
13,LogisticRegression,50,sag,MinMaxScaler,0.671878
14,LogisticRegression,50,saga,MinMaxScaler,0.671878
6,LogisticRegression,10,liblinear,Recriprocal,0.671253
0,LogisticRegression,5,newton-cg,Logarithmic,0.671044
2,LogisticRegression,5,sag,Logarithmic,0.671044
1,LogisticRegression,5,lbfgs,Logarithmic,0.671044
7,LogisticRegression,10,newton-cg,Logarithmic,0.670836
8,LogisticRegression,10,lbfgs,Logarithmic,0.670836
3,KNN,5,115,RobustScaler,0.670221
