# Mileage Exercise

In [1334]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import auc,accuracy_score, mean_squared_error as mse, log_loss
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier, CatBoostRegressor
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder

In [1335]:
n_estimators = 30
xgb_reg = XGBRegressor(n_estimators= n_estimators)
lgbm_reg = LGBMRegressor(n_estimators=n_estimators)
cat_reg = CatBoostRegressor(iterations=n_estimators)
xgb_cls = XGBClassifier(n_estimators=n_estimators) #, objective='multi:logistic') binary:logistic
lgbm_cls = LGBMClassifier(n_estimators=n_estimators, metric= 'multi_logloss')
cat_cls = CatBoostClassifier(iterations=n_estimators) #classes_count= len(set(target))

In [1336]:
path_regerssion = './datasets/regression/'
path_classification = './datasets/classification/'

In [1337]:
# df_results = pd.DataFrame(columns = ['original_results', 'xgb_train', 'xgb_test', 'lgb_train', 'lgb_test', 'cat_train', 'cat_test'])

In [1338]:
# read all kinds of files in path
def read_file_by_kind(path):
    file_kind = file.split('.')[-1]
    if file_kind in ('csv', 'txt'):
        df = pd.read_csv(path + file)
    elif file_kind == 'arff':
        df = pd.DataFrame(arff.loadarff(path+file)[0])
    elif file_kind in ('data', 'dat', 'trn'):
        df = pd.read_table(path+file, delimiter=' ')
    elif file_kind in ('xls', 'xlsx'):
        df = pd.read_excel(path_classification + file)
    else:
        df= pd.DataFrame()
    return df

In [1339]:
# read all regression files
original_reg_df = []
for directory in os.listdir(path_regerssion):
    path = path_regerssion + directory +'/'
    for file in os.listdir(path):
        original_reg_df.append((read_file_by_kind(path), file))

In [1340]:
# read all classification files
original_cls_df = []
for file in os.listdir(path_classification):
    original_cls_df.append((read_file_by_kind(path_classification), file))

In [1341]:
# run all models (regresionqclassification) on df and return scores
def all_models_score(df, target, predict_kind, verbose= 0):
    # predict_kind- regression/classification
    
    score_train= [[],[],[]]
    score_test = [[],[],[]]
    kf = KFold(n_splits= 5, shuffle= True)
    
    # classification models
    if predict_kind == 'cls':
        num_classes = len(set(target.values.flatten()))
        # init cls models
        cat_cls = CatBoostClassifier(iterations= n_estimators, classes_count= num_classes, loss_function= 'MultiClass')
        xgb_cls = XGBClassifier(n_estimators=n_estimators)
        lgbm_cls = LGBMClassifier(n_estimators=n_estimators, metric= 'multi_logloss')
        
        #define models array to be classification
        models = [xgb_cls, lgbm_cls, cat_cls]
        models_name = ['xgb_cls', 'lgbm_cls', 'cat_cls']
        metric = log_loss
        
    # regression models
    else:
        #define models array to be regression
        models = [xgb_reg, lgbm_reg, cat_reg]
        models_name = ['xgb_reg', 'lgbm_reg', 'cat_reg']
        metric = mse
    
    # K FOlD (=5)
    for train_idx, test_idx in kf.split(df):

        # train test split (every fold)
        train, test, target_train, target_test =  df.iloc[train_idx].values, df.iloc[test_idx].values,\
                    target.iloc[train_idx].values, target.iloc[test_idx].values
        target_train = target_train.reshape(len(target_train),)
        
        # fit and predict    
        for i in range(len(models)):
            models[i].fit(train, target_train, verbose= False)
            if predict_kind == 'cls':
                prd_trn = models[i].predict_proba(train)
                prd_tst = models[i].predict_proba(test)
            else:
                prd_trn = models[i].predict(train)
                prd_tst = models[i].predict(test)
            
            score_train[i].append(metric(target_train,prd_trn))
            score_test[i].append(metric(target_test, prd_tst))
    
    # calculate mean scores
    means_train = [round(np.mean(scr),3) for scr in score_train]
    means_test = [round(np.mean(scr),3) for scr in score_test]

    # print scores (by function input)
    if verbose != 0:
        for i in range(len(models)):
            print('{} avg log_loss train: {:.3f}'.format(models_name[i], means_train[i]))
            print('{} avg log_loss test: {:.3f}'.format(models_name[i], means_test[i]))
        if predict_kind != 'cls':
            print('Target Variance is: {:.3f}'.format(np.std(target.values)**2))
            
    return means_train+means_test

In [1342]:
# init processed DF
reg_all_df = []
cls_all_df = []

In [1343]:
def my_categorize(df):
    # one hot (get_dummies) for categorical columns
    cols_to_convert= [col for col in df.columns[df.dtypes == object]]
    return pd.get_dummies(df, dummy_na= True ,columns= cols_to_convert)

In [1344]:
def get_all_scores():
    
    performance_df = pd.DataFrame(columns= ['xgb_train', 'lgbm_train', 'cat_train', 'xgb_test', 'lgbm_test', 'cat_test'])
    
    # regression scores
    for file_idx in range(len(reg_all_df)):
        performance_df.loc[original_reg_df[file_idx][1]] = all_models_score(reg_all_df[file_idx][0],\
                                                                            reg_all_df[file_idx][1], 'reg', verbose= 0)
        print(original_reg_df[file_idx][1])
    
    # classification scores
    for file_idx in range(len(cls_all_df)):
        performance_df.loc[original_cls_df[file_idx][1]] = all_models_score(cls_all_df[file_idx][0],\
                                                                            cls_all_df[file_idx][1], 'cls', verbose= 0)
        print(original_cls_df[file_idx][1])
        
    return performance_df

# Regression

### 0- autoHorse.arff

In [1345]:
df = original_reg_df[0][0]

In [1346]:
df.head(2)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,peak-rpm,city-mpg,highway-mpg,price,class
0,3.0,,b'alfa-romero',b'gas',b'std',2.0,b'convertible',b'rwd',b'front',88.6,...,130.0,b'mpfi',3.47,2.68,9.0,5000.0,21.0,27.0,13495.0,111.0
1,3.0,,b'alfa-romero',b'gas',b'std',2.0,b'convertible',b'rwd',b'front',88.6,...,130.0,b'mpfi',3.47,2.68,9.0,5000.0,21.0,27.0,16500.0,111.0


In [1347]:
df = df.dropna(axis=0)

In [1348]:
# separate Features and Target
target = df['price'].to_frame()
df = df.drop('price', axis=1)

In [1349]:
df = my_categorize(df)

In [1350]:
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1351]:
reg_all_df.append((df,target))

### 1- autoPrice.arff

In [1352]:
df = original_reg_df[1][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1353]:
# all_models_score(df, target, 'reg', original_reg_df[1][1])

In [1354]:
reg_all_df.append((df,target))

### 2- fishcatch.arff

In [1355]:
df = original_reg_df[2][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1356]:
df = my_categorize(df)

In [1357]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1358]:
# all_models_score(df, target, 'reg',original_reg_df[2][1])

In [1359]:
reg_all_df.append((df,target))

### 3- pbc.arff

In [1360]:
df = original_reg_df[3][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1361]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1362]:
df = my_categorize(df)

In [1363]:
# all_models_score(df, target, original_reg_df[3][1])

In [1364]:
reg_all_df.append((df, target))

### 4- pharynx.arff

In [1365]:
df = original_reg_df[4][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1366]:
df.dtypes

Inst          object
sex           object
Treatment     object
Grade         object
Age          float64
Condition     object
Site          object
T             object
N             object
Entry         object
Status        object
dtype: object

In [1367]:
df.head()

Unnamed: 0,Inst,sex,Treatment,Grade,Age,Condition,Site,T,N,Entry,Status
0,b'2',b'2',b'1',b'1',51.0,b'1',b'2',b'3',b'1',b'2468',b'1'
1,b'2',b'1',b'2',b'1',65.0,b'1',b'4',b'2',b'3',b'2968',b'1'
2,b'2',b'1',b'1',b'2',64.0,b'2',b'1',b'3',b'3',b'3368',b'1'
3,b'2',b'1',b'1',b'1',73.0,b'1',b'1',b'4',b'0',b'5768',b'1'
4,b'5',b'1',b'2',b'2',64.0,b'1',b'1',b'4',b'3',b'9568',b'1'


In [1368]:
# There are values that can't convert to int:
for col in df.columns[df.dtypes == object]:
    print(col,': ',set(df[col].values))

Inst :  {b'3', b'4', b'5', b'2', b'6', b'1'}
sex :  {b'2', b'1'}
Treatment :  {b'2', b'1'}
Grade :  {b'3', b'2', b'?', b'1'}
Condition :  {b'3', b'4', b'0', b'2', b'?', b'1'}
Site :  {b'4', b'2', b'1'}
T :  {b'3', b'4', b'2', b'1'}
N :  {b'3', b'2', b'0', b'1'}
Entry :  {b'30469', b'4970', b'20570', b'26371', b'17470', b'32171', b'17869', b'15971', b'9670', b'33568', b'13569', b'28770', b'15471', b'3368', b'3572', b'15468', b'8871', b'24872', b'5672', b'34170', b'34470', b'18468', b'28968', b'8272', b'2769', b'24569', b'1571', b'7771', b'16769', b'13269', b'4371', b'32468', b'1871', b'8072', b'13671', b'14372', b'22768', b'29868', b'34270', b'20768', b'6771', b'5472', b'16669', b'28969', b'2968', b'11070', b'21871', b'5972', b'33868', b'32371', b'16070', b'769', b'5169', b'29971', b'12371', b'29069', b'8270', b'15371', b'25470', b'21170', b'34771', b'7870', b'26669', b'32671', b'28069', b'5572', b'2671', b'14470', b'34271', b'7571', b'22772', b'10668', b'14369', b'35369', b'18371', b'2

In [1369]:
# clean b'?' that can't convert to float and converting to float
for col in df.columns[df.dtypes == object]:
    # clean b'?'
    if b'?' in set(df[col].values):
        index_to_drop = df[df[col] == b'?'].index[:]
        df = df.drop(index= index_to_drop)
    # convert byte to float
    df[col] = df[col].apply(lambda x: float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [1370]:
# all_models_score(df, target,'reg' ,original_reg_df[4][1])

In [1371]:
reg_all_df.append((df,target))

### 5- 'airfoil-self-noise.csv'

In [1372]:
df = original_reg_df[5][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1373]:
reg_all_df.append((df,target))

### 6- ASP.csv

In [1374]:
df = original_reg_df[6][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[1:-1]]

In [1375]:
df.dtypes

m7497.96    float64
m7496.04    float64
m7494.11    float64
m7492.18    float64
m7490.25    float64
m7488.32    float64
m7486.39    float64
m7484.46    float64
m7482.54    float64
m7480.61    float64
m7478.68    float64
m7476.75    float64
m7474.82    float64
m7472.89    float64
m7470.97    float64
m7469.04    float64
m7467.11    float64
m7465.18    float64
m7463.25    float64
m7461.32    float64
m7459.39    float64
m7457.47    float64
m7455.54    float64
m7453.61    float64
m7451.68    float64
m7449.75    float64
m7447.82    float64
m7445.89    float64
m7443.97    float64
m7442.04    float64
             ...   
m617.116    float64
m615.188    float64
m613.259    float64
m611.331    float64
m609.402    float64
m607.474    float64
m605.545    float64
m603.617    float64
m601.688    float64
m599.76     float64
BSAN        float64
BSAS        float64
BSAV        float64
CTI         float64
ELEV        float64
EVI         float64
LSTD        float64
LSTN        float64
REF1        float64


In [1376]:
# number of not numerical
sum(df.dtypes!='float64')

1

In [1377]:
np.argmax(df.dtypes!='float64')

  return getattr(obj, method)(*args, **kwds)


'Depth'

In [1378]:
## categorize to int
# df['Depth'] = pd.factorize(df['Depth'])[0]

In [1379]:
df = my_categorize(df)

In [1380]:
reg_all_df.append((df,target))

### 7- CASP

In [1381]:
df = original_reg_df[7][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1382]:
reg_all_df.append((df,target))

### 8- Concrete_Data_Yeh.csv

In [1383]:
df = original_reg_df[8][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1384]:
reg_all_df.append((df,target))

### 9 - house.csv

In [1385]:
df= original_reg_df[9][0]

# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1386]:
reg_all_df.append((df,target))

### 10- insurance.csv

In [1387]:
df = original_reg_df[10][0]

In [1388]:
# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1389]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [1390]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1391]:
df = my_categorize(df)

In [1392]:
reg_all_df.append((df,target))

### 11-  iowa-houseprice.csv

In [1393]:
df = original_reg_df[11][0]
# separate Features and Target
target = df[df.columns[-1]].to_frame()
df = df[df.columns[:-1]]

In [1394]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1395]:
df = my_categorize(df)

In [1396]:
reg_all_df.append((df,target))

### 12- Life-Expectancy-Data.csv

In [1397]:
df = original_reg_df[12][0]

In [1398]:
# clean from NAN
df = df.dropna()

In [1399]:
# separate Features and Target
target = df[df.columns[3]]
df = df.drop(df.columns[3], axis=1)

In [1400]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1401]:
df = my_categorize(df)

In [1402]:
reg_all_df.append((df,target))

### 13- OnlineNewsPopularity.csv

In [1403]:
df = original_reg_df[13][0]

# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1404]:
df = df.drop(columns='url')

In [1405]:
reg_all_df.append((df,target))

### 14- prolife.csv

In [1406]:
df = original_reg_df[14][0]

# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[[0,-1]], axis=1)

In [1407]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1408]:
df = my_categorize(df)

In [1409]:
reg_all_df.append((df,target))

### 15- servo.csv

In [1410]:
df = original_reg_df[15][0]

# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1411]:
df = my_categorize(df)

In [1412]:
# # categorize to int
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1413]:
reg_all_df.append((df,target))

### 16- student-math.csv

In [1414]:
df = original_reg_df[16][0]

In [1415]:
df.head(3)

Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;""F"";18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";17;""U"";""GT3"";""T"";1;1;""at_home"";""other"";..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."


we need split columns and data by ';' sign

In [1416]:
# split data and columns and insert to new df
splitted_data = df.apply(lambda x: ((x[0]).split(';')), axis=1)
data = [row for row in splitted_data]
df = pd.DataFrame(columns=df.columns[0].split(';'), data= data)

In [1417]:
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,"""F""",18,"""U""","""GT3""","""A""",4,4,"""at_home""","""teacher""",...,4,3,4,1,1,3,6,"""5""","""6""",6
1,GP,"""F""",17,"""U""","""GT3""","""T""",1,1,"""at_home""","""other""",...,5,3,3,1,1,3,4,"""5""","""5""",6
2,GP,"""F""",15,"""U""","""LE3""","""T""",1,1,"""at_home""","""other""",...,4,3,2,2,3,3,10,"""7""","""8""",10
3,GP,"""F""",15,"""U""","""GT3""","""T""",4,2,"""health""","""services""",...,3,2,2,1,1,5,2,"""15""","""14""",15
4,GP,"""F""",16,"""U""","""GT3""","""T""",3,3,"""other""","""other""",...,4,3,2,1,2,5,4,"""6""","""10""",10


In [1418]:
for col in df.columns:
    # drop ' " ' sign from string
    df[col] = df[col].apply(lambda x: x.replace('"',''))
    # convert str to int (what possible)
    df[col] = df[col].apply(lambda x: int(x) if x.isdigit() else x)

In [1419]:
# # categorize
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1420]:
df = my_categorize(df)

In [1421]:
# separate Features and Target
target = df['G3']
df = df.drop(columns='G3')

In [1422]:
reg_all_df.append((df,target))

### 17- winequality-white.csv

In [1423]:
df = original_reg_df[17][0]

In [1424]:
df.head(3)

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...


In [1425]:
# split data and columns and insert to new df
splitted_data = df.apply(lambda x: ((x[0]).split(';')), axis=1)
data = [row for row in splitted_data]
df = pd.DataFrame(columns=df.columns[0].split(';'), data= data)

In [1426]:
df.dtypes

fixed acidity             object
"volatile acidity"        object
"citric acid"             object
"residual sugar"          object
"chlorides"               object
"free sulfur dioxide"     object
"total sulfur dioxide"    object
"density"                 object
"pH"                      object
"sulphates"               object
"alcohol"                 object
"quality"                 object
dtype: object

In [1427]:
# convert str to float
for col in df.columns:
    df[col] = df[col].apply(float)

In [1428]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1429]:
reg_all_df.append((df,target))

### 18- 2dplane.data

We will read this data again, with 'delimiter':

In [1430]:
df = pd.read_table('./datasets/regression/data_files/2dplane.data', delimiter= '  ')

  """Entry point for launching an IPython kernel.


In [1431]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1432]:
# all_models_score(df, target,'aa')

In [1433]:
reg_all_df.append((df,target))

### 19- abalone.data

We will read this data again, with 'delimiter' of ',':

In [1434]:
df = pd.read_table('./datasets/regression/data_files/abalone.data', delimiter= ',')

In [1435]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1436]:
# #categorize
# df['M'] = pd.factorize(df['M'])[0]

In [1437]:
df = my_categorize(df)

In [1438]:
reg_all_df.append((df,target))

.

# CLASSIFICATION

.

### 0- adult.data

In [1439]:
# read data with delimiter ','
df = pd.read_table(path_classification+'adult.data' , delimiter=',')

In [1440]:
df.head(2)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [1441]:
df.columns

Index(['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0',
       ' 40', ' United-States', ' <=50K'],
      dtype='object')

In [1442]:
columns_convert_to_int = df.columns[[0,2,4,10,11,12]]

In [1443]:
# convert to int
for col in columns_convert_to_int:
    df[col] = df[col].apply(int)

In [1444]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1445]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1446]:
df = my_categorize(df)

In [1447]:
df.dtypes

39                                            int64
 77516                                        int64
 13                                           int64
 2174                                         int64
 0                                            int64
 40                                           int64
 State-gov_ ?                                 uint8
 State-gov_ Federal-gov                       uint8
 State-gov_ Local-gov                         uint8
 State-gov_ Never-worked                      uint8
 State-gov_ Private                           uint8
 State-gov_ Self-emp-inc                      uint8
 State-gov_ Self-emp-not-inc                  uint8
 State-gov_ State-gov                         uint8
 State-gov_ Without-pay                       uint8
 State-gov_nan                                uint8
 Bachelors_ 10th                              uint8
 Bachelors_ 11th                              uint8
 Bachelors_ 12th                              uint8
 Bachelors_ 

In [1448]:
# all_models_score(df, target, 'cls','gguu')

In [1449]:
cls_all_df.append((df,target))

### 1- breast-cancer-wisconsin.csv

In [1450]:
df = original_cls_df[1][0]

In [1451]:
# separate Features and Target
target = df[df.columns[1]]
df = df.drop(df.columns[[0,1]], axis=1)

In [1452]:
target = pd.DataFrame((pd.factorize(target))[0])

In [1453]:
# all_models_score(df, target, 'cls','gguu')

In [1454]:
cls_all_df.append((df,target))

### 2- car.data

we will read the data with delimiter:

In [1455]:
df = pd.read_table(path_classification+'car.data', delimiter=',')

In [1456]:
for col in df.columns:
    print(set(df[col]))

{'vhigh', 'med', 'high', 'low'}
{'vhigh', 'med', 'high', 'low'}
{'3', '4', '2', '5more'}
{'more', '4', '2'}
{'small', 'med', 'big'}
{'med', 'high', 'low'}
{'vgood', 'good', 'unacc', 'acc'}


In [1457]:
# # categorize
# for col in df.columns[df.dtypes == object]:
#     df[col] = pd.factorize(df[col])[0]

In [1458]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1459]:
target = pd.DataFrame((pd.factorize(target))[0])

In [1460]:
df = my_categorize(df)

In [1461]:
cls_all_df.append((df,target))

In [1462]:
# all_models_score(df, target,'cls','fff')

### 3- cmc.data

In [1463]:
df = pd.read_table(path_classification + 'cmc.data', delimiter=',')

In [1464]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1465]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1466]:
# all_models_score(df, target, 'cls','aaaaa')

In [1467]:
cls_all_df.append((df,target))

### 4- crx.data

In [1468]:
df = pd.read_table(path_classification + 'crx.data', delimiter=',')

In [1469]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1470]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1471]:
df = my_categorize(df)

In [1472]:
# all_models_score(df, target,'cls','ccc')

In [1473]:
cls_all_df.append((df,target))

### 5- CTG.xls

read specific excel sheet:

In [1474]:
df = pd.read_excel(path_classification + 'CTG.xls', sheet_name= 2)

In [1475]:
# cleaning data
df = df.iloc[1:]
df = df.drop(columns= ['FileName', 'Date', 'SegFile'])
df = df.dropna()

In [1476]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1477]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1478]:
# all_models_score(df, target,'cls','gg')

In [1479]:
cls_all_df.append((df,target))

### 6- data-banknote-authentication.txt

In [1480]:
df = original_cls_df[6][0]

In [1481]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1482]:
# all_models_score(df, target,'cls','gg')

In [1483]:
cls_all_df.append((df,target))

### 7- Data-Cortex-Nuclear

In [1484]:
df = original_cls_df[7][0]

In [1485]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1486]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1487]:
df = my_categorize(df)

In [1488]:
# all_models_score(df, target, 'cls', 'ff') # work well, but it's heavy...............

In [1489]:
cls_all_df.append((df,target))

### 8- flare.csv

In [1490]:
df = original_cls_df[8][0]

In [1491]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1492]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1493]:
df = my_categorize(df)

In [1494]:
# all_models_score(df, target, 'cls', 'fffff')

In [1495]:
cls_all_df.append((df,target))

### 9- Frogs-MFCCs.csv

In [1496]:
df = original_cls_df[9][0]

In [1497]:
# merge the multiple labels
target = df.apply(lambda x: x['Family']+' '+x['Genus']+' '+x['Species'] , axis= 1)
df = df.drop(columns=['Family', 'Genus', 'Species'])

# clean unrellevant data
df = df.drop(columns=['RecordID'])

In [1498]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1499]:
# all_models_score(df, target, 'cls', 'fffff')

In [1500]:
cls_all_df.append((df,target))

### 10- heart.dat

In [1501]:
df = original_cls_df[10][0]

In [1502]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1503]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1504]:
# all_models_score(df, target, 'cls', 'fffff')

In [1505]:
cls_all_df.append((df,target))

### 11- messidor-features.arff

In [1506]:
df = original_cls_df[11][0]

In [1507]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1508]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1509]:
# all_models_score(df, target, 'cls', 'qq')

In [1510]:
cls_all_df.append((df,target))

### 12- MEU-Mobile KSD 2016.xlsx

In [1511]:
df= original_cls_df[12][0]

# clean empty rows 
df = df.iloc[4:]

# PARTIAL DATA!!

In [1512]:
#Too much Heavy for CATBoost 
choose_partial_data = 800 
df = df.iloc[:choose_partial_data]

In [1513]:
# edit df
new_df = pd.DataFrame()
for col in df.columns:
    new_df[col] = df[col].tolist()

In [1514]:
# clean Nan
new_df = new_df.dropna(axis=0)

In [1515]:
# separate Features and Target
new_target = new_df[new_df.columns[0]]
new_df = new_df.drop(new_df.columns[0], axis=1)

In [1516]:
new_target = pd.DataFrame(pd.factorize(new_target)[0])

In [1517]:
# all_models_score(new_df, target, 'cls', 'qqq')

In [1518]:
cls_all_df.append((new_df,new_target))

### 13- nursery.data

In [1519]:
df= pd.read_table(path_classification+'nursery.data', delimiter=',')

In [1520]:
df.head()

Unnamed: 0,usual,proper,complete,1,convenient,convenient.1,nonprob,recommended,recommend
0,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
1,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
2,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
3,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
4,usual,proper,complete,1,convenient,convenient,slightly_prob,not_recom,not_recom


In [1521]:
df.dtypes

usual           object
proper          object
complete        object
1               object
convenient      object
convenient.1    object
nonprob         object
recommended     object
recommend       object
dtype: object

In [1522]:
# separate Features and Target
target = df[df.columns[1]]
df = df.drop(df.columns[1], axis=1)

In [1523]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1524]:
df = my_categorize(df)

In [1525]:
# all_models_score(df, target, 'cls', 'qqq')

In [1526]:
cls_all_df.append((df,target))

### 14- parkinson.txt

In [1527]:
df = original_cls_df[14][0]

In [1528]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1529]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1530]:
# all_models_score(df, target, 'cls', 'qqq')

In [1531]:
cls_all_df.append((df,target))

### 15- pendigits-train.txt

In [1532]:
df = original_cls_df[15][0]

In [1533]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1534]:
# all_models_score(df, target, 'cls', 'qqq')

In [1535]:
cls_all_df.append((df,target))

### 16- PhishingData.arff

In [1536]:
df = original_cls_df[16][0]

In [1537]:
for col in df.columns:
    temp = set(df[col].tolist())
    if len(temp)<20:
        print(col, ':' ,set(df[col].tolist()))

SFH : {b'-1', b'0', b'1'}
popUpWidnow : {b'-1', b'0', b'1'}
SSLfinal_State : {b'-1', b'0', b'1'}
Request_URL : {b'-1', b'0', b'1'}
URL_of_Anchor : {b'-1', b'0', b'1'}
web_traffic : {b'-1', b'0', b'1'}
URL_Length : {b'-1', b'0', b'1'}
age_of_domain : {b'-1', b'1'}
having_IP_Address : {b'0', b'1'}
Result : {b'-1', b'0', b'1'}


In [1538]:
# separate Features and Target
target = df[df.columns[1]]
df = df.drop(df.columns[1], axis=1)

In [1539]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1540]:
df = my_categorize(df)

In [1541]:
# all_models_score(df, target, 'cls', 'wqq')

In [1542]:
cls_all_df.append((df,target))

## This was problematic data, don't insert

### 17#- shuttle.trn

In [1543]:
# df = original_cls_df[17___][0]

In [1544]:
# # separate Features and Target
# target = df[df.columns[-1]]
# df = df.drop(df.columns[-1], axis=1)

In [1545]:
# target = pd.DataFrame(pd.factorize(target)[0])

In [1546]:
# target[0].value_counts()

In [1547]:
# all_models_score(df, target, 'cls', 0)

In [1548]:
# cls_all_df.append((df,target))

### 17- titanic-train.csv

In [1549]:
df = original_cls_df[17][0]

In [1550]:
df = df.copy().drop(columns= ['Name', 'Cabin', 'Ticket'])

In [1551]:
df = df.dropna()

In [1552]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1553]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1554]:
df = my_categorize(df)

In [1555]:
# all_models_score(df, target, 'cls', 'wwqq')

In [1556]:
cls_all_df.append((df,target))

### 18- voice-gender.csv

In [1557]:
df = original_cls_df[18][0]

In [1558]:
# separate Features and Target
target = df[df.columns[-1]]
df = df.drop(df.columns[-1], axis=1)

In [1559]:
target = pd.DataFrame(pd.factorize(target)[0])

In [1560]:
# all_models_score(df, target, 'cls', 'wwwqq')

In [1561]:
cls_all_df.append((df,target))

# .
# Get All Scores:
# .

In [1562]:
final_df = get_all_scores()

autoHorse.arff
autoPrice.arff
fishcatch.arff
pbc.arff
pharynx.arff
airfoil-self-noise.csv
ASP.csv
CASP.csv
Concrete_Data_Yeh.csv
house.csv
insurance.csv
iowa-houseprice.csv
Life-Expectancy-Data.csv
OnlineNewsPopularity.csv
prolife.csv
servo.csv
student-math.csv
winequality-white.csv
2dplane.data
abalone.data
adult.data
breast-cancer-wisconsin.csv
car.data
cmc.data
crx.data
CTG.xls
data-banknote-authentication.txt
Data-Cortex-Nuclear.xls
flare.csv
Frogs-MFCCs.csv
heart.dat
messidor-features.arff
MEU-Mobile KSD 2016.xlsx
nursery.data
parkinson.txt
pendigits-train.txt
PhishingData.arff
titanic-train.csv
voice-gender.csv


In [1563]:
final_df

Unnamed: 0,xgb_train,lgbm_train,cat_train,xgb_test,lgbm_test,cat_test
autoHorse.arff,1552893.0,4929739.0,46399610.0,5356159.0,8714213.0,50409270.0
autoPrice.arff,1554388.0,4940683.0,46543170.0,5184658.0,6996745.0,50717430.0
fishcatch.arff,2471.42,17085.32,85497.99,6723.251,23018.82,90975.82
pbc.arff,421227.5,332774.0,1628693.0,815840.5,807035.2,1691598.0
pharynx.arff,54826.5,79789.34,198311.5,120980.3,115753.4,222760.2
airfoil-self-noise.csv,42.546,4.303,2683.096,44.006,6.021,2692.469
ASP.csv,0.112,0.038,0.414,0.188,0.164,0.44
CASP.csv,25.311,19.268,39.293,25.611,20.134,39.36
Concrete_Data_Yeh.csv,40.235,15.404,379.777,49.587,31.12,388.177
house.csv,1307570000.0,755594700.0,2190107000.0,1401440000.0,1065745000.0,2219377000.0


In [1572]:
final_df.to_csv('yoel_results.csv')