In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier 
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier
import warnings            
warnings.filterwarnings("ignore")

In [2]:
def prepare_data(path='M30_EURUSD.csv'):
    """Read the indicator CSV and split its columns into three positional groups.

    Parameters
    ----------
    path : str or path-like, default 'M30_EURUSD.csv'
        CSV file to load. The default keeps the original zero-argument call working.

    Returns
    -------
    df : pandas.DataFrame
        Columns at positions 5..429 (the feature/indicator block).
    df_first : pandas.DataFrame
        Columns at positions 1..4.
    df_result : pandas.DataFrame
        Columns from position 430 up to, but not including, the last column.
    """
    # Read the raw file in one pass (low_memory=False avoids chunked dtype inference).
    data = pd.read_csv(path, low_memory=False)
    # Cells holding the "?" placeholder become NaN. Rows are NOT dropped here;
    # missing values are still present in the returned frames.
    data = data.replace("?", np.nan)
    # Split the columns by position.
    df = data.iloc[:, 5:430]
    df_first = data.iloc[:, 1:5]
    df_result = data.iloc[:, 430:-1]
    return df, df_first, df_result

In [3]:
# parametrik fonk. tanımlaması
# 1 : 
# 2 :

# Default list of columns that are treated as categorical and one-hot encoded.
_CATEGORICAL_COLUMNS = [
    'ind_7','ind_11','ind_24','ind_38','ind_54','ind_57','ind_60','ind_63','ind_66','ind_69','ind_72','ind_75',
    'ind_78','ind_81','ind_84','ind_87','ind_89','ind_91','ind_93','ind_95','ind_97','ind_99','ind_101',
    'ind_103','ind_105','ind_107','ind_109', 'ind_111', 'ind_113', 'ind_115','ind_138','ind_141','ind_144',
    'ind_157','ind_159','ind_161','ind_163','ind_165','ind_167','ind_169','ind_171','ind_173','ind_175',
    'ind_177','ind_182','ind_184','ind_187','ind_190','ind_193','ind_196','ind_199','ind_202','ind_205',
    'ind_208','ind_211','ind_213','ind_384','ind_386','ind_388','ind_390',
]


def kategorikleri_dummy_yap(df, cat_column_names=None):
    """Dummy-encode the categorical columns and mean-impute the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing both the categorical and the numeric columns.
    cat_column_names : list of str, optional
        Columns to one-hot encode. Defaults to the notebook's built-in list.

    Returns
    -------
    df_all : pandas.DataFrame
        Imputed numeric columns followed by the dummy columns.
    df_noncategoric : pandas.DataFrame
        Numeric columns cast to float64 with NaN replaced by the column mean.
        Columns are labelled positionally (0..k-1), as in the original version.
    dms : pandas.DataFrame
        Dummy columns, without those whose name contains ``_NONE`` or ``_RED``.
    """
    if cat_column_names is None:
        cat_column_names = _CATEGORICAL_COLUMNS
    cat_column_names = list(cat_column_names)

    # One-hot encode the categorical columns, then discard the NONE / RED levels.
    dms = pd.get_dummies(df[cat_column_names])
    unwanted = dms.columns[dms.columns.astype(str).str.contains('_NONE|_RED')]
    dms = dms.drop(columns=unwanted)

    # Everything else is numeric: cast it and fill NaN with the column mean.
    numeric = df.drop(columns=cat_column_names).astype("float64")
    col_means = numeric.mean()
    # The removed sklearn ``Imputer`` discarded columns with no observed value
    # at all; keep that behaviour so the output width is unchanged.
    numeric = numeric.loc[:, col_means.notna().to_numpy()]
    numeric = numeric.fillna(col_means)

    # Re-label columns positionally (the old numpy round trip did the same), but
    # keep df's index so the concat below aligns row-for-row with ``dms``.
    df_noncategoric = pd.DataFrame(numeric.to_numpy(), index=df.index)
    df_all = pd.concat([df_noncategoric, dms], axis=1)
    return df_all, df_noncategoric, dms

In [4]:
# 1.1 - dropping correlated columns
def corr_df(df, corr_val):
    """Return ``df`` without the columns that are highly correlated with an earlier column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to prune. It is not modified; a new frame is returned.
    corr_val : float
        Absolute-correlation threshold. A column is dropped when its absolute
        correlation with any column to its left is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the offending columns removed.
    """
    # Use the frame that was passed in, not a notebook global.
    corr_matrix = df.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and the
    # diagonal (self-correlation) is ignored. ``bool`` replaces the removed ``np.bool``.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Columns whose correlation with any earlier column exceeds the threshold.
    to_high = [column for column in upper.columns if (upper[column] > corr_val).any()]
    return df.drop(columns=to_high)

In [5]:
# 1.2 RandomForest
# Fits a random forest on all variables and returns the variables whose
# importance is greater than the given threshold.
def rand_forest(X, y, imp_value, random_state=None):
    """Select the columns of ``X`` whose random-forest importance exceeds ``imp_value``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; its column labels index the importance table.
    y : array-like
        Target. A single-column 2-D target is flattened to 1-D before fitting.
    imp_value : float
        Threshold on the importance expressed in percent (importance * 100).
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. ``None`` keeps the original
        unseeded behaviour; pass an int to make the selection reproducible.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``X``, ordered by ascending importance.
    """
    target = y
    # A one-column frame is a column vector; hand the estimator a flat array.
    if getattr(target, "ndim", 1) == 2 and target.shape[1] == 1:
        target = np.ravel(target)

    rf_model = RandomForestClassifier(random_state=random_state).fit(X, target)
    Importance = pd.DataFrame({'Importance': rf_model.feature_importances_ * 100}, index=X.columns)
    imp_values = Importance.sort_values(by='Importance', axis=0, ascending=True)
    imp_values = imp_values[imp_values['Importance'] > imp_value]
    return X[imp_values.index]

In [6]:
# 1.3 - pca
def pca_fon(X, threshold):
    """Project standardized ``X`` onto the leading principal components.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame; its index is reused for the result.
    threshold : float
        Target share of explained variance as a fraction (e.g. 0.99).

    Returns
    -------
    pandas.DataFrame
        The first ``num_var`` principal-component scores, indexed like ``X``.
        ``num_var`` is the smallest number of components whose cumulative
        explained-variance ratio reaches ``threshold``, capped at the number
        of components available.
    """
    pca = PCA()
    X_pca = pca.fit_transform(scale(X))
    # Accumulate the exact ratios. Rounding every ratio to 4 decimals first (as
    # the previous version did) lets the rounding error add up over components.
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    num_var = int(np.sum(cumulative < threshold)) + 1
    # A threshold the cumulative sum never reaches must not report more
    # components than actually exist.
    num_var = min(num_var, X_pca.shape[1])
    print('pca sonrası değişken sayısı: ',num_var)
    X_pcad = pd.DataFrame(X_pca[:, 0:num_var], index=X.index)
    return X_pcad

In [7]:
def splitting(X, y, test_size):
    """Split ``X`` and ``y`` into train and test parts without shuffling.

    Rows keep their original order (``shuffle=False``), so the test part is
    the trailing ``test_size`` share of the data.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, shuffle=False, test_size=test_size)
    return tuple(parts)

In [8]:
# 2.1.1 - multinomial logistic regression
def multi_logit(X_train, X_test, y_train, y_test):
    """Fit a multinomial logistic regression and print its test-set metrics.

    Prints the accuracy, the confusion matrix and the classification report
    for the predictions on ``X_test``. Returns nothing.
    """
    rule = '-------------------------------'
    classifier = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
    predicted = classifier.fit(X_train, y_train).predict(X_test)

    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)

    print("Accuracy: ", score)
    print(rule)
    print("Counfusion matrix: \n", matrix)
    print(rule)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [9]:
# 2.1.2 - decision tree
def dec_tree(X_train, X_test, y_train, y_test):
    """Fit a default decision tree classifier and print its test-set metrics.

    Prints the accuracy, the confusion matrix and the classification report
    for the predictions on ``X_test``. Returns nothing.
    """
    fitted_tree = DecisionTreeClassifier().fit(X_train, y_train)
    predicted = fitted_tree.predict(X_test)

    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)
    report = classification_report

    divider = '-------------------------------'
    print("Accuracy: ", score)
    print(divider)
    print("Counfusion matrix: \n", matrix)
    print(divider)
    print('Classification report')
    print(report(y_test, predicted))

In [10]:
def grad_boost(X_train, X_test, y_train, y_test):
    """Fit a default ``GradientBoostingClassifier`` and print its test-set metrics.

    Prints a banner, then the accuracy, the confusion matrix and the
    classification report for the predictions on ``X_test``. Returns nothing.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    print('grad_boost----------------')
    booster = GradientBoostingClassifier()
    booster = booster.fit(X_train, y_train)
    predicted = booster.predict(X_test)

    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)

    line = '-------------------------------'
    print("Accuracy: ", score)
    print(line)
    print("Counfusion matrix: \n", matrix)
    print(line)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [11]:
def xgb_boost(X_train, X_test, y_train, y_test):
    """Fit a default ``XGBClassifier`` and print its test-set metrics.

    Recent XGBoost releases only accept class labels encoded as 0..k-1, so
    string labels (e.g. 'BUY' / 'NONE' / 'SELL') are encoded before fitting and
    the predictions are decoded back. The printed metrics therefore use the
    original label values.

    Prints a banner, then the accuracy, the confusion matrix and the
    classification report for the predictions on ``X_test``. Returns nothing.
    """
    from sklearn.preprocessing import LabelEncoder
    from xgboost import XGBClassifier
    print('xgb_boost----------------')

    # Learn the label <-> integer mapping from the training target only.
    train_labels = np.ravel(y_train)
    encoder = LabelEncoder().fit(train_labels)

    xgb_model = XGBClassifier().fit(X_train, encoder.transform(train_labels))
    # The model predicts encoded class ids; map them back to the original labels.
    encoded_pred = np.asarray(xgb_model.predict(X_test)).astype(int).ravel()
    y_pred = encoder.inverse_transform(encoded_pred)

    confusion_mat = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: ",accuracy)
    print('-------------------------------')
    print("Counfusion matrix: \n",confusion_mat)
    print('-------------------------------')
    print('Classification report')
    print(classification_report(y_test, y_pred))

In [12]:
def lightGBM(X_train, X_test, y_train, y_test):
    """Fit an ``LGBMClassifier`` (``verbose=-1``) and print its test-set metrics.

    Prints a banner, then the accuracy, the confusion matrix and the
    classification report for the predictions on ``X_test``. Returns nothing.
    """
    from lightgbm import LGBMClassifier

    print('lightGBM----------------')
    learner = LGBMClassifier(verbose=-1)
    predicted = learner.fit(X_train, y_train).predict(X_test)

    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)

    bar = '-------------------------------'
    print("Accuracy: ", score)
    print(bar)
    print("Counfusion matrix: \n", matrix)
    print(bar)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [13]:
def catBoost(X_train, X_test, y_train, y_test):
    """Fit a default ``CatBoostClassifier`` and print its test-set metrics.

    Prints a banner, then the accuracy, the confusion matrix and the
    classification report for the predictions on ``X_test``. Returns nothing.
    """
    from catboost import CatBoostClassifier

    print('CatBoost----------------')
    fitted = CatBoostClassifier().fit(X_train, y_train)
    predicted = fitted.predict(X_test)

    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)

    sep_line = '-------------------------------'
    print("Accuracy: ", score)
    print(sep_line)
    print("Counfusion matrix: \n", matrix)
    print(sep_line)
    print('Classification report')
    print(classification_report(y_test, predicted))

In [14]:
# 2.1.3 - Boosting
def boostings(X_train, X_test, y_train, y_test):
    """Run the enabled boosting models one after another on the same split.

    Currently gradient boosting and XGBoost are run; the LightGBM and
    CatBoost runs are disabled.
    """
    split = (X_train, X_test, y_train, y_test)
    grad_boost(*split)
    xgb_boost(*split)
    # Disabled: lightGBM(*split), catBoost(*split)

In [15]:
# Load the data. df: the feature columns (everything except the leading columns and the
# result columns); df_first: the leading columns (prepare_data slices positions 1..4);
# df_result: the result columns.
df, df_first, df_result = prepare_data()

In [35]:
# The target y is the most frequent value across the result columns of each row.
# mode(axis=1) can return several columns when there are ties; column 0 (the first mode) is kept.
# desc = y.T.describe(include='all')
y_max1 = df_result.mode(axis=1)
y = pd.DataFrame(y_max1[0])
y.head()

Unnamed: 0,0
0,BUY
1,BUY
2,BUY
3,BUY
4,BUY


In [17]:
# Feature set 1.1 (from the correlation filter): the imputed numeric columns after
# corr_df with threshold 0.50, joined column-wise with the leading columns and the dummies.
df_all, df_noncategoric, dms = kategorikleri_dummy_yap(df)
df_noncorr = corr_df(df_noncategoric, 0.50)
X1_1 = pd.concat([df_first, df_noncorr, dms], axis=1)


In [18]:
# Feature set 1.2 (from random-forest importance).
# Notes:
# 1 - Iterating over several y columns could be tried. The original note says the
#     '220_signal' column was chosen as y.
#     NOTE(review): y is whatever an earlier cell assigned — confirm which target is meant.
# 2 - The importance threshold was set to 0.05; it could be tuned with cross-validation.

X_raw = pd.concat([df_first,df_all], axis=1) 
X1_2 = rand_forest(X_raw, y, 0.05)    

In [19]:
# Feature set 1.3 (from PCA): leading columns plus all encoded features, reduced by
# pca_fon with a 0.99 threshold. The last line displays the shapes before and after.
X_raw2 = pd.concat([df_first,df_all], axis=1) 
X1_3 = pca_fon(X_raw2, 0.99)
X_raw2.shape, X1_3.shape

pca sonrası değişken sayısı:  201


((27994, 485), (27994, 201))

In [28]:
# Multinomial logistic regression on feature set X1_1, with a test_size of 0.10.
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.10)
print('X1_1 için multi log')
multi_logit(X_train1, X_test1, y_train, y_test)
print('*************************************************************************************')

X1_1 için multi log
Accuracy:  0.35535714285714287
-------------------------------
Counfusion matrix: 
 [[382   0 473]
 [338   0 161]
 [833   0 613]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.25      0.45      0.32       855
        NONE       0.00      0.00      0.00       499
        SELL       0.49      0.42      0.46      1446

    accuracy                           0.36      2800
   macro avg       0.25      0.29      0.26      2800
weighted avg       0.33      0.36      0.33      2800

*************************************************************************************


In [29]:
# Multinomial logistic regression on feature set X1_2, with a test_size of 0.10.
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.10)
print('X1_2 için multi log')
multi_logit(X_train2, X_test2, y_train, y_test)
print('*************************************************************************************')

X1_2 için multi log
Accuracy:  0.37107142857142855
-------------------------------
Counfusion matrix: 
 [[523   3 329]
 [391   0 108]
 [930   0 516]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.28      0.61      0.39       855
        NONE       0.00      0.00      0.00       499
        SELL       0.54      0.36      0.43      1446

    accuracy                           0.37      2800
   macro avg       0.28      0.32      0.27      2800
weighted avg       0.37      0.37      0.34      2800

*************************************************************************************


In [30]:
# Multinomial logistic regression on feature set X1_3, with a test_size of 0.10.
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.10)
print('X1_3 için multi log')
multi_logit(X_train3, X_test3, y_train, y_test)
print('*************************************************************************************')

X1_3 için multi log
Accuracy:  0.3225
-------------------------------
Counfusion matrix: 
 [[ 648   45  162]
 [ 453   19   27]
 [1118   92  236]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.29      0.76      0.42       855
        NONE       0.12      0.04      0.06       499
        SELL       0.56      0.16      0.25      1446

    accuracy                           0.32      2800
   macro avg       0.32      0.32      0.24      2800
weighted avg       0.40      0.32      0.27      2800

*************************************************************************************


In [31]:
# Decision tree on feature set X1_1, with a test_size of 0.10.
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.10)
print('X1_1 için dec tree')
dec_tree(X_train1, X_test1, y_train, y_test)
print('*************************************************************************************')

X1_1 için dec tree
Accuracy:  0.37392857142857144
-------------------------------
Counfusion matrix: 
 [[288 288 279]
 [223 173 103]
 [383 477 586]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.32      0.34      0.33       855
        NONE       0.18      0.35      0.24       499
        SELL       0.61      0.41      0.49      1446

    accuracy                           0.37      2800
   macro avg       0.37      0.36      0.35      2800
weighted avg       0.44      0.37      0.39      2800

*************************************************************************************


In [33]:
# Decision tree on feature set X1_2, with a test_size of 0.10.
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.10)
print('X1_2 için dec tree')
dec_tree(X_train2, X_test2, y_train, y_test)
print('*************************************************************************************')

X1_2 için dec tree
Accuracy:  0.3142857142857143
-------------------------------
Counfusion matrix: 
 [[369 305 181]
 [174 121 204]
 [632 424 390]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.31      0.43      0.36       855
        NONE       0.14      0.24      0.18       499
        SELL       0.50      0.27      0.35      1446

    accuracy                           0.31      2800
   macro avg       0.32      0.31      0.30      2800
weighted avg       0.38      0.31      0.32      2800

*************************************************************************************


In [34]:
# Decision tree on feature set X1_3, with a test_size of 0.10.
# The loop evaluates every column of y separately and passes each one as a Series.
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.10)
for i in y.columns:
    print(i, ' kolonu için sonuçlar:')
    dec_tree(X_train3, X_test3, y_train[i], y_test[i])
    print('*************************************************************************************')

0  kolonu için sonuçlar:
Accuracy:  0.36678571428571427
-------------------------------
Counfusion matrix: 
 [[416  64 375]
 [225  32 242]
 [749 118 579]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.30      0.49      0.37       855
        NONE       0.15      0.06      0.09       499
        SELL       0.48      0.40      0.44      1446

    accuracy                           0.37      2800
   macro avg       0.31      0.32      0.30      2800
weighted avg       0.37      0.37      0.36      2800

*************************************************************************************


In [26]:
# Boosting models on feature set X1_1, with a test_size of 0.30, one run per column of y.
# NOTE(review): the recorded run of this cell stopped with a KeyboardInterrupt after the
# xgb_boost banner was printed, so no XGBoost results are recorded.
X_train1, X_test1, y_train, y_test = splitting(X1_1, y, 0.30)
for i in y.columns:
    print(i, ' kolonu için sonuçlar:')
    boostings(X_train1, X_test1, y_train[i], y_test[i])
    print('*************************************************************************************')

0  kolonu için sonuçlar:
grad_boost----------------
Accuracy:  0.39659483271818075
-------------------------------
Counfusion matrix: 
 [[2746    0  312]
 [1386    9  149]
 [3215    6  576]]
-------------------------------
Classification report
              precision    recall  f1-score   support

         BUY       0.37      0.90      0.53      3058
        NONE       0.60      0.01      0.01      1544
        SELL       0.56      0.15      0.24      3797

    accuracy                           0.40      8399
   macro avg       0.51      0.35      0.26      8399
weighted avg       0.50      0.40      0.30      8399

xgb_boost----------------


KeyboardInterrupt: 

In [None]:
# Boosting models on feature set X1_2, with a test_size of 0.30, one run per column of y.
# This cell has no execution count or recorded output.
X_train2, X_test2, y_train, y_test = splitting(X1_2, y, 0.30)
for i in y.columns:
    print(i, ' kolonu için sonuçlar:')
    boostings(X_train2, X_test2, y_train[i], y_test[i])
    print('*************************************************************************************')

In [None]:
# Boosting models on feature set X1_3, with a test_size of 0.30, one run per column of y.
# This cell has no execution count or recorded output.
X_train3, X_test3, y_train, y_test = splitting(X1_3, y, 0.30)
for i in y.columns:
    print(i, ' kolonu için sonuçlar:')
    boostings(X_train3, X_test3, y_train[i], y_test[i])
    print('*************************************************************************************')

# yol haritası

1. verilerin sadeleştirilmesi 

    1.1 Yüksek korelasyonlu değişkenleri atarak korelasyonsuz olanları bul
 
    1.2 RandomForest'tan önemli değişkenleri bul
 
    1.3 pca  
 
 
2. algoritmalar 

    2.1 algoritmaları fonk. olarak yaz
 
        2.1.1 loj reg
  
        2.1.2 decision tree
  
        2.1.3 boosting
      
    2.2 cross validations
 
    2.3 1'de bulduğun verilerle bütün algoritmaları çalıştır, sonuçları kıyasla 
 
 Notlar:
 - İlk 5 sütun korelasyon analizine dahil edilmedi; bunların da dahil edilmesi gerekir mi?

In [None]:

# multi_logit(X, y)
