In [1]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier 
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [2]:
def prepare_data(path='M30_EURUSD.csv'):
    """Read the dataset CSV and split it into a feature frame and a result frame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to 'M30_EURUSD.csv', the file name that was
        previously hard-coded; a relative path is resolved by pandas against
        the current working directory.

    Returns
    -------
    df : pandas.DataFrame
        Columns at positions 5 through 429 of the cleaned data.
    df_result : pandas.DataFrame
        Columns from position 430 up to, but excluding, the last column.
    """
    data = pd.read_csv(path)
    # "?" is treated as a missing-value marker. Note that dropna() removes
    # every row containing any missing value, not only the "?" rows.
    data = data.replace("?", np.nan).dropna()
    # Positional split into features and results.
    df = data.iloc[:, 5:430]
    df_result = data.iloc[:, 430:-1]
    return df, df_result

In [3]:
# parametrik fonk. tanımlaması
# 1 : 
# 2 :

# Categorical indicator columns that are one-hot encoded by default.
DEFAULT_CAT_COLUMNS = [
    'ind_7', 'ind_11', 'ind_24', 'ind_38', 'ind_54', 'ind_57', 'ind_60', 'ind_63', 'ind_66', 'ind_69',
    'ind_72', 'ind_75', 'ind_78', 'ind_81', 'ind_84', 'ind_87', 'ind_89', 'ind_91', 'ind_93', 'ind_95',
    'ind_97', 'ind_99', 'ind_101', 'ind_103', 'ind_105', 'ind_107', 'ind_109', 'ind_111', 'ind_113',
    'ind_115', 'ind_138', 'ind_141', 'ind_144', 'ind_157', 'ind_159', 'ind_161', 'ind_163', 'ind_165',
    'ind_167', 'ind_169', 'ind_171', 'ind_173', 'ind_175', 'ind_177', 'ind_182', 'ind_184', 'ind_187',
    'ind_190', 'ind_193', 'ind_196', 'ind_199', 'ind_202', 'ind_205', 'ind_208', 'ind_211', 'ind_213',
    'ind_384', 'ind_386', 'ind_388', 'ind_390',
]


def kategorikleri_dummy_yap(df, cat_column_names=None):
    """Replace the categorical columns of ``df`` with dummy (one-hot) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    cat_column_names : sequence of str, optional
        Columns to treat as categorical. Defaults to ``DEFAULT_CAT_COLUMNS``.
        All listed columns must exist in ``df``.

    Returns
    -------
    df_all : pandas.DataFrame
        Non-categorical columns followed by the retained dummy columns.
    df_noncategoric : pandas.DataFrame
        ``df`` without the categorical columns, cast to float64.
    dms : pandas.DataFrame
        Dummy columns, excluding every column whose name contains
        ``_NONE`` or ``_RED``.
    """
    if cat_column_names is None:
        cat_column_names = DEFAULT_CAT_COLUMNS
    cat_column_names = list(cat_column_names)

    # One-hot encode the categorical columns.
    dms = pd.get_dummies(df[cat_column_names])
    # Drop the "_NONE" and "_RED" levels in one pass.
    # NOTE(review): presumably these serve as the reference levels - confirm.
    dms = dms.drop(columns=dms.filter(regex='_NONE|_RED').columns)

    # Everything that is not categorical is expected to be numeric.
    df_noncategoric = df.drop(columns=cat_column_names).astype("float64")
    df_all = pd.concat([df_noncategoric, dms], axis=1)
    return df_all, df_noncategoric, dms

In [4]:
def train_test(df, y, test_size=0.20):
    """Split features and target into train and test parts without shuffling.

    Parameters
    ----------
    df : features passed to ``train_test_split``.
    y : target passed to ``train_test_split``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to 0.20, the value that
        was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # shuffle=False keeps the original row order in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=test_size, shuffle=False
    )
    return X_train, X_test, y_train, y_test

In [5]:
def corr_df(df, corr_val):
    """Drop columns of ``df`` that are highly correlated with an earlier column.

    For every pair of columns the absolute correlation is computed; a column
    is dropped when its absolute correlation with any column positioned
    before it exceeds ``corr_val``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. The columns are dropped IN PLACE.
    corr_val : float
        Absolute-correlation threshold.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after pruning.
    """
    # Correlate the frame that was passed in, not a notebook-level global.
    corr_matrix = df.corr().abs()
    # Keep only the strict upper triangle so each pair is considered once.
    # The builtin bool is used because the np.bool alias no longer exists
    # in current NumPy releases.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Columns whose correlation with an earlier column exceeds the threshold.
    too_high = (upper > corr_val).any(axis=0)
    to_high = too_high.index[too_high.to_numpy()].tolist()
    df.drop(columns=to_high, inplace=True)
    return df

In [6]:
def multi_logit(X, y):
    """Fit a multinomial logistic regression on ``(X, y)`` and print its metrics.

    The model is evaluated on the same data it was fitted on, so the printed
    accuracy, confusion matrix and classification report are in-sample
    figures. Nothing is returned.
    """
    divider = '-------------------------------'

    model = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
    model.fit(X, y)
    predictions = model.predict(X)

    # Compute both metrics up front, then report them.
    cm = confusion_matrix(y, predictions)
    acc = accuracy_score(y, predictions)

    print("Accuracy: ", acc)
    print(divider)
    print("Counfusion matrix: \n", cm)
    print(divider)
    print('Classification report')
    print(classification_report(y, predictions))

In [7]:
# bütün değişkenlerle yapılan random forest sonucu importance değeri verilen parametreden büyük olan değişkenleri döner
def rand_forest(X, y, imp_value, random_state=42):
    """Select the columns of ``X`` whose random-forest importance exceeds a threshold.

    A ``RandomForestClassifier`` is fitted on all columns of ``X``. Each
    feature importance is multiplied by 100 and compared with ``imp_value``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : target values passed to ``fit``.
    imp_value : float
        Threshold on ``feature_importances_ * 100``; only columns strictly
        above it are kept.
    random_state : int or None, optional
        Seed for the forest so the selected columns are reproducible between
        runs. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``X``, ordered by ascending importance.
    """
    rf_model = RandomForestClassifier(random_state=random_state).fit(X, y)

    importance = pd.Series(rf_model.feature_importances_ * 100, index=X.columns)
    importance = importance.sort_values(ascending=True)
    selected = importance[importance > imp_value]

    return X[selected.index]

In [8]:
def pca_fon(X, threshold):
    """Project the standardized ``X`` onto its leading principal components.

    The number of components kept is the smallest count whose cumulative
    explained variance (each ratio rounded to 4 decimals) reaches
    ``threshold``. The count is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; its index is reused for the result.
    threshold : float
        Target fraction of explained variance, e.g. 0.99.

    Returns
    -------
    pandas.DataFrame
        The retained component scores, with the same index as ``X``.
    """
    pca = PCA()
    X_pca = pca.fit_transform(scale(X))

    # Cumulative explained variance in percent.
    cum_var_pct = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
    num_var = int(np.count_nonzero(cum_var_pct < threshold * 100)) + 1
    # Because of the rounding the cumulative sum may never reach the
    # threshold; never report more components than actually exist.
    num_var = min(num_var, X_pca.shape[1])

    print('pca sonrası değişken sayısı: ', num_var)
    X_pcad = pd.DataFrame(X_pca[:, :num_var], index=X.index)
    return X_pcad

In [9]:
# Load the feature and result frames, then replace the categorical columns
# of the feature frame with dummy columns.
df, df_result = prepare_data()
df_all, df_noncategoric, dms = kategorikleri_dummy_yap(df)

  if (await self.run_code(code, result,  async_=asy)):


In [10]:
# Prune numeric columns using an absolute-correlation threshold of 0.50.
# corr_df drops the columns in place and returns the same frame, so
# df_noncategoric is pruned as well.
df_noncorr = corr_df(df_noncategoric, 0.50)
# Recombine the remaining numeric columns with the dummy columns.
X = pd.concat([df_noncorr, dms], axis=1)
# Target: the '220_signal' column of the result frame.
y =  df_result['220_signal']
df_noncorr.shape

(19901, 83)

In [11]:
# PCA projection of X with an explained-variance threshold of 0.99.
pcad = pca_fon(X,0.99)
# Compare the shapes before and after the projection.
X.shape, pcad.shape

pca sonrası değişken sayısı:  103


((19901, 200), (19901, 103))

In [12]:
# Keep only the columns whose random-forest importance is above 0.0.
X1 = rand_forest(X, y, 0.0)
# Compare the shapes after and before the selection.
X1.shape, X.shape



((19901, 147), (19901, 200))

# yol haritası

1. verilerin sadeleştirilmesi 

    1.1 yüksek korelasyonlu değişkenleri atarak korelasyonsuz olanları bul
 
    1.2 RandomForest ile önemli (important) değişkenleri bul
 
    1.3 pca  
 
 
2. algoritmalar 

    2.1 algoritmaları fonk. olarak yaz
 
        2.1.1 loj reg
  
        2.1.2 decision tree
  
        2.1.3 boosting
 
    2.2 1'de bulduğun verilerle bütün algoritmaları çalıştır, sonuçları kıyasla 
 

In [13]:

# multi_logit(X, y)
