In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` location and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# LOGISTIC REGRESSION
def regresja(df, nazwa_modeli):
    """Fit a logistic regression predicting ``las01`` and report its quality.

    The columns ``las01``, ``owner`` and ``Unnamed: 0`` are removed from ``df``
    and every remaining column is used as a predictor. The data is split
    70/30 into train/test with a fixed seed, the model is fitted on the
    training part, and the ROC curve, confusion matrix and classification
    report for the test part are printed / plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain ``las01``, ``owner`` and ``Unnamed: 0``
        (``DataFrame.drop`` raises ``KeyError`` otherwise).
    nazwa_modeli : str
        Label of the model group; printed next to the AUC and passed through
        unchanged in the return value.

    Returns
    -------
    tuple
        ``(coef_df, r2, auc, nazwa_modeli, r2_test)`` where

        * ``coef_df`` - DataFrame with the feature name in column ``0`` and
          its fitted coefficient (a plain float) in column ``1``;
        * ``r2`` - ``log_model.score`` on the *whole* data set, i.e. train and
          test rows together;
        * ``auc`` - ROC AUC on the test split;
        * ``r2_test`` - ``log_model.score`` on the test split only.

    Notes
    -----
    NOTE(review): despite the ``r2`` names, ``LogisticRegression.score`` is
    documented by scikit-learn as mean accuracy, not a coefficient of
    determination. The names are kept because callers rely on them.
    """
    X = df.drop(['las01', 'owner', 'Unnamed: 0'], axis=1)
    Y = df.las01

    # scikit-learn expects the target as a flat 1-D array
    y = np.ravel(Y)

    # train / test split (30% held out, fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # fit the model on the training part only
    log_model = LogisticRegression()
    log_model.fit(X_train, y_train)
    r2 = log_model.score(X, Y)  # scored on all rows, training rows included
    r2_test = log_model.score(X_test, y_test)
    print(r2, r2_test)

    # Hard labels and positive-class probabilities for the test set
    y_pred = log_model.predict(X_test)
    y_pred_prob = log_model.predict_proba(X_test)[:, 1]

    # Compute and print AUC score
    auc_value = roc_auc_score(y_test, y_pred_prob)
    print("AUC: {}".format(auc_value), nazwa_modeli)

    # ROC curve values (the thresholds are not needed for the plot)
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

    # Plot ROC curve against the chance diagonal
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

    # Compute and print the confusion matrix and classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # Coefficient table: one (feature, coefficient) row per predictor.
    # `coef_[0]` is the single coefficient row of a binary model, so each cell
    # holds a scalar float rather than a 1-element array, and the pairs are
    # materialised as a list because older pandas rejects a bare iterator.
    coef_df = DataFrame(list(zip(X.columns, log_model.coef_[0])))
    return (coef_df, r2, auc_value, nazwa_modeli, r2_test)

In [None]:
def czyszczenie(df, cols_to_norm):
    """Drop incomplete rows and rescale the selected columns to the range [0, 1].

    Steps:
      1. treat the sentinel value ``-9999`` as missing and drop every row that
         contains a missing value;
      2. take the natural logarithm of each column in ``cols_to_norm``;
      3. min-max scale each of those logged columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table. It is not modified.
    cols_to_norm : list of str
        Names of the columns to log-transform and rescale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows; columns outside ``cols_to_norm``
        are left as they were.

    Notes
    -----
    ``np.log`` yields ``-inf`` for zeros and ``NaN`` for negative values, and a
    column that is constant after the log step divides 0 by 0; such values are
    passed through unchanged, exactly as before.
    """
    # `np.nan` is the supported spelling (`np.NaN` was removed in NumPy 2.0).
    # The explicit copy makes the column assignment below write to an
    # independent frame instead of a possible view of the filtered data.
    cleaned = df.replace(-9999, np.nan).dropna().copy()

    # log transform
    logged = np.log(cleaned[cols_to_norm])

    # min-max scaling, column by column
    col_min = logged.min()
    col_range = logged.max() - col_min
    cleaned[cols_to_norm] = (logged - col_min) / col_range

    return cleaned

In [None]:
# Batch driver: for every CSV in every directory of `lista_path`, clean the
# data, fit the logistic regression and collect one result row per file.
#
# `lista_path` and `lista_nazw` are paired positionally (first directory gets
# the first name, and so on). `lista_path` is currently empty, so the loop
# processes nothing; the directories below are kept for reference.
#lista_path = [r'c:\doktorat\czemp6\kondracki_mezo',
#            r'c:\doktorat\czemp6\kondracki_mezo_makro',
#            r'c:\doktorat\czemp6\powiaty',
#            r'c:\doktorat\czemp6\kwadraty_losowe',
#            r'c:\doktorat\czemp6\kwadraty_10km']

lista_path = []

lista_nazw = ['geo_roznorodne', 'geo_jednorodne', 'powiaty', 'kwadraty_losowe', 'kwadraty_10km']
cols_to_norm = ['dist_rds', 'farm','nach', 'pop_dens', 'prec',  'temp', 'tourism', 'tpi']

#lista_nazw = ['kwadraty_25km_los']

# Column order of the result table: identifiers and scores, one coefficient
# column per predictor, then the name of the model group.
result_columns = ['model', 'r2', 'r2_test', 'auc_kurwa'] + cols_to_norm + ['regiony']

# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: `DataFrame.append` no longer exists in pandas 2.x and growing a frame
# row by row is quadratic anyway.
result_rows = []
for path, nazwa_modeli in zip(lista_path, lista_nazw):
    # Glob with the directory in the pattern instead of `os.chdir`, so the
    # notebook's working directory is left alone.
    for csv_path in sorted(glob.glob(os.path.join(path, '*.csv'))):
        # model name = file name without directory and extension
        nazwa_modelu = os.path.splitext(os.path.basename(csv_path))[0]
        try:
            # TODO (from the original author): add a condition on the minimum model size
            df = pd.read_csv(csv_path)

            df_przeczyszczony = czyszczenie(df, cols_to_norm)
            df_coef, r2_wynik, auc_kurwa, _, r2_wynik_test = regresja(df_przeczyszczony, nazwa_modeli)

            # Look coefficients up by feature name rather than by position, so
            # a different column order in the CSV cannot mislabel them.
            # `np.ravel(...)[0]` accepts both scalar and 1-element-array cells.
            coef_by_name = {
                feature: float(np.ravel(value)[0])
                for feature, value in zip(df_coef[0], df_coef[1])
            }

            row = {'model': nazwa_modelu, 'r2': r2_wynik, 'r2_test': r2_wynik_test,
                   'auc_kurwa': auc_kurwa}
            row.update((col, coef_by_name[col]) for col in cols_to_norm)
            row['regiony'] = nazwa_modeli
            result_rows.append(row)
        except Exception as error:
            # Best effort: one broken file must not stop the batch, but say
            # which file failed and why instead of hiding the cause.
            print("wystapil blad", csv_path, repr(error))

new_df = pd.DataFrame(result_rows, columns=result_columns)

path = r'c:\doktorat\czemp6\regresje_zbiorcze'
wyniki_regresji = os.path.join(path, "wyniki_ROC_stand.csv")
new_df.to_csv(wyniki_regresji)
new_df.head(4)
new_df['r2'].mean()