In [6]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import os
import glob

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import matplotlib.pyplot as plt

In [7]:
#REGRESJA LOGISTYCZNA
def regresja(df, nazwa_modeli):
    # usuwamy ownership bo bylo mieszajace
    X  = df.drop(['las01', 'owner', 'Unnamed: 0'], axis=1) 
    Y = df.las01

    # splasczenie zeminnej zaleznej bo musi byc splaszczona
    y = np.ravel(Y)
    
    # podzial na test i train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
    
    # podzial na test i train
    log_model = LogisticRegression()
    log_model.fit(X_train,y_train)
    r2 = log_model.score(X,Y)
    r2_test = log_model.score(X_test,y_test)
    print(r2, r2_test)
    
    # Predict the labels of the test set: y_pred
    y_pred = log_model.predict(X_test)
    y_pred_prob = log_model.predict_proba(X_test)[:,1]
    
    # Compute and print AUC score
    auc_kurwa = roc_auc_score(y_test, y_pred_prob)
    print("AUC: {}".format(auc_kurwa), nazwa_modeli)
    
    # Generate ROC curve values: fpr, tpr, thresholds
     #fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

    # Plot ROC curve
    # plt.plot([0, 1], [0, 1], 'k--')
    # plt.plot(fpr, tpr)
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('ROC Curve')
    # plt.show()

    # Compute and print the confusion matrix and classification report
    #print(confusion_matrix(y_test, y_pred))
    #print(classification_report(y_test, y_pred))
    #print("r2", r2)
    
    # standardowy output regresji
    coef_df = DataFrame(zip(X.columns, np.transpose(log_model.coef_)))
    return(coef_df, r2, auc_kurwa, nazwa_modeli, r2_test)

In [8]:
def czyszczenie(df, cols_to_norm):
    df2 = df.replace(-9999, np.NaN)
    df3 = df2.dropna()
     
    ##normalizacja - nie jest w stanie zrobic normalizacji
    df3[cols_to_norm] = df3[cols_to_norm].apply(lambda x: np.log(x))
     
    #standaryzacja, zeby zmienne byly porownywalne miedzy soba
    df3[cols_to_norm] = df3[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return(df3)

In [15]:
lista_path = [r'c:\doktorat\czemp6\kondracki_mezo',
            r'c:\doktorat\czemp6\kondracki_mezo_makro',
            r'c:\doktorat\czemp6\powiaty',
            r'c:\doktorat\czemp6\kwadraty_losowe',
            r'c:\doktorat\czemp6\kwadraty_10km']

#lista_path = [r'c:\doktorat\czemp5\konglomeraty_250\kwadraty_losowe_25km']

lista_nazw = ['geo_roznorodne', 'geo_jednorodne', 'powiaty', 'kwadraty_losowe', 'kwadraty_10km']
cols_to_norm = ['dist_rds', 'farm','nach', 'pop_dens', 'prec',  'temp', 'tourism', 'tpi']

#lista_nazw = ['kwadraty_25km_los']

new_df = pd.DataFrame(columns = ['model', 'r2', 'r2_test', 'auc_kurwa', 'dist_rds', 'farm','nach', 'pop_dens',
                                     'prec',  'temp', 'tourism', 'tpi', 'regiony'])
indeks = 0
for path in lista_path:
    nazwa_modeli = lista_nazw[indeks]
    indeks = indeks + 1
    extension = 'csv'
    os.chdir(path)
    result = [i for i in glob.glob('*.{}'.format(extension))]
    for i in result:
        try:
            # dodac warunek, zeby model byl wiekszy, niz..
            nazwa_modelu = str(i[:-4])
            df = pd.read_csv(i)
            
            df_przeczyszczony = czyszczenie(df, cols_to_norm)
            liczenie_regresji = regresja(df_przeczyszczony, nazwa_modeli)
            df_coef = liczenie_regresji[0]
            r2_wynik = liczenie_regresji[1]
            auc_kurwa = liczenie_regresji[2]
            r2_wynik_test = liczenie_regresji[4]
            lista=[nazwa_modelu]
            
            for j in df_coef.itertuples():
                lista.append(float(j[2]))

            new_df = new_df.append({'model':lista[0], 'r2': r2_wynik,  'r2_test': r2_wynik_test, 'auc_kurwa': auc_kurwa,  'dist_rds':lista[1],'farm' :lista[2],'nach':lista[3],
                                     'pop_dens':lista[4], 'prec':lista[5],  'temp':lista[6],
                                     'tourism':lista[7], 'tpi':lista[8], 'regiony': nazwa_modeli}, ignore_index=True)
        except:
            print("das")
            
path = r'c:\doktorat\czemp6\regresje_zbiorcze'
wyniki_regresji = os.path.join(path, "wyniki_ROC_stand.csv")
new_df.to_csv(wyniki_regresji)
new_df.head(4)
new_df['r2'].mean()

(0.8117650901489417, 0.8163309744148067)
('AUC: 0.863276336332', 'geo_roznorodne')
(0.7473030939125673, 0.7515818934047214)
('AUC: 0.806772303913', 'geo_roznorodne')
(0.7761994949494949, 0.7853761178327197)
('AUC: 0.860243816704', 'geo_roznorodne')
(0.7994041563726203, 0.7982366904035266)
('AUC: 0.878974384066', 'geo_roznorodne')
(0.7234805699225175, 0.7277230992800053)
('AUC: 0.800317299893', 'geo_roznorodne')
(0.7447529603919968, 0.7453382332925004)
('AUC: 0.830248500765', 'geo_roznorodne')
(0.7412638907484268, 0.7392614080107107)
('AUC: 0.822312300098', 'geo_roznorodne')
(0.7637790098836318, 0.7595325054784514)
('AUC: 0.771682841476', 'geo_roznorodne')
(0.7531269333189875, 0.7478034785727093)
('AUC: 0.815587516722', 'geo_roznorodne')
(0.7115487914055506, 0.7082228116710876)
('AUC: 0.745318484615', 'geo_roznorodne')
(0.7273087245369352, 0.7209268221146796)
('AUC: 0.717766911916', 'geo_roznorodne')
(0.84573553770666, 0.8408333333333333)
('AUC: 0.90730145458', 'geo_roznorodne')
(0.7311

(0.7883238794667734, 0.7897318664571129)
('AUC: 0.854655495351', 'powiaty')
(0.8124928862482642, 0.8118977160634343)
('AUC: 0.883465573621', 'powiaty')
(0.7760526413071606, 0.7783254266527017)
('AUC: 0.863820365839', 'powiaty')
(0.7760429782590448, 0.7750419697817571)
('AUC: 0.851745102874', 'powiaty')
(0.7573678329492283, 0.7565617179770704)
('AUC: 0.814379839956', 'powiaty')
(0.762375299911605, 0.7598126907292434)
('AUC: 0.829152081613', 'powiaty')
(0.7452668784899134, 0.749388982891521)
('AUC: 0.816949228991', 'powiaty')
(0.7539039514778436, 0.7539730436531885)
('AUC: 0.824070702569', 'powiaty')
(0.8089958821666139, 0.810881311719769)
('AUC: 0.893461103576', 'powiaty')
(0.7447813729365482, 0.7452161365204844)
('AUC: 0.806492450818', 'powiaty')
(0.7500148561920609, 0.7495295632366049)
('AUC: 0.805490174757', 'powiaty')
(0.7621539755994952, 0.7641911599730761)
('AUC: 0.842874544824', 'powiaty')
(0.7410535324189808, 0.7399594625028151)
('AUC: 0.805426266913', 'powiaty')
(0.797549387346

In [15]:

# df1, df2
# dodac pole - regiony + model
# polaczyc, zapisac
import pandas as pd
import os
df2 = pd.read_csv(r'c:\doktorat\czemp6\regresje_zbiorcze\konglomeraty_parametry_zbiorcze_v5.csv')
new_df['lacznik'] = new_df['regiony'].astype(str) + new_df['model'].astype(str)
df2['lacznik'] =df2['regiony'].astype(str) + df2['model'].astype(str)
df_merged = pd.merge(new_df, df2, on="lacznik", how = "left")

path = r'c:\doktorat\czemp6\regresje_zbiorcze'
modele_parametry = os.path.join(path, "modele_zbiorcze_wyniki_stand_param_roc_v6.csv")

df_merged.to_csv(modele_parametry)
df_merged.head()

tabelka = new_df.groupby('regiony')['auc_kurwa'].mean()
tabela_df = pd.DataFrame(tabelka)
tabela_df['regiony2'] = pd.to_numeric(tabela_df.index)
tabela_df.sort_values('regiony2', inplace=True)
_ = tabela_df.plot('regiony2', 'r2')
_ = plt.xlabel('wielkosc mau')
_= plt.ylabel('auc_kurwa')
tabela_df

ValueError: Unable to parse string "kondracki" at position 0

Unnamed: 0_level_0,r2
regiony,Unnamed: 1_level_1
10400,0.795063
1300,0.825757
15900,0.79043
20800,0.789409
2600,0.81268
5200,0.801977
7800,0.81426
