In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the training CSV into a DataFrame and preview the first rows.
train_csv_path = './house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Select the attributes used in the exploration.
# `.copy()` makes `mydf` an independent frame, so later modifications of
# `mydf` never touch (or warn about touching) a slice of `df`.
selected_columns = [
    'LotArea', 'Street', 'BldgType', 'HouseStyle', 'OverallQual',
    'OverallCond', 'YearBuilt', 'TotalBsmtSF', 'BedroomAbvGr',
    'KitchenAbvGr', 'SalePrice',
]
mydf = df[selected_columns].copy()
mydf.head()

In [None]:
# Summary (column names, non-null counts, dtypes) of the selected dataset, named `mydf`.
mydf.info()

In [None]:
# Convert the qualitative attributes to integer codes.
# NOTE(review): code 1 is unused in the "HouseStyle" mapping (0, 2, 3, ...);
# the codes are kept as they were so downstream numbers do not change.
cleanup_nums = {"Street": {"Grvl":0, "Pave":1},
                 "BldgType": {"1Fam":0, "TwnhsE": 1, "Duplex":2, "Twnhs":3, "2fmCon":4},
                 "HouseStyle": {"1Story":0, "2Story":2, "1.5Fin":3, "SLvl":4, "SFoyer":5, "1.5Unf":6, "2.5Unf":7, "2.5Fin":8}}

# Category counts before the conversion.
for column in cleanup_nums:
    print(mydf[column].value_counts())

# Reassign instead of replacing in place: `mydf` was sliced out of `df`, and
# a new frame also keeps this cell idempotent when it is re-run.
# The explicit cast guarantees numeric columns and fails loudly if a label
# is missing from the mapping above.
mydf = mydf.replace(cleanup_nums).astype({column: 'int64' for column in cleanup_nums})

# Category counts after the conversion.
for column in cleanup_nums:
    print(mydf[column].value_counts())

In [None]:
# Location and dispersion measures for every column of `mydf`.
# Rows of `stats`: 0 = mean, 1 = median, 2 = mode, 3 = standard deviation,
# 4 = variance; one column of `stats` per column of `mydf`.
n_measures = 5
# Size the array from the data: a fixed width of 10 overflowed on the
# eleventh selected column.
stats = np.zeros([n_measures, len(mydf.columns)])
with open('stats.csv', 'w') as file:
    for counter, column in enumerate(mydf):
        serie = mydf[column]
        stats[0,counter] = serie.mean()
        stats[1,counter] = serie.median()
        # `.mode()` returns a Series (there may be ties); keep the first mode.
        stats[2,counter] = serie.mode().iloc[0]
        stats[3,counter] = serie.std()
        stats[4,counter] = serie.var()
        # One line per attribute: the five measures rounded to 3 decimals.
        for i in range(n_measures):
            file.write( str(round(stats[i,counter],3)) + ","  )
        file.write('\n')

In [None]:
# Descriptive statistics of `mydf` as computed by pandas.
mydf.describe()

In [None]:
# Distribution measures: central moment for k=1,2,3,4; raw moment for k=1,2;
# standardized moment for k=1,2,3,4
import math

# Build the class intervals and compute the frequency of each interval.
# Number of classes: k = floor(1 + log2(n)), with n the number of rows.
n_rows_total = mydf['LotArea'].shape[0]
k = math.floor(1 + math.log(n_rows_total, 2))
print('ElementosK')
print(k)

# Class width ("amplitud") of every column: the distance between the first
# two bin edges returned by np.histogram.
# NOTE(review): np.histogram is called with its default bin count here, not
# with k - confirm this is intended.
cols = ['amplitud']
amplitud = pd.DataFrame(index = mydf.columns.values, columns = cols).fillna(0)
for column in mydf:
    bin_edges = np.histogram(mydf[column])[1]
    amplitud.loc[column] = bin_edges[1] - bin_edges[0]
print(amplitud)

def obtenerAmplitudCorrecta(column, data):
    """Return the zero-based class (bin) index of `data` within `column`.

    Reads the module-level `mydf` (the data) and `amplitud` (class width per
    column, in its 'amplitud' column).

    Args:
        column: label of the column of `mydf` that `data` belongs to.
        data: a single value taken from that column.

    Returns:
        int: floor((data - floor(min)) / width), clamped to the last class so
        that the column maximum falls in the final (closed) interval instead
        of one position past it. Returns 0 when the class width is zero.
    """
    serie = mydf[column]
    val_min = math.floor(serie.min())
    amp = amplitud.at[column,'amplitud']
    if amp == 0:
        # Degenerate width: every value belongs to the single class 0.
        return 0
    # Number of classes spanned by the column, recovered from its range.
    n_bins = int(round((serie.max() - val_min) / amp))
    last_bin = max(n_bins - 1, 0)
    return min(math.floor((data - val_min) / amp), last_bin)

# Frequency table: one row per column of `mydf`, one column per class index.
# Class indices come from obtenerAmplitudCorrecta.
bin_indices = [
    [obtenerAmplitudCorrecta(column, element) for element in mydf[column]]
    for column in mydf
]

# Size the table from the data: a fixed height of 10 rows overflowed on the
# eleventh selected column. The width is at least k and always large enough
# for the highest class index actually produced.
n_classes = max([k] + [max(indices) + 1 for indices in bin_indices if indices])
freq = np.zeros((len(bin_indices), n_classes), dtype='int64')
for idx_row, indices in enumerate(bin_indices):
    for amp_cor in indices:
        freq[idx_row][amp_cor] += 1

print('FRECUENCIA')
frecuencia = pd.DataFrame(freq, index=amplitud.index.values)
print(frecuencia)
print('FIN FRECUENCIA')

# Moments of every column of `mydf`, computed on the ungrouped observations.
#   raw moment          k = 1, 2       : mean(x ** k)
#   central moment      k = 1, 2, 3, 4 : sum((x - mean) ** k) / (n - 1)
#   standardized moment k = 1, 2, 3, 4 : central moment k / std ** k
# The (n - 1) denominator matches pandas' default var()/std(), so the
# standardized second moment comes out as 1.
cols_original    = ['k1', 'k2']
cols_medio       = ['k1', 'k2', 'k3', 'k4']
cols_padronizado = ['k1', 'k2', 'k3', 'k4']

media    = mydf.mean()
varianza = mydf.var()
desvio   = mydf.std()

filas_original    = []
filas_medio       = []
filas_padronizado = []
for column in mydf:
    # Work in float64 so high powers of large values cannot overflow int64.
    valores = mydf[column].astype('float64')
    n_obs   = valores.count()
    desvios = valores - media[column]

    # Raw moments
    filas_original.append({'k1': valores.mean(), 'k2': (valores ** 2).mean()})

    # Central moments
    centrales = {'k%d' % p: (desvios ** p).sum() / (n_obs - 1) for p in range(1, 5)}
    filas_medio.append(centrales)

    # Standardized moments: undefined (NaN) for a constant column.
    sigma = desvio[column]
    filas_padronizado.append({
        'k%d' % p: (centrales['k%d' % p] / sigma ** p) if sigma > 0 else float('nan')
        for p in range(1, 5)
    })

nombres_columnas     = list(mydf.columns)
momentos_original    = pd.DataFrame(filas_original, index=nombres_columnas, columns=cols_original)
momentos_medio       = pd.DataFrame(filas_medio, index=nombres_columnas, columns=cols_medio)
momentos_padronizado = pd.DataFrame(filas_padronizado, index=nombres_columnas, columns=cols_padronizado)

print('******* MOMENTO ORIGINAL *******')
print(momentos_original)
print('******* MOMENTO CENTRAL *******')
print(momentos_medio)
print('******* MOMENTO PADRONIZADO *******')
print(momentos_padronizado)

In [None]:
import os
# Names of the regular files (directories excluded) in the working directory.
files = list(filter(os.path.isfile, os.listdir('./')))
print(files)

In [None]:
# Covariance matrix and Pearson correlation between the attributes.
cov_matrix  = mydf.cov()
corr_matrix = mydf.corr(method='pearson')
# Only the last expression of a cell is displayed, so the covariance matrix
# is printed explicitly; the correlation matrix is the cell's output.
print(cov_matrix)
corr_matrix

In [None]:
# Scatter-plot matrix of all attributes, saved to scatter.png and displayed.
import numpy as np
import matplotlib.pyplot as plt

scatter_axes = pd.plotting.scatter_matrix(
    mydf,
    alpha=0.2,
    figsize=(10, 10),
)
plt.axis('off')
plt.savefig('scatter.png')
plt.show()


In [None]:
# Scatter plot of two attributes: LotArea against YearBuilt.
plt.scatter(df['YearBuilt'], df['LotArea'])
plt.xlabel("YearBuilt", fontsize=15)
plt.ylabel("LotArea", fontsize=15)


In [None]:
# Scatter plot of two attributes: LotArea against KitchenAbvGr.
plt.scatter(df['KitchenAbvGr'], df['LotArea'])
plt.xlabel("KitchenAbvGr", fontsize=15)
plt.ylabel("LotArea", fontsize=15)


In [None]:
# Histogram of every attribute, one subplot per column (4 x 3 grid).
plt.figure(figsize = (7,7))
for counter, column in enumerate(mydf, start=1):
    ax = plt.subplot(4,3,counter)
    ax.set_title(column)
    # Plot the distribution of the data itself, not of the bin counts.
    ax.hist(mydf[column])
plt.subplots_adjust(hspace=0.5)
# Save before showing: once the figure has been shown, savefig writes a
# blank image.
plt.savefig('hist.png')
plt.show()

In [None]:
# Boxplot of every attribute, one subplot per column (4 x 3 grid).
plt.figure(figsize = (7,7))
for counter, column in enumerate(mydf, start=1):
    ax = plt.subplot(4,3,counter)
    # Pass the axes explicitly instead of relying on the current-axes state.
    mydf.boxplot(column, grid=False, ax=ax)
    ax.set_title(column)
plt.subplots_adjust(hspace=0.5)
# Save before showing: once the figure has been shown, savefig writes a
# blank image.
plt.savefig('boxplot.png')
plt.show()

In [None]:
# Preview the encoded dataset (first rows only, instead of the whole frame).
mydf.head()

In [None]:
# Experiment: fit an MLP regressor on random row/feature subsets of `mydf`.
# The last column of `mydf` is the regression target; the others are
# candidate features.
from sklearn.neural_network import MLPRegressor
import random

SEED = 42        # fixed seed so the sampled subsets and the fits are reproducible
size_col = 5     # number of feature columns drawn per trial
N_test = 10      # number of trials
size_subset = int(mydf.shape[0]/10)
print('size subset ' + str(size_subset))

# DataFrame.get_values() was removed in pandas 1.0; to_numpy() replaces it.
mydf_data = mydf.to_numpy()
n_rows, n_total_cols = mydf_data.shape
target_col = n_total_cols - 1

random.seed(SEED)
for n_test in range(0,N_test):
    print( n_test )
    # Sample rows from the whole frame (row 0 included) and features from
    # every column except the target; both kept in ascending order.
    x_row = np.sort(random.sample(range(0, n_rows), size_subset))
    x_col = np.sort(random.sample(range(0, target_col), size_col))

    # Selected rows x selected feature columns, and the matching targets.
    x_st = mydf_data[np.ix_(x_row, x_col)].astype('float64')
    y_st = mydf_data[x_row, target_col].astype('float64')
    # Features followed by the target in the last column.
    x_subset = np.column_stack([x_st, y_st])

    clf = MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, random_state=SEED).fit(x_st,y_st)
    # NOTE(review): both figures below are computed on the data the model was
    # fitted on (no held-out rows).
    print(clf.score(x_st,y_st))
    residuals = y_st - clf.predict(x_st)
    print( np.sum(residuals*residuals) )


In [None]:
# Scratch cell: draws a random integer between 2 and 4; uses the `random`
# module imported in the previous cell.
random.randint(2,4)