In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
%matplotlib inline


Reading a CSV file into a pandas DataFrame

In [2]:
def read_csv(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str, path object or file-like object
        Source that is forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with pandas' default options.
    """
    return pd.read_csv(filename)

Standardizing columns

In [3]:
def col_standard(columns):
    """Return lower-cased column names with spaces replaced by underscores.

    Parameters
    ----------
    columns : iterable of str
        Column labels to normalise.

    Returns
    -------
    list of str
        One normalised label per input label, in the same order.
    """
    standardized = []
    for name in columns:
        lowered = name.lower()
        standardized.append(lowered.replace(' ', '_'))
    return standardized

Splitting the data into numerical and categorical columns

In [4]:
def data_split(data):
    """Split a DataFrame into its numerical and its object-dtype columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are partitioned by dtype.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(numerical, categorical)`` where ``numerical`` holds the columns
        whose dtype is a ``np.number`` subtype and ``categorical`` holds the
        columns with ``object`` dtype. Columns of any other dtype (e.g.
        bool, datetime) appear in neither frame.
    """
    numerical = data.select_dtypes(include=np.number)
    # ``np.object`` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin ``object`` selects the same dtype on every NumPy version.
    categorical = data.select_dtypes(include=object)
    return numerical, categorical

Null value counts

In [5]:
def null_counts(data):
    """Print the number of missing values found in each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The per-column counts are printed, not returned.
    """
    missing_per_column = data.isna().sum()
    print(missing_per_column)

Dropping null values. Call this function only when a small proportion of the values in the affected columns is null.

In [6]:
def drop_null(data):
    """Remove every row that contains a missing value and report the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows that held at least one missing value.
        The per-column missing-value counts of this new frame are printed
        before it is returned.
    """
    cleaned = data.dropna()
    remaining_missing = cleaned.isna().sum()
    print(remaining_missing)
    return cleaned

Dropping columns

In [7]:
def col_drop(data,columnName):
    """Return a copy of ``data`` without the requested column(s).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to drop columns from; it is not modified.
    columnName : str or iterable of column labels
        A single column name, or a collection of column names.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``data``.
    """
    # A bare string would otherwise be iterated character by character,
    # so 'name' would be treated as the columns 'n', 'a', 'm', 'e'.
    if isinstance(columnName, str):
        columnName = [columnName]
    # One drop call instead of one intermediate copy per column.
    return data.drop(columns=list(columnName))
    

In [8]:
def display():
    """Print a one-line usage reminder for each data-preparation helper.

    NOTE(review): inside an IPython/Jupyter session this definition shadows
    the ``display`` helper that IPython injects -- confirm that is intended.
    """
    usage_lines = (
        'csv file reading--> read_csv(filename)',
        'column name standardization --> col_standard(columns)',
        'splitting data into categorical and numerical --> data_split(data)',
        'Null value counts for each column--> null_counts(data)',
        'Droping null values. Call this function only needed --> drop_null(data)',
        'column Droping --> col_drop(data,columnName)',
    )
    for line in usage_lines:
        print(line)
    

original csv filename and store the data without header name into csv

Correlation matrix and heat map

In [9]:
def my_heatMap(data):
    """Draw an annotated heat map of the pairwise correlations in ``data``.

    The upper triangle of the correlation matrix, diagonal included, is
    masked so each pair of columns is shown only once.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``corr()`` matrix is plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    corr = data.corr()
    # 1.0 on and above the diagonal, 0.0 below: the cells seaborn must hide.
    upper_mask = np.triu(np.ones(corr.shape))
    _, ax = plt.subplots(figsize=(25, 10))
    sns.heatmap(corr, mask=upper_mask, annot=True, ax=ax)
    plt.show()
    

In [10]:
def VIF(data, threshold=3):
    """Iteratively drop the column with the largest variance inflation factor.

    On every pass the VIF of each remaining column is computed with
    ``variance_inflation_factor``. While the largest value exceeds
    ``threshold``, the corresponding column is dropped and the factors are
    recomputed on the reduced frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is converted with ``np.array`` and handed to
        ``variance_inflation_factor``; it is not modified.
    threshold : float, default 3
        Largest VIF a column may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the columns that were dropped.
    """
    print(threshold)
    # A VIF needs at least one other column to regress against, so stop
    # once fewer than two columns remain (this also covers an empty frame,
    # where max() of an empty list would raise).
    while data.shape[1] > 1:
        # Convert once per pass instead of once per column.
        exog = np.array(data)
        values = [variance_inflation_factor(exog, i) for i in range(data.shape[1])]
        worst = max(values)
        if not worst > threshold:
            break
        # First column holding the maximum, matching list.index semantics.
        column_name = data.columns[values.index(worst)]
        data = data.drop([column_name], axis=1)
    return data

In [11]:
def y(x):
    """Return the natural log of ``x`` when ``x`` is positive, else ``x`` itself.

    Values for which ``x > 0`` is false are passed through unchanged.
    """
    if x > 0:
        return np.log(x)
    return x

In [12]:
def distribution_plot(data,column):
    """Plot one column's distribution raw, log-transformed and Yeo-Johnson-transformed.

    Three side-by-side panels are drawn: the original values, the values
    after a natural log applied to positive entries only (other entries are
    left as they are), and the values after a standardised Yeo-Johnson
    power transform.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to plot.
    column : label
        Name of the column in ``data``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases --
    # confirm the installed version still provides it.

    def _log_if_positive(value):
        if value > 0:
            return np.log(value)
        return value

    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    column_2d = data[column].values.reshape(-1, 1)
    # fit_transform returns an (n, 1) array; flatten it to a list of scalars.
    transformed = [row[0] for row in yeo_johnson.fit_transform(column_2d)]
    logged = data[column].apply(_log_if_positive)

    fig, axes = plt.subplots(1, 3, figsize = (15,4))
    panels = (
        (data[column], 'Actual Distribution'),
        (logged, 'Log Transformation'),
        (transformed, 'Yeo-Johnson Transformation'),
    )
    for axis, (values, title) in zip(axes, panels):
        sns.distplot(values, ax=axis)
        axis.set_title(title)
    plt.show()


In [13]:
def models_automation(models, X_train, y_train, X_test, y_test):
    """Fit each model, plot its test-set predictions and collect its metrics.

    For every model the function fits on the training data, prints the
    test score, draws the actual targets (scatter) against the predictions
    (line) over the sample positions, and records the metrics.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit``, ``score`` and ``predict``.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Held-out features and targets used for scoring and plotting.

    Returns
    -------
    tuple of (list, list)
        ``(scores, predictions)``. ``scores[i]`` is
        ``[score, MAE, MSE, RMSE]`` for the i-th model and
        ``predictions[i]`` is a one-element list holding that model's
        test-set predictions as a list.
    """
    scores = []
    predictions = []
    sample_positions = np.arange(0, len(X_test), 1)
    for model in models:
        model.fit(X_train, y_train)
        # Score once and reuse: scoring re-predicts the whole test set.
        test_score = model.score(X_test, y_test)
        print(f"{str(model)}: Test -> {test_score}")
        y_predict = model.predict(X_test)

        # NOTE(review): the x axis holds sample positions, not prices, so
        # the 'Actual price' label looks wrong -- confirm the intended text.
        plt.scatter(sample_positions, y_test, color = 'black', label = 'actual_prices')
        plt.plot(sample_positions, y_predict, color = 'blue', linewidth = 3, label = 'predicted_prices')
        plt.title(str(model)+' Test set plot')
        plt.xlabel('Actual price')
        plt.ylabel('Prices')
        plt.legend()
        plt.xticks(())
        plt.yticks(())
        plt.show()

        mae = mean_absolute_error(y_test, y_predict)
        mse = mean_squared_error(y_test, y_predict)
        # ``squared=False`` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; taking the square root of the MSE gives the same RMSE for a
        # single-target regression and works on every version.
        rmse = np.sqrt(mse)
        scores.append([test_score, mae, mse, rmse])
        predictions.append([list(y_predict)])
    return scores, predictions

In [14]:
def my_difference(categorical_data,categorical_data_hot):
    """Return the items of ``categorical_data`` that are absent from ``categorical_data_hot``.

    Parameters
    ----------
    categorical_data : iterable
        Items to filter; their order is preserved.
    categorical_data_hot : container
        Items to exclude, tested with ``in``.

    Returns
    -------
    list
        Every item of ``categorical_data`` not found in
        ``categorical_data_hot``, duplicates included.
    """
    return [
        entry
        for entry in categorical_data
        if entry not in categorical_data_hot
    ]

In [15]:
def my_scores(lis, model_names=None):
    """Build a transposed summary table of regression metrics per model.

    Parameters
    ----------
    lis : list of list
        One ``[R_square, MAE, MSE, RMSE]`` row per model.
    model_names : sequence of str, optional
        Label for each row of ``lis``, in the same order. Defaults to the
        three names that were previously hard-coded, so existing calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``'Models'``, ``'R_square'``, ``'MAE'``, ``'MSE'``
        and ``'RMSE'`` with one column per model.

    Raises
    ------
    ValueError
        If the number of names does not match the number of rows in ``lis``
        (raised by pandas when the ``'Models'`` column is inserted).
    """
    if model_names is None:
        model_names = ['Linear Regression', 'KNneighborsRegressor', 'RandomForestRegressor']
    results = pd.DataFrame(lis, columns=['R_square', 'MAE', 'MSE', 'RMSE'])
    results.insert(0, 'Models', list(model_names))
    return results.T

In [16]:
def plot_errors(y, y_pred):
    """Scatter predicted values against actual values with an identity line.

    Parameters
    ----------
    y
        Actual target values, drawn on the x axis.
    y_pred
        Predicted target values, drawn on the y axis.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    # The seaborn theme is configured after the figure exists, in the same
    # order as before, because these calls change global plotting settings.
    sns.set(color_codes=True)
    sns.set(rc={'figure.figsize':(7, 7)})
    ax.set_ylabel("Predicted 'price'")
    ax.set_title("Test set: Predictions against real values")
    # Red y = x reference line over the fixed interval [10, 16].
    # NOTE(review): presumably the range of the (log-scaled?) target -- confirm.
    reference = np.linspace(10, 16, 16)
    sns.regplot(x=y, y=y_pred, fit_reg=False, ax=ax,scatter_kws={"color": "blue", "s":1})
    ax.plot(reference, reference, color='r',linewidth=4.)
    plt.show()
