### Import required packages

In [2]:
import numpy as np
import pandas as pd
import os
import sys
from os import path
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import itertools  
warnings.filterwarnings("ignore")
plt.style.use("fivethirtyeight")

### Open the corresponding dataset based on the file encoding type and extension

In [3]:
#Open corresponding file
# The dataset base name; the '.csv' extension is appended below and the
# resulting path is used for every reader in this cell.
filename = "housing"
file_format = filename+'.csv'
filename = file_format
# Candidate encodings, tried in order until one of them decodes the CSV file.
encoding = ['utf8','latin1','iso-8859-1','cp1252']
# flag is set only when an already parsed CSV ('parsed.csv' in the name) is loaded.
flag = False
encoding_list = len(encoding)
if file_format.endswith('.html'):
    # read_html returns a list of tables; stack them and keep a CSV copy on disk.
    dfs = pd.read_html(file_format)
    df  = pd.concat(dfs)
    df.to_csv(filename+".csv",index=False)
elif file_format.endswith('.json'):
    df = pd.read_json(filename)
elif 'parsed.csv' in filename:
    df = pd.read_csv(file_format, index_col=0)
    flag = True
elif file_format.endswith('.csv'):
    # Try every encoding in the list (the previous range stopped one short and
    # never reached the last entry).
    for encoding_type in range(encoding_list):
        try:
            df = pd.read_csv(file_format, index_col=0, encoding=encoding[encoding_type])
            break
        except UnicodeError:
            # Wrong encoding for this file: move on to the next candidate.
            # Other errors (e.g. a missing file) are not encoding problems and
            # are allowed to propagate instead of being silently swallowed.
            continue
    else:
        raise ValueError(
            "Could not decode {} with any of the encodings {}".format(file_format, encoding)
        )
else:
    raise ValueError("Unsupported file type: {}".format(file_format))

### Display the first 5 rows of the dataset

In [4]:
# Preview the first rows of the loaded frame (rendered as the cell output).
df.head()

Unnamed: 0_level_0,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


### Data cleaning methods

In [5]:
class cleanData:
    """Holds a DataFrame and applies basic cleaning steps to it in place."""

    # Store the dataset that the cleaning operations are applied to
    def __init__(self, df):
        self.data = df

    def cleanColumnData(self):
        """Normalise column names and blank out missing text values.

        Column names are stripped, lower-cased, spaces become underscores and
        parentheses are removed. Missing values in object-typed columns are
        replaced with an empty string. The wrapped frame (``self.data``) is
        modified in place; nothing is returned.
        """
        data = self.data
        # regex=False: '(' and ')' must be treated as literal characters,
        # not as (invalid) regular expressions.
        data.columns = (
            data.columns.str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace('(', '', regex=False)
            .str.replace(')', '', regex=False)
        )
        # Assign the filled columns back; the previous code computed the
        # replacement but discarded the result.
        text_columns = data.loc[:, data.dtypes == object].columns
        for column in text_columns:
            data[column] = data[column].fillna('')

### Creating an instance for the data clean class

In [6]:
# Create an instance of the data-cleaning class for the loaded frame,
# then run its column clean-up step.
cleanData_object = cleanData(df)
cleanData_object.cleanColumnData()

### Method to select the range of rows and columns used to plot

In [7]:
def selectRows(df):
    """Ask the user which rows of ``df`` should be plotted and return them.

    The user first chooses between the entire dataset (yes/y) and a subset
    (no/n); answers are case-insensitive. For a subset a numbered menu is
    shown (first 100 rows, first 50%, first 25%, random 100 rows, random 50%).

    Returns:
        The selected DataFrame, or None when the answer or menu option is
        invalid (a message is printed in that case).
    """
    try:
        choice = input("Do you want to plot values of the entire dataset? Type Yes or No(To select a subset of data)\n")
        # str.lower() returns a new string, so the result has to be kept.
        choice = choice.strip().lower()
        if choice == 'yes' or choice == 'y':
            return df
        elif choice == 'no' or choice == 'n':
            total_rows = df.shape[0]
            choice = int(input("Select the range of values in the dataset to be plotted: \
                               \n1)Top 100 values \
                               \n2)50% of the dataset \
                               \n3)25% of the dataset \
                               \n4)Random 100 values in the dataset \
                               \n5)Random 50% data of the dataset\n"))
            # Positional slicing (iloc) is used because the frame's index is
            # not guaranteed to be 0..n-1 integers; integer division keeps the
            # row counts valid for slicing and for sample().
            if choice == 1:
                return df.iloc[:100]
            elif choice == 2:
                return df.iloc[:total_rows // 2]
            elif choice == 3:
                return df.iloc[:total_rows // 4]
            elif choice == 4:
                # sample() cannot draw more rows than exist without replacement.
                return df.sample(n=min(100, total_rows))
            elif choice == 5:
                return df.sample(n=total_rows // 2)
            else:
                print("Enter a valid option")
        else:
            print("Please enter a valid option")
    except ValueError as e:
        print("Value error {} occured".format(e))

### Return the categorical columns selected by the user

In [8]:
# The method below lists the categorical column names with their indices, then returns the column names for the indices the user selects.
def returnColumns(df,index):
    """List the object-typed columns of ``df`` and let the user pick some.

    Every object-typed column is printed next to its position. The user is
    then prompted ``index`` times for a position, and the matching column
    names are returned in the order they were entered.

    Returns:
        A list of column names, or None when an entry is not an integer
        (the ValueError is reported with a printed message).
    """
    prompts_needed = index
    try:
        print("The column name and the index are listed below:\n")
        for position, name in enumerate(df.loc[:, df.dtypes == object]):
            print(position, name)
        print("\n")
        # Collect every answer first; names are resolved only afterwards.
        chosen_positions = [
            int(input("\nEnter the column index \n"))
            for _ in range(0, prompts_needed)
        ]
        categorical_names = df.loc[:, df.dtypes == object].columns
        return [categorical_names[position] for position in chosen_positions]
    except ValueError as error:
        print("Value error {} occured".format(error))

### Return the column selected by the user for the hue parameter


In [9]:
# The method below prompts the user for a hue column index among the categorical columns and returns the selected column name in a list.
def returnHueColumns_CategoricalData(df):
    """Prompt once for a hue column among the object-typed columns of ``df``.

    Returns:
        A one-element list holding the chosen column name when the answer
        consists of digits; None when the user just presses enter, or when
        the answer is anything else (a message is printed in that case).
    """
    try:
        answer = input("\nEnter a column index for hue parameter or press enter to exit \n")
        if answer.isdigit():
            position = int(answer)
            categorical_names = df.loc[:, df.dtypes == object].columns
            return [categorical_names[position]]
        # An empty answer means "no hue"; any other text is reported.
        if answer != '':
            print("Enter a valid option") 
    except ValueError as error:
        print("Value error {} occured".format(error))

In [10]:
# The method below prompts the user for a hue column index among all columns (categorical and numerical) and returns the selected column name in a list.
def returnHueColumns_MixedData(df):
    """Prompt once for a hue column among all columns of ``df``.

    Returns:
        A one-element list holding the chosen column name, or None when the
        user presses enter, types something that is not a column index, or
        gives an index outside the available columns (a message is printed
        for the two invalid cases).
    """
    try:
        Number = input("\nEnter a column index for hue parameter or press enter to exit \n")
        if Number == '':
            # Empty answer: the user does not want a hue column.
            return None
        if not Number.isdigit():
            # Previously a non-numeric answer was used to index df.columns
            # directly, which raised an uncaught IndexError.
            print("Enter a valid option")
            return None
        position = int(Number)
        if position >= len(df.columns):
            print("Enter a valid option")
            return None
        return [df.columns[position]]
    except ValueError as e:
        print("Value error {} occured".format(e))

### Method to visualize the categorical data based on user specified plot type

In [11]:
# The following method visualizes categorical data using the plot type and hue parameter chosen by the user.
def categorical_visualization(df):
    """Interactively plot object-typed (categorical) columns of ``df``.

    The user is asked whether to plot at all (yes/y/no/n, case-insensitive),
    then for a plot type (1: count plot of one column, 2: scatter plot of
    two columns), the column(s), an optional hue column and the rows to use.
    Invalid answers print a message and end the function without plotting.
    """
    try:
        choice = input("Do you want to visualize the columns based on categorical data? Type Yes or no\n")
        # str.lower() returns a new string, so the result has to be kept.
        choice = choice.strip().lower()
        if choice == 'no' or choice == 'n':
            return
        if choice != 'yes' and choice != 'y':
            print("Please enter a valid choice")
            return
        plotType = int(input("Choose the type of visualization: \
                                 \n1: Count plot \
                                 \n2: ScatterPlot\n"))
        if plotType not in (1, 2):
            print("Please enter a valid choice")
            return
        # A count plot needs one column, a scatter plot needs two.
        NumberOfIndex = 1 if plotType == 1 else 2
        # returnColumns() lists the categorical columns and prompts for the indices to plot
        columns_to_plot = returnColumns(df,NumberOfIndex)
        if not columns_to_plot:
            # returnColumns() returns None after reporting a non-integer entry.
            return
        # returnHueColumns_CategoricalData() prompts for an optional hue column
        hueColumns_to_plot = returnHueColumns_CategoricalData(df)
        hue_column = hueColumns_to_plot[0] if hueColumns_to_plot else None
        # selectRows() prompts for the range of rows to plot
        df1 = selectRows(df)
        if df1 is None:
            # selectRows() returns None after reporting an invalid answer.
            return
        if plotType == 1:
            sns.set(style="whitegrid")
            plt.figure(figsize=(10,8))
            plt.title("Count plot of the column "+columns_to_plot[0])
            sns.countplot(x=columns_to_plot[0], hue=hue_column, data=df1)
        else:
            sns.set(style='darkgrid')
            plt.figure(figsize=(20,10))
            plt.title("Scatter plot of the columns "+columns_to_plot[0]+" "+"and"+" "+columns_to_plot[1])
            # scatterplot has no 'kind' argument (that belongs to the
            # figure-level catplot/relplot functions), so it is not passed.
            sns.scatterplot(x=columns_to_plot[0], y=columns_to_plot[1], hue=hue_column, data=df1)
    except ValueError as e:
        print("Value error {} occured".format(e))

### Helper that lists each column and whether it is numerical or categorical

In [12]:
# Lists the column index alongside the type of the column whether its numerical or categorical
def columnType(df):
    """Return a table describing each column of ``df`` as categorical or numerical.

    A column whose dtype is ``object`` is labelled "categorical column";
    every other dtype is labelled "numerical column".

    Returns:
        A DataFrame with one row per column of ``df`` (in order, so the row
        index matches the column position) and two columns: "Columns" with
        the column name and "DataType" with the label.
    """
    # Iterate over the dtypes directly: looking them up with df.dtypes[i]
    # treats an integer as a label on a name-indexed Series, which newer
    # pandas versions deprecate and then reject.
    column_type = [
        "categorical column" if dtype == object else "numerical column"
        for dtype in df.dtypes
    ]
    return pd.DataFrame({"Columns": list(df.columns), "DataType": column_type})

### Method to visualize both numerical and categorical data based on user specified plot type

In [13]:
# The following method visualizes numerical and categorical data together, using the plot type and hue parameter chosen by the user.
# The x-axis takes numerical column as a paramter and the y-axis takes a categorical column
def _prompt_column_index(prompt, n_columns):
    """Ask for one column index; return it, or None (with a message) if out of range."""
    number = int(input(prompt))
    if 0 <= number < n_columns:
        return number
    print("Enter a valid index")
    return None


def mixedColumn_visualization(df):
    """Interactively plot one column of ``df`` against another.

    The user is asked whether to plot at all (yes/y/no/n, case-insensitive),
    then for a plot type (1: box plot, 2: swarm cat plot, 3: point cat plot),
    the index of the column for the x-axis (prompted as the numerical column),
    the index of the column for the y-axis (prompted as the categorical
    column), an optional hue column and the rows to use. Invalid answers
    print a message and end the function without plotting.
    """
    try:
        choice = input("Do you want to visualize the columns based on both numerical and categorical data? Type Yes or no\n")
        # str.lower() returns a new string, so the result has to be kept.
        choice = choice.strip().lower()
        if choice == 'no' or choice == 'n':
            return
        if choice != 'yes' and choice != 'y':
            print("Please enter a valid choice")
            return
        plotType = int(input("Choose the type of visualization: \
                                 \n1: Box plot \
                                 \n2: Cat Plot \
                                 \n3: Point plot\n"))
        if plotType not in (1, 2, 3):
            # Stop here; continuing used to fail on an unassigned variable.
            print("Please enter a valid choice")
            return
        print(columnType(df))
        print("\n")
        print("Categorical box plot takes two indices out of which there must be atleast one numerical column\n")
        print("The numerical data is plotted in the x-axis and categorical data on the y-axis")
        # Both indices are required; an out-of-range entry aborts instead of
        # silently producing no plot.
        x_index = _prompt_column_index("\nEnter the numerical column index:\n", len(df.columns))
        if x_index is None:
            return
        y_index = _prompt_column_index("\nEnter the categorical column index:\n", len(df.columns))
        if y_index is None:
            return
        x_column = df.columns[x_index]
        y_column = df.columns[y_index]
        # returnHueColumns_MixedData() prompts for an optional hue column
        hueColumns_to_plot = returnHueColumns_MixedData(df)
        hue_column = hueColumns_to_plot[0] if hueColumns_to_plot else None
        # selectRows() prompts for the range of rows to plot
        df1 = selectRows(df)
        if df1 is None:
            # selectRows() returns None after reporting an invalid answer.
            return
        sns.set(style='darkgrid')
        if plotType == 1:
            plt.figure(figsize=(20,10))
            plt.title("Box plot of the columns "+x_column+" "+"and"+" "+y_column)
            sns.boxplot(x=x_column, y=y_column, hue=hue_column, data=df1)
        else:
            # catplot is figure-level and creates its own figure, so no
            # plt.figure() call is made here (it only left an empty figure).
            plot_kind = 'swarm' if plotType == 2 else 'point'
            sns.catplot(x=x_column, y=y_column, hue=hue_column, data=df1, kind=plot_kind, aspect=2)
    except ValueError as e:
        print("Value error {} occured".format(e))

### Invoke the categorical visualization method

In [14]:
# Start the interactive categorical plotting prompts for the loaded frame.
categorical_visualization(df)

Do you want to visualize the columns based on categorical data? Type Yes or no
n


#### Invoke the categorical and numerical visualization method

In [15]:
# Start the interactive numerical-vs-categorical plotting prompts for the loaded frame.
mixedColumn_visualization(df)

Do you want to visualize the columns based on both numerical and categorical data? Type Yes or no
n
