In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib import style
style.use('ggplot')
import re

In [2]:
def dataset_info(df):
    """Print a quick textual overview of a DataFrame.

    The report covers, in order: the shape, the percentage of missing values
    per column, descriptive statistics of the non-object columns, the number
    of unique values in each object column, and the min/max range of each
    datetime column (timezone-aware columns included).

    Side effects
    ------------
    * Sets the global pandas ``display.float_format`` option so floats are
      printed with five decimals instead of scientific notation.
    * Lower-cases the column labels of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Column labels are expected to be strings because
        they are lower-cased through the ``.str`` accessor.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    separator = "---------------------------------------------------------"

    # Suppressing the scientific notation
    pd.set_option('display.float_format', lambda x: '%.5f' % x)

    # Converting the columns to lower case for ease of use
    df.columns = df.columns.str.lower()

    # Information about the shape of the dataset
    print ("Shape of the dataset is:{}".format(df.shape))
    print (separator)

    # Missing columns in the dataset and the percentage of missing values.
    # The null count is computed once per column and reused.
    n_rows = len(df)
    missing_cols = []
    missing_pct = []
    for col in df.columns:
        n_missing = df[col].isna().sum()
        if n_missing > 0:
            missing_cols.append(col)
            missing_pct.append(round((n_missing / n_rows) * 100, 3))
    if missing_cols:
        missing_data = pd.DataFrame({"Pct_Missing": missing_pct}, index=missing_cols)
        print ("Missing Data Information")
        print (separator)
        print (missing_data)
    else:
        print ("There are no missing values")
    print (separator)

    # Descriptive statistic of the numerical data
    numerical_columns = list(df.select_dtypes(exclude="object").columns)
    print ("Descriptive Statistics Of Numerical Variables")
    print (separator)
    if numerical_columns:
        print (df[numerical_columns].describe())
    else:
        # describe() raises ValueError on a frame without columns, so report
        # the absence explicitly instead of crashing.
        print ("There Are No Numerical Columns In The Dataset")
    print (separator)

    # Checking the number of unique values in the categorical columns in the data
    categorical_columns = list(df.select_dtypes(include="object").columns)
    print ("Number Unique Values in Categorical Columns")
    print (separator)
    for col in categorical_columns:
        print ("Number of Unique Values in {} Column Are:{}".format(col, df[col].nunique()))
    print (separator)

    # Getting the range of the datetime columns, if any.
    # 'datetimetz' is listed as well so timezone-aware columns are not missed.
    datetype_columns = list(df.select_dtypes(include=['datetime', 'datetimetz']).columns)
    if not datetype_columns:
        print ("There Are No Datetime Columns In The Dataset")
    else:
        print ("Datetime Columns are:{}".format(datetype_columns))
        print (separator)
        for col in datetype_columns:
            print ("{} Datetime Column Ranges From {} to {}".format(col, df[col].min(), df[col].max()))

In [3]:
def categorical_plots(df,columns):
    """Draw one bar chart per column showing the share of rows in each category.

    Missing values are counted as their own category (``dropna=False``), so
    the bars of each chart add up to 100 percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : list
        Labels of the categorical columns to plot; one figure is created per
        entry.

    Returns
    -------
    None
        The figures are left open for the active matplotlib backend to show.
    """
    for col in columns:
        # A fresh figure per column: asking for a numbered figure
        # (plt.figure(i)) would re-activate an already open figure with that
        # number and draw on top of it.
        fig, ax = plt.subplots()
        # Scale the shares to 0-100 so the values match the "Percentage" label.
        pct_by_category = df[col].value_counts(normalize=True, dropna=False) * 100
        pct_by_category.plot.bar(ax=ax, rot=90)
        ax.set_title("Distribution of {} column".format(col))
        ax.set_xlabel(col)
        ax.set_ylabel("Percentage")

In [4]:
def distribution_plots(df,columns):
    """Draw one histogram with a KDE overlay per numerical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : list
        Labels of the numerical columns to plot; one figure is created per
        entry.

    Returns
    -------
    None
        The figures are left open for the active matplotlib backend to show.
    """
    # sns.distplot is deprecated in favour of sns.histplot; prefer the new
    # function when the installed seaborn provides it and fall back otherwise.
    has_histplot = hasattr(sns, "histplot")
    for col in columns:
        # A fresh figure per column: asking for a numbered figure
        # (plt.figure(i)) would re-activate an already open figure with that
        # number and draw on top of it.
        fig, ax = plt.subplots()
        if has_histplot:
            # stat="density" and cut=3 follow seaborn's migration guidance for
            # reproducing distplot's density-scaled histogram and KDE extent.
            sns.histplot(df[col], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
        else:
            sns.distplot(df[col], kde=True, ax=ax)
        ax.set_title("Distribution Plot of {} column".format(col))
        ax.set_xlabel(col)

In [5]:
def aggregation(df,group_by,stats,to_aggregate):
    """
    Add group-level statistics to ``df`` as new columns (in place).

    For every grouping key in ``group_by``, every statistic in ``stats`` and
    every column in ``to_aggregate``, a column named
    ``"<stat>_Of_<column>_GroupBy_<key>"`` is added. It holds the statistic of
    ``<column>`` within the row's group, broadcast back to every row through
    ``groupby(...).transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    group_by : list
        Grouping keys, each processed independently. An entry may itself be a
        list of column labels to group by several columns at once; its labels
        are joined with "_" in the generated column name.
    stats : list
        Statistics accepted by ``transform``: names such as "min", "max",
        "mean", or callables (a callable's ``__name__`` is used in the
        generated column name).
    to_aggregate : list
        Labels of the columns whose statistics are computed.

    Returns
    -------
    None
    """
    for item in group_by:
        # Build the grouping once per key instead of once per (stat, column).
        grouped = df.groupby(item)
        key_label = "_".join(map(str, item)) if isinstance(item, list) else item
        for agg in stats:
            # Strings are used verbatim so existing column names are unchanged.
            agg_label = agg if isinstance(agg, str) else getattr(agg, "__name__", str(agg))
            for col in to_aggregate:
                new_col = "{}_Of_{}_GroupBy_{}".format(agg_label, col, key_label)
                df[new_col] = grouped[col].transform(agg)

In [6]:
def generate_datetime_features(df,columns):
    """Derive calendar features from datetime columns and add them to ``df``.

    For each column ``c`` in ``columns`` the following columns are added in
    place: ``c_Year``, ``c_Quarter``, ``c_Month``, ``c_DayOfMonth``,
    ``c_DayOfWeek``, ``c_Is_Month_Start``, ``c_Is_Month_End``,
    ``c_Is_Quarter_Start`` and ``c_Is_Quarter_End``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    columns : list
        Labels of the datetime columns. The values are read through the
        ``.dt`` accessor, so the columns must already be datetime-like.

    Returns
    -------
    None
    """
    # Iterate over the `columns` parameter; the previous loop referenced an
    # undefined name and failed unless a global happened to exist.
    for col in columns:
        stamps = df[col].dt
        df[col+"_Year"] = stamps.year
        df[col+"_Quarter"] = stamps.quarter
        df[col+"_Month"] = stamps.month
        df[col+"_DayOfMonth"] = stamps.day
        df[col+"_DayOfWeek"] = stamps.dayofweek
        df[col+"_Is_Month_Start"] = stamps.is_month_start
        df[col+"_Is_Month_End"] = stamps.is_month_end
        df[col+"_Is_Quarter_Start"] = stamps.is_quarter_start
        df[col+"_Is_Quarter_End"] = stamps.is_quarter_end