# Pandas Profiling Demo
A demonstration of the [pandas-profiling](https://github.com/pandas-profiling/pandas-profiling) package.
### Setup
Uses the `names` package to generate random person names.

In [1]:
import pandas as pd
import numpy as np
import random
import names   # random name generator - pip install names
import pandas_profiling    # pip install pandas_profiling

In [2]:
%%javascript
// Prevent vertical scrollbars on tables
// Replaces OutputArea's _should_scroll hook with a function that always
// returns false, regardless of the `lines` argument it receives.
// NOTE(review): depends on the `IPython` global supplied by the notebook
// front end - confirm it is available in the front end you run this in.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

### Create Mock Data
Generates 10,000 records of fake person data with nonsensical attributes, to exercise pandas-profiling's statistical analysis.

In [3]:
# Build a list of fake person records (one dict per person). The attributes are
# deliberately nonsensical; they only need to give the profiler varied columns.
N_RECORDS = 10000    # number of mock records to generate
RANDOM_SEED = 42     # fixed seed so Restart & Run All yields the same data

# Seed the stdlib generator used for the phone numbers and numeric attributes.
# NOTE(review): `names` presumably draws from the same `random` module, which
# would make the generated names repeatable too - confirm.
random.seed(RANDOM_SEED)

mock_data = []
for person_id in range(N_RECORDS):
    first_name = names.get_first_name()
    last_name = names.get_last_name()
    phone_number = '+61-{}-{:04d}-{:04d}'.format(
        random.randint(2, 9),
        random.randint(1, 9999),
        random.randint(1, 9999)
    )
    some_val_1 = person_id + 1
    some_val_2 = first_name[::-1]  # reversed via slice
    some_val_3 = last_name + first_name
    # randomly leave last two attributes blank for some records (1-in-5 chance)
    if random.randint(1, 5) == 1:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        some_val_4 = np.nan
        some_val_5 = np.nan
    else:
        some_val_4 = person_id * random.randint(1, 9)
        some_val_5 = random.randint(-9999999, 9999999)
    person_record = {
        'person_id': person_id, 'first_name': first_name, 'last_name': last_name,
        'phone_number': phone_number, 'some_val_1': some_val_1, 'some_val_2': some_val_2,
        'some_val_3': some_val_3, 'some_val_4': some_val_4, 'some_val_5': some_val_5
    }
    mock_data.append(person_record)

# Show the first record as a quick sanity check of the generated structure.
print(mock_data[0])

{'person_id': 0, 'first_name': 'Elizabeth', 'last_name': 'Natale', 'phone_number': '+61-4-9966-5937', 'some_val_1': 1, 'some_val_2': 'htebazilE', 'some_val_3': 'NataleElizabeth', 'some_val_4': 0, 'some_val_5': -4809518}


In [4]:
# Build the DataFrame straight from the list of record dicts. The constructor
# accepts a list of dicts directly, whereas DataFrame.from_dict() is documented
# for dict input (dict of columns or dict of rows).
df = pd.DataFrame(mock_data)
# Preview the first five rows.
df.head()

Unnamed: 0,first_name,last_name,person_id,phone_number,some_val_1,some_val_2,some_val_3,some_val_4,some_val_5
0,Elizabeth,Natale,0,+61-4-9966-5937,1,htebazilE,NataleElizabeth,0.0,-4809518.0
1,Curtis,Highshaw,1,+61-7-9260-1203,2,sitruC,HighshawCurtis,,
2,Ashley,Manna,2,+61-3-2004-9386,3,yelhsA,MannaAshley,6.0,5837060.0
3,Kelli,Neuhoff,3,+61-6-8302-1337,4,illeK,NeuhoffKelli,3.0,-1476896.0
4,Martin,Ballard,4,+61-3-4827-8692,5,nitraM,BallardMartin,24.0,8954052.0


### Generate & Display Report

In [5]:
# Build the profiling report for the mock DataFrame and render it inline
# below this cell through the notebook's display().
profile = pandas_profiling.ProfileReport(df)
display(profile)
# The report can also be written to an HTML file; uncomment the line below.
# NOTE(review): the keyword name may differ between pandas-profiling versions
# (newer releases appear to use `output_file`) - confirm against the installed version.
# profile.to_file(outputfile="/tmp/myoutputfile.html")

0,1
Number of variables,9
Number of observations,10000
Total Missing (%),4.4%
Total size in memory,703.2 KiB
Average record size in memory,72.0 B

0,1
Numeric,3
Categorical,4
Boolean,0
Date,0
Text (Unique),1
Rejected,1
Unsupported,0

0,1
Distinct count,1823
Unique (%),18.2%
Missing (%),0.0%
Missing (n),0

0,1
James,187
Robert,174
John,169
Other values (1820),9470

Value,Count,Frequency (%),Unnamed: 3
James,187,1.9%,
Robert,174,1.7%,
John,169,1.7%,
Mary,163,1.6%,
David,146,1.5%,
Michael,146,1.5%,
William,121,1.2%,
Richard,91,0.9%,
Charles,81,0.8%,
Joseph,78,0.8%,

0,1
Distinct count,5347
Unique (%),53.5%
Missing (%),0.0%
Missing (n),0

0,1
Smith,111
Johnson,88
Jones,70
Other values (5344),9731

Value,Count,Frequency (%),Unnamed: 3
Smith,111,1.1%,
Johnson,88,0.9%,
Jones,70,0.7%,
Davis,65,0.7%,
Williams,62,0.6%,
Brown,57,0.6%,
Miller,49,0.5%,
Taylor,39,0.4%,
Moore,38,0.4%,
Martin,38,0.4%,

0,1
Distinct count,10000
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,4999.5
Minimum,0
Maximum,9999
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,499.95
Q1,2499.8
Median,4999.5
Q3,7499.2
95-th percentile,9499.0
Maximum,9999.0
Range,9999.0
Interquartile range,4999.5

0,1
Standard deviation,2886.9
Coef of variation,0.57744
Kurtosis,-1.2
Mean,4999.5
MAD,2500
Skewness,0
Sum,49995000
Variance,8334200
Memory size,78.2 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
9518,1,0.0%,
7481,1,0.0%,
5432,1,0.0%,
9526,1,0.0%,
3379,1,0.0%,
1330,1,0.0%,
7473,1,0.0%,
5424,1,0.0%,
3371,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9995,1,0.0%,
9996,1,0.0%,
9997,1,0.0%,
9998,1,0.0%,
9999,1,0.0%,

First 3 values
+61-8-7025-6696
+61-3-3833-4588
+61-6-0976-7115

Last 3 values
+61-3-9856-1955
+61-6-9525-4959
+61-2-3288-2202

Value,Count,Frequency (%),Unnamed: 3
+61-2-0006-7263,1,0.0%,
+61-2-0009-0986,1,0.0%,
+61-2-0010-9639,1,0.0%,
+61-2-0024-2247,1,0.0%,
+61-2-0029-2712,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
+61-9-9924-2759,1,0.0%,
+61-9-9945-7237,1,0.0%,
+61-9-9949-2582,1,0.0%,
+61-9-9962-3979,1,0.0%,
+61-9-9985-8898,1,0.0%,

0,1
Correlation,1

0,1
Distinct count,1823
Unique (%),18.2%
Missing (%),0.0%
Missing (n),0

0,1
semaJ,187
treboR,174
nhoJ,169
Other values (1820),9470

Value,Count,Frequency (%),Unnamed: 3
semaJ,187,1.9%,
treboR,174,1.7%,
nhoJ,169,1.7%,
yraM,163,1.6%,
leahciM,146,1.5%,
divaD,146,1.5%,
mailliW,121,1.2%,
drahciR,91,0.9%,
selrahC,81,0.8%,
hpesoJ,78,0.8%,

0,1
Distinct count,9873
Unique (%),98.7%
Missing (%),0.0%
Missing (n),0

0,1
LewisMary,4
JohnsonJames,3
JamesJohn,3
Other values (9870),9990

Value,Count,Frequency (%),Unnamed: 3
LewisMary,4,0.0%,
JohnsonJames,3,0.0%,
JamesJohn,3,0.0%,
MillerJohn,3,0.0%,
MillerWilliam,3,0.0%,
JohnsonRobert,3,0.0%,
BrownMary,2,0.0%,
HarrisJose,2,0.0%,
ButtsDavid,2,0.0%,
SmithLarry,2,0.0%,

0,1
Distinct count,7447
Unique (%),74.5%
Missing (%),19.6%
Missing (n),1964
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,25191
Minimum,0
Maximum,89964
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,1585.8
Q1,7793.5
Median,19178.0
Q3,38259.0
95-th percentile,67824.0
Maximum,89964.0
Range,89964.0
Interquartile range,30466.0

0,1
Standard deviation,20980
Coef of variation,0.83282
Kurtosis,0.047025
Mean,25191
MAD,17240
Skewness,0.92563
Sum,202440000
Variance,440140000
Memory size,78.2 KiB

Value,Count,Frequency (%),Unnamed: 3
3990.0,4,0.0%,
21120.0,4,0.0%,
33520.0,3,0.0%,
20286.0,3,0.0%,
5940.0,3,0.0%,
9832.0,3,0.0%,
5360.0,3,0.0%,
1050.0,3,0.0%,
9582.0,3,0.0%,
696.0,3,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,1,0.0%,
3.0,1,0.0%,
6.0,1,0.0%,
8.0,1,0.0%,
18.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
89775.0,1,0.0%,
89856.0,1,0.0%,
89946.0,1,0.0%,
89955.0,1,0.0%,
89964.0,1,0.0%,

0,1
Distinct count,8037
Unique (%),80.4%
Missing (%),19.6%
Missing (n),1964
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,94181
Minimum,-9997000
Maximum,9986500
Zeros (%),0.0%

0,1
Minimum,-9997000
5-th percentile,-8971500
Q1,-4979000
Median,143100
Q3,5174700
95-th percentile,8980100
Maximum,9986500
Range,19983000
Interquartile range,10154000

0,1
Standard deviation,5799000
Coef of variation,61.573
Kurtosis,-1.2167
Mean,94181
MAD,5031500
Skewness,-0.024267
Sum,756830000
Variance,33628000000000
Memory size,78.2 KiB

Value,Count,Frequency (%),Unnamed: 3
-1649321.0,1,0.0%,
5024162.0,1,0.0%,
1211447.0,1,0.0%,
-6465998.0,1,0.0%,
-529726.0,1,0.0%,
3364095.0,1,0.0%,
5941774.0,1,0.0%,
-8580570.0,1,0.0%,
-2643230.0,1,0.0%,
2839869.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-9996952.0,1,0.0%,
-9996046.0,1,0.0%,
-9993585.0,1,0.0%,
-9992786.0,1,0.0%,
-9990648.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9983005.0,1,0.0%,
9983073.0,1,0.0%,
9985432.0,1,0.0%,
9985577.0,1,0.0%,
9986510.0,1,0.0%,

Unnamed: 0,first_name,last_name,person_id,phone_number,some_val_1,some_val_2,some_val_3,some_val_4,some_val_5
0,Elizabeth,Natale,0,+61-4-9966-5937,1,htebazilE,NataleElizabeth,0.0,-4809518.0
1,Curtis,Highshaw,1,+61-7-9260-1203,2,sitruC,HighshawCurtis,,
2,Ashley,Manna,2,+61-3-2004-9386,3,yelhsA,MannaAshley,6.0,5837060.0
3,Kelli,Neuhoff,3,+61-6-8302-1337,4,illeK,NeuhoffKelli,3.0,-1476896.0
4,Martin,Ballard,4,+61-3-4827-8692,5,nitraM,BallardMartin,24.0,8954052.0
