# Initial Data Analysis ....  
### Using Pandas  

In [1]:
# Demo 1 - the basics
import pandas as pd

# Demo 2
from pivottablejs import pivot_ui

# Demo 3
import pandas_profiling

# future ...
# from pydqc.data_summary import distribution_summary_pretty
# https://towardsdatascience.com/introducing-pydqc-7f23d04076b3

In [2]:
# STANDARD IMPORTS  
from datetime import datetime
import glob
import json
import math
import io
import os
import csv
from flatten_json import flatten
import operator
from collections import Counter

# Upgraded Pandas
import numpy as np
from numpy import nan
import matplotlib.pyplot as plt

# AWS
import boto3


#### Get Some Data

In [None]:
# AWS S3: fetch the source CSV into a local cache folder, then load it.

s3 = boto3.resource('s3')

# Assume files are in S3 bucket - source!
SourceBucket = "BucketName"
# Download into the 'rawData' folder (created on first use).
downloadFolder = './rawData'
os.makedirs(downloadFolder, exist_ok=True)
downloadFolder = downloadFolder + '/'

# The Data
s3file1 = "folder/folder/file1.csv"
local1 = "file1.csv"
# Single source of truth for where the file lives locally.
localPath1 = downloadFolder + local1

# Download only when the file is not cached yet. The check must use the same
# path the download writes to; testing the bare file name would look in the
# working directory instead of downloadFolder and never see the cached copy.
if not os.path.isfile(localPath1):
    s3.Bucket(SourceBucket).download_file(s3file1, localPath1)

# create dataframes
df = pd.read_csv(localPath1, header=0, sep=',', quotechar='"', encoding="utf-8")


In [3]:
# Or just direct from HTTPS source ...
# Alternative sample CSV, kept for reference:
#url="https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
# NOTE(review): the URL below carries GoogleAccessId / Expires / Signature query
# parameters, so it looks like a time-limited signed link - confirm it still
# resolves before relying on this cell, and avoid committing signed links.
url='https://storage.googleapis.com/kaggle-competitions-data/kaggle/3136/train.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1569957816&Signature=TlH3b%2Bxvu7gzUkSkmVjUda0fAOzzap52wsv5GbK5gOrOwnF%2FAnYF3zp16Jp0EECfBgAGC3aMaxmR1R8hF8XPCOOsq3hep2aMbWIyjJU1QN0qlcPLQQhNWlJwCPFneQ%2BrgjdvKtgHoVEIwobK2tSFSM7ndL1bVqe3awhPp2f0SSfo40uRf9eIjl4x9fB%2BXK2Agvp9wTOR1DrJYiCV3kIakzs5N8bQMi%2BU0de7cM0HzaYXehned1i42tgGDeTyRQTgYdQWT8jCdW6jfhChQamVw2x10jFRqsJ2AuW471TXYkLfvcWEmPhqwnAcKiTwfkZBzgvt66lqacxzKE0VQPobFw%3D%3D'
# Label for this dataset; nothing is written to this path in this cell.
local1 = "file1.csv"

# Read the CSV straight from the URL into the working dataframe.
df=pd.read_csv(url)

## 0 - Basics

In [4]:
# Standard pandas display settings for interactive exploration.
# None removes the limit, so every column / row of a displayed frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# "Do not truncate cell text" is spelled None from pandas 1.0 onward; the old
# sentinel -1 is deprecated there and rejected by later releases. Older pandas
# only accepts an integer, so fall back to -1 when None is refused.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1
pd.set_option('mode.chained_assignment', None) # disable the SettingWithCopyWarning

In [5]:
# good step - include some meta data in the dataframe
# Tag the frame object itself with the source file label.
# NOTE(review): this sets a plain attribute on this DataFrame object; presumably
# it is not carried over to frames derived from df - confirm before relying on it.
df.name = local1
# Constant column holding the same label on every row; the aggregation step
# further down groups on it to produce a single "total" row.
df['fileName'] = local1

In [6]:
# Peek at five randomly chosen rows (no seed is set, so the rows vary per run).
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fileName
761,762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S,file1.csv
655,656,0,2,"Hickman, Mr. Leonard Mark",male,24.0,2,0,S.O.C. 14879,73.5,,S,file1.csv
360,361,0,3,"Skoog, Mr. Wilhelm",male,40.0,1,4,347088,27.9,,S,file1.csv
588,589,0,3,"Gilinski, Mr. Eliezer",male,22.0,0,0,14973,8.05,,S,file1.csv
567,568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,file1.csv


In [8]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
fileName       891 non-null object
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


In [10]:
# List the column labels of the frame.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'fileName'],
      dtype='object')

In [13]:
# Show the dtype of each column.
df.dtypes

PassengerId    int64  
Survived       int64  
Pclass         int64  
Name           object 
Sex            object 
Age            float64
SibSp          int64  
Parch          int64  
Ticket         object 
Fare           float64
Cabin          object 
Embarked       object 
fileName       object 
dtype: object

## 1 - Aggregations

In [None]:
# Summary-aggregation template: column name -> list of aggregate functions
# that the groupby step applies to that column.
aggregations = dict(
    PassengerId=['count', 'nunique'],
    Sex=['count', 'nunique'],
    Fare=['min', 'mean', 'max'],
    Age=['min', 'mean', 'max'],
)


In [None]:
# Per-class summary first, then a single "total" row obtained by grouping on
# the constant fileName column.
summary_by_class = df.groupby(['Pclass']).agg(aggregations)
summary_total = df.groupby(['fileName']).agg(aggregations)
# pd.concat stacks the two summaries row-wise; DataFrame.append, used here
# previously, is deprecated and no longer exists in pandas 2.x.
df_summary = pd.concat([summary_by_class, summary_total])
df_summary

## 2 - Pivot Table

In [None]:
# Hand the whole dataframe to pivottablejs' pivot_ui (presumably renders an
# interactive pivot table in the notebook - confirm in your environment).
pivot_ui(df)

## 3 - Profiling

In [None]:
# Build a pandas_profiling report for the whole dataframe.
pandas_profiling.ProfileReport(df)
# The report can also be written to an HTML file by chaining onto the call above:
# .to_file(local1 + "_profile.html")

# Done