<h1>Data Exploration Plan</h1>

The following sections list common steps in data exploration with Python and pandas.



<h3>Step-1: Load the necessary modules</h3>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<h3>Step-2: Load Files</h3>

In [None]:
# Pick ONE of the loaders below -- each assignment overwrites `df`.

# Delimited plain text: the separator is a regular expression (optional
# whitespace around '|'), which needs the Python parsing engine.
# 'NA' and 'EMPTY' are additionally treated as missing values.
df = pd.read_table('/path/to/file_name.txt', sep=r'\s*\|\s*', engine='python', na_values=['NA', 'EMPTY'])
# CSV file, using the 'Year' and 'Month' columns as the row index
df = pd.read_csv('/path/to/file_name.txt', index_col=['Year', 'Month'])
# Fixed-width text file: `widths` is the character width of each consecutive field
df = pd.read_fwf('/path/to/file_name.txt', widths=[10, 20, 3, 30, 2])
# Parquet file, read with the pyarrow engine
df = pd.read_parquet('/path/to/file.parquet', engine='pyarrow')

<h3>Step-3: Check Basic Data information</h3>

In [None]:
# Dimensions as (rows, columns). Printed explicitly because a notebook cell
# only auto-displays its LAST expression.
print(df.shape)
# Index, per-column dtype and non-null counts (info() prints its own report)
df.info()
# Descriptive statistics for every column:
#   object columns  -> count, unique, top, freq
#   numeric columns -> count, mean, std, min, percentiles, max
df.describe(include='all')

<h3>Step-4: Check missing data</h3>

In [None]:
# Missing values per column: absolute count and share of rows.
# The mean of a boolean mask is the fraction of True values, so no Python-level
# sum()/len() lambda is needed.
missing = df.isnull()
missing_summary = pd.DataFrame({
    'n_missing': missing.sum(),
    'pct_missing': missing.mean().mul(100).map('{:.0f}%'.format),
})
print(missing_summary)

# Drop sparse COLUMNS: keep only columns with at least `min_non_null` non-NaN
# values. axis='columns' is required -- dropna() drops rows by default.
# The 50% threshold is only an example; tune it for the data set at hand.
min_non_null = int(0.5 * len(df))
df_dense = df.dropna(axis='columns', thresh=min_non_null)
df_dense.shape

<h3>Step-5: for dtypes=object </h3>

In [None]:
# Ratio of distinct values to rows for an object column (NaN counts as one
# value): close to 0 means a few repeated categories, close to 1 means almost
# every row is unique.
df['col1'].nunique(dropna=False) / df['col1'].size

<h3>Step-6: for dtypes in (int, float)</h3>

In [None]:
# Convert string fields to numeric where needed; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
for field in ['f1', 'f2']:
    df[field] = pd.to_numeric(df[field], errors='coerce')

# Pairwise correlation of the numeric columns (default method='pearson').
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently and raises on object columns instead.
df.select_dtypes(include='number').corr()

# Scatter-plot matrix of the columns against each other, with a kernel density
# estimate on the diagonal (pandas' KDE plots require SciPy to be installed).
pd.plotting.scatter_matrix(df, figsize=(15, 15), diagonal='kde')
plt.show()

# Heatmap of the correlation matrix of the numeric columns.
# Non-numeric columns are excluded because pandas >= 2.0 raises on them.
corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots()
im = ax.imshow(corr, cmap='hot', interpolation='none')
# Label both axes with the column names so each cell can be identified.
ax.set_xticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(range(len(corr.index)))
ax.set_yticklabels(corr.index)
ax.set_title('Correlation matrix')
fig.colorbar(im, ax=ax)
plt.show()

# Scatter of per-column mean (x) against per-column standard deviation (y),
# one point per numeric column.
# NOTE(review): the original referenced `df1`, which is not defined in this
# notebook; `df` is assumed -- confirm.
numeric_cols = df.select_dtypes(include='number')
plt.scatter(numeric_cols.mean(), numeric_cols.std(), s=120, c='green')
plt.xlabel('column mean')
plt.ylabel('column standard deviation')
plt.show()

# Box plot per column: shows percentiles (median, quartiles), range, etc.
df.plot(kind='box')
plt.show()


<h3>Step-7: for DateTime field</h3>

In [None]:
# Parse date strings into proper datetime values; the expected input layout is
# month/day/four-digit-year (e.g. 03/27/2017).
# NOTE(review): the original list named 'dfield1' twice; the second entry is
# assumed to be a different field -- confirm the real column names.
for field in ['dfield1', 'dfield2']:
    df[field] = pd.to_datetime(df[field], format='%m/%d/%Y')

# Spot gaps in a daily time series: reindexing against the complete calendar
# inserts a row for every absent date, filled with NaN by default.
# Prerequisite: 'date_field' becomes the index and must not hold duplicates.
myindex = pd.date_range(start='2017-01-01', end='2017-03-01', freq='D')
mydf.set_index(keys='date_field').reindex(index=myindex)