# PANDAS

pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# build a one-dimensional pandas Series; np.nan marks a missing entry
s = pd.Series(data=[1, 3, 5, np.nan, 6, 7])

In [None]:
# display the Series created above
s

In [None]:
# dimensions of the Series, returned as a tuple
s.shape

In [None]:
# six consecutive timestamps beginning at 2013-01-01
# (`periods` is the number of entries to generate)
dates = pd.date_range(start='20130101', periods=6)
print(dates)

In [None]:
# 6x4 frame of random samples: rows are labelled with the `dates` range
# created earlier, columns are labelled A through D
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

In [None]:
df.head(2)  # get first 2 records, default is 5

In [None]:
df.tail(2)  # get the last 2 records; tail() returns 5 when no count is given

In [None]:
df.index  # view the row index

In [None]:
df.columns  # view the available column labels

In [None]:
df['B']  # select a single column by its label

In [None]:
df.describe()  # inspect the dataset via per-column summary statistics

In [None]:
df.T  # transpose: rows become columns and columns become rows

In [None]:
df.sort_index(axis=1, ascending=False)  # sort the columns (axis=1) by label, in descending order

# ingest excel files

In [None]:
# read the example Excel workbook from the docs folder
xl = pd.read_excel("./docs/exampleexcel.xlsx")

In [None]:
# display the data loaded from the workbook
xl

In [None]:
# preview the first rows
xl.head()

In [None]:
# summary statistics for a single column
xl['Example column three'].describe()

In [None]:
# multiply every value of 'Example column 2' by two and store the result back in that column
xl['Example column 2'] = xl['Example column 2'].mul(2)

In [None]:
# preview again to see the effect of the column update
xl.head()

# bro log analysis

Ingest weird.log

In [None]:
# load the tab-separated weird.log, supplying the column labels explicitly
# NOTE(review): later cells compare 'uid' and 'id.orig_p' against IP addresses,
# which suggests these labels may be shifted relative to the file's real columns
# (possibly a missing 'id.resp_h' entry) — confirm against the log's own header
df = pd.read_csv(
    "./docs/weird.log",
    sep='\t',
    names=[
        'ts', 'uid',
        'id.orig_h', 'id.orig_p', 'id.resp_p',
        'name', 'addl', 'notice', 'peer',
    ],
)

In [None]:
# preview the first rows of the parsed log
df.head()

In [None]:
# summary statistics of the parsed log
df.describe()

In [None]:
# normalise missing data: map the '-' placeholder to None, then map NaN to None
# NOTE(review): how replace() treats a None replacement value has differed
# between pandas releases — confirm the result on the installed version
df_clean = df.replace({'-': None}).replace(np.nan, None)
df_clean.head()

In [None]:
# Data quality report for df_clean: one row per column of df_clean, showing
# its dtype, non-null count, distinct-value count and null count.

# DataFrame with data types
data_types = pd.DataFrame(df_clean.dtypes, columns=['Data Type'])

# DataFrame with count of non-null entries
data_count = pd.DataFrame(df_clean.count(), columns=['Count'])

# DataFrame with unique values; nunique() counts the distinct non-null values
# of every column in one call instead of growing a frame one row at a time
unique_value_counts = pd.DataFrame(df_clean.nunique(), columns=['Unique Values'])

# DataFrame with count of null entries
missing_data_counts = pd.DataFrame(df_clean.isnull().sum(), columns=['Missing Values'])

# every partial frame is indexed by df_clean's column names, so join() aligns them row by row
data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)
data_quality_report

In [None]:
# keep only the rows whose 'uid' value equals the address of interest; a direct
# boolean mask selects the same rows as the where(...).notnull() round-trip
# NOTE(review): 'uid' is compared against an IP address here — the column labels
# given to read_csv may be shifted; confirm against the log's header
df_ip = df_clean[df_clean['uid'] == '192.168.202.79']

In [None]:
# summary statistics of the filtered rows
df_ip.describe()

In [None]:
# preview the filtered rows
df_ip.head()

In [None]:
# where() keeps the frame's shape: rows that fail the condition are filled
# with NaN rather than removed, so the preview can contain blank rows
df_ip.where(df_ip['id.orig_p'] == '192.168.229.101').head()

In [None]:
# same filter, then keep only rows with at least 2 non-null values, which
# discards the rows that where() blanked out
df_ip.where(df_ip['id.orig_p'] == '192.168.229.101').dropna(thresh=2).head()

In [None]:
# Parameterised form of the two-step filter: keep rows where col_1 equals ip_1,
# blank out those where col_2 differs from ip_2, then drop the blanked rows.
col_1 = 'uid'
col_2 = 'id.orig_p'
ip_1 = '192.168.202.79'
ip_2 = '192.168.229.101'
(
    df_clean[df_clean[col_1] == ip_1]
    # a callable condition is evaluated on the frame being filtered, so the
    # result follows col_1/ip_1 instead of relying on the df_ip frame that an
    # earlier cell built from hard-coded values
    .where(lambda rows: rows[col_2] == ip_2)
    # keep rows with at least 2 non-null values, i.e. drop the blanked rows
    .dropna(thresh=2)
    .head()
)