In [1]:
import gzip
import json
import pandas as pd
from datetime import datetime
from pandas import json_normalize
import io 

#Create the `with open` block so that you can read the raw contents of your AWS JSON file

#Use this to silence all future warnings
pd.set_option('future.no_silent_downcasting', True)


#We want to analyze each line as its own event, in order

#The commented-out approach below can only read a single JSON object; we want to go through multiple events and logs (multiple JSON objects) in order to train our model
#with open("my_aws.json","r") as f:
    #data = json.load(f)


#Step 1: Create a list to store extracted JSON objects
#extracted_aws_events = []

#Step 2: We must go through each object and read the JSON file line by line
#This is for NDJSON (newline-delimited JSON)
#We can use a for loop to go through the file and call json.loads() on each line
#with open('my_aws.json', 'r') as file:
   # for line in file:
        #aws_obj = json.loads(line)
        #extracted_aws_events.append(aws_obj)

#The log file holds multiple JSON event objects nested under a top-level 'Records' list
#(the real CloudTrail events, one dict per event), so parse the whole file first
with open("aws_logs_5000.json", "r") as log_file:
    raw_payload = json.load(log_file)

#Normalize the records: flatten each event's nested structure into separate columns,
#joining nested key names with "_"
df = pd.json_normalize(raw_payload['Records'], sep="_")


#Data cleaning: deal with any bad or missing data that came out of the JSON file

#Trial run of the fill logic: put the placeholder string 'value' into any null
#entries of the requestParameters column
df.fillna(value={'requestParameters': 'value'}, inplace=True)

#Count the nulls still left in every column so the remaining gaps are visible
null = df.isna().sum()
print(null)

#Parse the eventTime column into pandas datetime values
df['eventTime'] = pd.to_datetime(df['eventTime'])

#Position-based selection: first row, first ten columns (the result is not stored)
df.iloc[:1, :10]

#Remove duplicate rows from the dataset.
#Some columns hold lists/dictionaries, which are unhashable and would make
#drop_duplicates raise a TypeError, so duplicates are judged only on the columns
#whose values are all plain hashable scalars.
_HASHABLE_TYPES = (int, float, str, bool, pd.Timestamp, type(None))


def _is_hashable_scalar(value):
    """Return True when value is one of the plain scalar types listed above."""
    return isinstance(value, _HASHABLE_TYPES)


#Keep a column only if every single value in it passes the scalar check
hashable_cols = []
for column in df.columns:
    if df[column].apply(_is_hashable_scalar).all():
        hashable_cols.append(column)

#To inspect duplicates before removing them: print(df.duplicated(subset=hashable_cols))

#Drop the duplicates, comparing rows on the hashable columns only
df = df.drop_duplicates(subset=hashable_cols)

#Feature engineering: transform raw data into informative features for machine learning models

#Every feature below comes from eventTime, so grab its datetime accessor once
event_dt = df['eventTime'].dt

#Date extraction is currently disabled: df['date'] = df['eventTime'].dt.date

#Time of day
df['time'] = event_dt.time

#Hour of the day
df['hour'] = event_dt.hour

#Month, as its name
df['month'] = event_dt.month_name()

#Day of the week, as its name
df['day'] = event_dt.day_name()

#Calendar year
df['year'] = event_dt.year

#Encode the categorical data so the model receives numbers instead of strings.
#Two options exist: label encoding (convert a column into numbers) or one-hot
#encoding (convert each category value into its own new column holding True/False).

#One-hot encoding is used here, applied to all the categorical columns in a single call
categorical_cols = ["eventName", "sourceIPAddress", "userAgent", "awsRegion"]
df = pd.get_dummies(df, columns=categorical_cols)

#Turn the label you care about (e.g. suspicious IP addresses) into numeric data.
#Missing labels must be replaced with False BEFORE casting to bool: NaN is truthy,
#so casting first would turn every missing label into True (i.e. malicious) and
#leave the fillna with nothing to fill.
#Finally the booleans are converted to a compact integer (1 = True, 0 = False).
df['is_malicious'] = (
    df['is_malicious']
    .fillna(False)
    .astype('bool')
    .astype('int8')
)

#Keep only what we want to feed the machine learning model and drop the rest.
#The raw timestamp plus the 'time' and 'day' features go (hour, month and year stay),
#along with the long, mostly-null requestParameters/responseElements columns.
cols_to_drop = [
    'eventTime', 'time', 'day',
    'requestParameters', 'responseElements',
]
df = df.drop(columns=cols_to_drop)

#Save the prepared training data to a CSV file, leaving out the index column
df.to_csv('train_aws_5000.csv', index=False)
            





eventVersion                                                0
eventTime                                                   0
eventSource                                                 0
eventName                                                   0
awsRegion                                                   0
sourceIPAddress                                             0
userAgent                                                   0
requestParameters                                           0
responseElements                                           50
requestID                                                   0
eventID                                                     0
readOnly                                                    0
eventType                                                   0
managementEvent                                             0
recipientAccountId                                          0
eventCategory                                               0
sessionC