In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create the Spark session used by every cell below, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [5]:
# Column names for the header-less data files, listed in file order.
# They are applied positionally with DataFrame.toDF(*features), so the order
# of this list must match the column order of the files exactly.
features = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    # The last two columns are not predictors: `attack` is the class label
    # used further down; `last_flag` is carried along as-is.
    "attack",
    "last_flag",
]

# Sanity check: the list should contain 43 names.
print(len(features))

43


In [6]:
# Load the raw train and test files. No `header` option is passed, so Spark
# uses its default and names the columns _c0, _c1, ...; inferSchema=True asks
# Spark to infer each column's type from the data.
# NOTE: both paths are relative to the notebook's working directory.
train_df = spark.read.csv("Train.txt", inferSchema=True)
test_df = spark.read.csv("Test.txt", inferSchema=True)

# Give BOTH frames the same positional column names so they share one schema.
# (Previously only train_df was renamed and test_df kept the _c0.. names.)
# toDF raises if a file does not have exactly len(features) columns.
train_df = train_df.toDF(*features)
test_df = test_df.toDF(*features)

train_df.show()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/hoangtv/PycharmProjects/BigData/network_anomaly_detection/Train.txt.

In [None]:
from pyspark.sql.types import NumericType

# Names of all columns whose Spark data type is not a NumericType subclass
non_numeric_cols = [
    f.name
    for f in train_df.schema
    if not isinstance(f.dataType, NumericType)
]

# Keep only those columns and preview the first rows
non_numeric_df = train_df.select(*non_numeric_cols)

non_numeric_df.show()

# TODO: count distinct values in each non-numeric column (not implemented yet)

In [None]:
# Columns to remove from the training frame.
# The names must match the entries in `features` exactly (with underscores):
# DataFrame.drop() silently ignores names that are not in the schema, so the
# previous spellings 'numfailedlogins' / 'numoutboundcmds' dropped nothing.
column_to_drop = ['land', 'urgent', 'num_failed_logins', 'num_outbound_cmds']

# Spark DataFrames are immutable: drop() returns a new frame, so the result
# has to be assigned back for the columns to actually disappear.
train_df = train_df.drop(*column_to_drop)

In [None]:
from pyspark.sql.functions import col, count, isnan, when
from pyspark.sql.types import DoubleType, FloatType

# NaN is a floating-point value, so the isnan() test is applied only to
# float/double columns. Calling it on string columns depends on an implicit
# string -> double cast, which is fragile; for every other type a missing
# value is simply NULL.
float_cols = {
    f.name
    for f in train_df.schema.fields
    if isinstance(f.dataType, (DoubleType, FloatType))
}

# One aggregate per column: when() yields a non-null value only for missing
# entries, and count() counts non-null values, giving the number of missing
# entries in that column.
missing_exprs = [
    count(
        when(
            (isnan(col(c)) | col(c).isNull()) if c in float_cols else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in train_df.columns
]

# The aggregation returns a single row; transpose it so each original column
# becomes one row labelled by its name, with the total under 'count'.
missing_value = train_df.select(missing_exprs).toPandas().T
missing_value = missing_value.rename(columns={0: 'count'})
print(missing_value)

In [None]:
from pyspark.sql.functions import when, col

# Collapse the label to two classes: every value other than 'normal' becomes
# the literal string 'attack'; 'normal' (and NULL, for which the comparison
# is not true) is left unchanged.
is_not_normal = col('attack') != 'normal'
binary_label = when(is_not_normal, 'attack').otherwise(col('attack'))

train_df = train_df.withColumn('attack', binary_label)

In [None]:
from pyspark.ml.feature import StringIndexer

# Categorical (string) columns to replace with numeric indices
columns_to_encode = ['protocol_type', 'service', 'flag', 'attack']
indexed_columns = [column + "_index" for column in columns_to_encode]

# A single multi-column StringIndexer fits all columns in one estimator
# instead of running a separate fit/transform per column. Each input column
# still gets its own independent label -> index mapping.
# NOTE(review): with the default ordering the most frequent label gets index
# 0, so which of 'normal' / 'attack' maps to 0.0 depends on the data; check
# the fitted model's labels before interpreting `attack_index`.
indexer = StringIndexer(inputCols=columns_to_encode, outputCols=indexed_columns)
train_df = indexer.fit(train_df).transform(train_df)

# The original string columns are removed; only the '<name>_index' columns
# ('protocol_type_index', 'service_index', 'flag_index', 'attack_index')
# remain as their numerical representation.
train_df = train_df.drop(*columns_to_encode)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns: every column except the label.
# By this point the string column 'attack' has been replaced by
# 'attack_index', which is what the model is trained to predict. Filtering
# only on 'attack' therefore let the label itself into the feature vector
# (target leakage), so both names are excluded here.
# NOTE(review): 'last_flag' is still used as a predictor; confirm that it is
# a genuine input and not label-derived metadata.
label_columns = {'attack', 'attack_index'}
input_columns = [column for column in train_df.columns if column not in label_columns]

# Initialise the VectorAssembler, which packs the input columns into a single
# vector column named 'features'.
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')

train_data = assembler.transform(train_df)

In [None]:
from pyspark.ml.feature import StandardScaler

# Scaler that reads the assembled 'features' vectors and writes the rescaled
# vectors to a new 'scaledFeatures' column.
scaler = (
    StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures")
)

# Fitting computes the summary statistics the scaler needs.
# NOTE(review): the statistics come from the full data set, before the
# train/test split performed in a later cell.
scalerModel = scaler.fit(train_data)

# Rescale each feature to unit standard deviation.
train_data = scalerModel.transform(train_data)

In [None]:
# Split the data into train (80%) and test (20%) sets; the fixed seed keeps
# the split reproducible.
# NOTE: this rebinds `train_data`, so re-running the cell splits the already
# reduced training set again.
train_data, test_data = train_data.randomSplit(weights=[0.8, 0.2], seed=0)

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator: predicts the numeric 'attack_index' label from
# the assembled 'features' vector (not the 'scaledFeatures' column).
# NOTE(review): the label is a class index, so the predictions are continuous
# scores rather than class labels; confirm a regressor is intended here.
lr = (
    LinearRegression()
    .setFeaturesCol('features')
    .setLabelCol('attack_index')
)

# Fit on the training split
lr_model = lr.fit(train_data)



In [None]:
# Score the held-out split; transform() adds a 'prediction' column.
predictions = lr_model.transform(test_data)

# Show the true label next to the model output for a quick visual check
predictions.select(['attack_index', 'prediction']).show()