In [2]:
%load_ext kedro.extras.extensions.ipython

In [3]:
%reload_kedro

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [5]:
spark = SparkSession.builder.getOrCreate()

In [6]:
import pandas as pd
import numpy as np

# Define the file names
file_names = ["DAY_{}".format(i) for i in range(1, 31)]

# Create an empty DataFrame to store the data
columns = [
    "Ph1",
    "Ph2",
    "Ir1",
    "Fo1",
    "Fo2",
    "Di3",
    "Di4",
    "Ph3",
    "Ph4",
    "Ph5",
    "Ph6",
    "Co1",
    "Co2",
    "Co3",
    "So1",
    "So2",
    "Di1",
    "Di2",
    "Te1",
    "Fo3",
    "LR1",
    "LR2"
]

df = pd.DataFrame(columns=columns)

# Load data from each file and append to the DataFrame
day = 1
secs = np.arange(1, 86401, 1)
for file_name in file_names:
    file_path = "/home/sossa/raftel/data/01_raw/unloaded/Aras/House_A/{}.txt".format(file_name)  # assuming the files have a .txt extension
    temp_df = pd.read_csv(file_path, sep=' ', header=None, names=columns)
    temp_df['DAY'] = day
    temp_df['SEC'] = secs
    df = pd.concat([df, temp_df], ignore_index=True)
    day = day + 1

print(df)

        Ph1 Ph2 Ir1 Fo1 Fo2 Di3 Di4 Ph3 Ph4 Ph5  ... So1 So2 Di1 Di2 Te1 Fo3  \
0         0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   
1         0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   
2         0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   
3         0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   
4         0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   
...      ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..   
2591995   0   0   0   0   1   0   0   0   0   0  ...   0   0   0   0   0   0   
2591996   0   0   0   0   1   0   0   0   0   0  ...   0   0   0   0   0   0   
2591997   0   0   0   0   1   0   0   0   0   0  ...   0   0   0   0   0   0   
2591998   0   0   0   0   1   0   0   0   0   0  ...   0   0   0   0   0   0   
2591999   0   0   0   0   1   0   0   0   0   0  ...   0   0   0   0   0   0   

        LR1 LR2   DAY      SEC  
0     

In [7]:
df_spark = spark.createDataFrame(df)

In [8]:
df_spark.printSchema()

root
 |-- Ph1: long (nullable = true)
 |-- Ph2: long (nullable = true)
 |-- Ir1: long (nullable = true)
 |-- Fo1: long (nullable = true)
 |-- Fo2: long (nullable = true)
 |-- Di3: long (nullable = true)
 |-- Di4: long (nullable = true)
 |-- Ph3: long (nullable = true)
 |-- Ph4: long (nullable = true)
 |-- Ph5: long (nullable = true)
 |-- Ph6: long (nullable = true)
 |-- Co1: long (nullable = true)
 |-- Co2: long (nullable = true)
 |-- Co3: long (nullable = true)
 |-- So1: long (nullable = true)
 |-- So2: long (nullable = true)
 |-- Di1: long (nullable = true)
 |-- Di2: long (nullable = true)
 |-- Te1: long (nullable = true)
 |-- Fo3: long (nullable = true)
 |-- LR1: long (nullable = true)
 |-- LR2: long (nullable = true)
 |-- DAY: double (nullable = true)
 |-- SEC: double (nullable = true)



In [9]:
df_spark = df_spark.select("DAY", "SEC", *columns)

In [10]:
catalog.save("aras_a@spark", df_spark)

24/03/04 13:24:02 WARN TaskSetManager: Stage 0 contains a task of very large size (1883 KiB). The maximum recommended task size is 1000 KiB.
24/03/04 13:24:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/03/04 13:24:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/03/04 13:24:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/03/04 13:24:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/03/04 13:24:03 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/03/04 13:24:04 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,

In [None]:
df_spark.count()

In [None]:
labels = df_spark.select("LR1", "LR2").distinct().withColumn("LABEL", F.monotonically_increasing_id())

In [None]:
df_spark = df_spark.join(
    labels,
    ["LR1", "LR2"],
    "LEFT"
).orderBy(
    "SEC"
)

In [None]:
df_spark.write.partitionBy("DAY").mode("append").format("delta").option("overwriteSchema", "true").save("/home/sossa/raftel/data/01_raw/aras_a")