In [1]:
from datetime import datetime

def count_time(func):
    def wrapper(*args, **kwargs):
        start = datetime.now()
        result = func(*args, **kwargs)
        elapsed_time = datetime.now() - start
        print(f"Czas wczytywania {func.__name__}: {elapsed_time.total_seconds()} sekund")
        return result
    return wrapper

## Zadanie 1

In [3]:
import pandas as pd
import numpy as np

pandas_df = pd.read_csv('zamowienia.csv', delimiter=';')

num_rows = pandas_df.shape[0]

num_missing = 5
missing_indices = np.random.choice(range(10, num_rows), size=num_missing, replace=False)

for idx in missing_indices:
    col_to_nan = np.random.choice(pandas_df.columns)
    pandas_df.at[idx, col_to_nan] = np.nan

pandas_df.head()

Unnamed: 0,Kraj,Sprzedawca,Data zamowienia,idZamowienia,Utarg
0,Polska,Kowalski,2003-07-16,10248.0,440.0
1,Polska,Sowiński,2003-07-10,10249.0,1863.4
2,Niemcy,Peacock,2003-07-12,10250.0,1552.6
3,Niemcy,Leverling,2003-07-15,10251.0,654.06
4,Niemcy,Peacock,2003-07-11,10252.0,3597.9


In [4]:
pandas_df.dtypes

Kraj                object
Sprzedawca          object
Data zamowienia     object
idZamowienia       float64
Utarg              float64
dtype: object

In [5]:
pandas_df.to_csv('zamowienia_missing.csv')

In [6]:
from dask.distributed import Client
import dask.dataframe as dd

client = Client(n_workers=6, memory_limit='8GB') 
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 6
Total threads: 12,Total memory: 44.70 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:59950,Workers: 6
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 44.70 GiB

0,1
Comm: tcp://127.0.0.1:59965,Total threads: 2
Dashboard: http://127.0.0.1:59967/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59953,
Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-mah84njj,Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-mah84njj

0,1
Comm: tcp://127.0.0.1:59969,Total threads: 2
Dashboard: http://127.0.0.1:59973/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59955,
Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-qy0thqjm,Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-qy0thqjm

0,1
Comm: tcp://127.0.0.1:59970,Total threads: 2
Dashboard: http://127.0.0.1:59975/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59957,
Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-uptcr_c2,Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-uptcr_c2

0,1
Comm: tcp://127.0.0.1:59978,Total threads: 2
Dashboard: http://127.0.0.1:59981/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59959,
Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-i46v5idy,Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-i46v5idy

0,1
Comm: tcp://127.0.0.1:59966,Total threads: 2
Dashboard: http://127.0.0.1:59971/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59961,
Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-ci5gp0e0,Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-ci5gp0e0

0,1
Comm: tcp://127.0.0.1:59977,Total threads: 2
Dashboard: http://127.0.0.1:59979/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59963,
Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-ncw2jt9b,Local directory: /var/folders/n_/qg4nj6yj6kb15d2fxt3zlgqh0000gn/T/dask-scratch-space/worker-ncw2jt9b


In [7]:
client.scheduler_info

<bound method Client.scheduler_info of <Client: 'tcp://127.0.0.1:59950' processes=6 threads=12, memory=44.70 GiB>>

In [8]:
ddf = dd.read_csv('zamowienia_missing.csv')
ddf.dtypes

Unnamed: 0                   int64
Kraj               string[pyarrow]
Sprzedawca         string[pyarrow]
Data zamowienia    string[pyarrow]
idZamowienia               float64
Utarg                      float64
dtype: object

### Typy Object zostały zmienione na string[pyarrow]

In [10]:
try:
    numeric_cols = ddf.select_dtypes(include='number')
    result = numeric_cols.mean().compute()
    print("\nWynik obliczeń:")
    print(result)
except Exception as e:
    print("\nBłąd podczas obliczeń:", e)


Wynik obliczeń:
Unnamed: 0        399.000000
idZamowienia    10646.444166
Utarg            1538.235383
dtype: float64


In [11]:
samples_values = [75, 250, 500]

for samples in samples_values:
    try:
        ddf_samples = dd.read_csv('zamowienia_missing.csv', sample=samples)
        print(f"\nRamka Dask wczytana z próbkowaniem {samples}:")
        print(ddf_samples.head())
    except Exception as e:
        print(f"\nBłąd przy próbie wczytania z próbkowaniem {samples}: {e}")


Ramka Dask wczytana z próbkowaniem 75:
   Unnamed: 0    Kraj Sprzedawca Data zamowienia  idZamowienia    Utarg
0           0  Polska   Kowalski      2003-07-16       10248.0   440.00
1           1  Polska   Sowiński      2003-07-10       10249.0  1863.40
2           2  Niemcy    Peacock      2003-07-12       10250.0  1552.60
3           3  Niemcy  Leverling      2003-07-15       10251.0   654.06
4           4  Niemcy    Peacock      2003-07-11       10252.0  3597.90

Ramka Dask wczytana z próbkowaniem 250:
   Unnamed: 0    Kraj Sprzedawca Data zamowienia  idZamowienia    Utarg
0           0  Polska   Kowalski      2003-07-16       10248.0   440.00
1           1  Polska   Sowiński      2003-07-10       10249.0  1863.40
2           2  Niemcy    Peacock      2003-07-12       10250.0  1552.60
3           3  Niemcy  Leverling      2003-07-15       10251.0   654.06
4           4  Niemcy    Peacock      2003-07-11       10252.0  3597.90

Ramka Dask wczytana z próbkowaniem 500:
   Unnamed: 0 

## Zadanie 2

In [13]:
# client = Client()
# client

## Zadanie 3

In [15]:
import dask.dataframe as dd

path_to_parquet = './*.parquet'

@count_time
def read_parquet():
    return dd.read_parquet(path_to_parquet)

dask_df_parquet = read_parquet()
dask_df_parquet.head()

Czas wczytywania read_parquet: 0.033039 sekund


Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
0,28370919,3496776,BXdjjUlgcgq,2237947779,2017-08-06 20:06:57,2,Wreckloose! Deevalley bike park laps on the @i...,80,0,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,travel_&_adventure
1,13623950,3496776,BeyPed5hKj9,2237947779,2018-02-04 19:35:20,1,The dirty south was prime today. Top day with ...,86,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
2,28370905,3496776,Bunhd1DFVAG,2237947779,2019-03-05 08:03:11,1,Tech Tuesday. Been flat out on the tools. Got ...,168,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,science_&_technology
3,28370907,3496776,Bppi85gliQK,2237947779,2018-11-01 20:17:41,1,"On the tools, my favourite wheel builds @stans...",102,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
4,32170690,3496776,BuDfIyslzfw,2237947779,2019-02-19 08:10:11,1,Solid effort on the bar turn.\nFully turned.\n...,145,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life


In [16]:
dask_df_parquet.dtypes

sid                              int64
sid_profile                      int64
post_id                string[pyarrow]
profile_id                       int64
date                   string[pyarrow]
post_type                        int64
description            string[pyarrow]
likes                            int64
comments                         int64
username               string[pyarrow]
bio                    string[pyarrow]
following                        int64
followers                        int64
num_posts                        int64
is_business_account               bool
lang                   string[pyarrow]
category               string[pyarrow]
dtype: object

## Zadanie 4

In [18]:
@count_time
def get_n_largest(df, n, column: str):
    return df.nlargest(n, column)

n = 10
largest_likes = get_n_largest(dask_df_parquet, n, 'likes')
largest_likes.head(n)

Czas wczytywania get_n_largest: 0.004871 sekund


Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
263914,4768909,531404,Bt3yaXmAM0d,49067778,2019-02-14 18:07:43,1,"You make me very happy. Happy Valentine’s Day,...",8822952,165886,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,diaries_&_daily_life
263910,4768904,531404,BuwffB7g8cK,49067778,2019-03-08 18:39:02,1,I am so in awe of @colesprouse and @haleyluhoo...,5447066,25303,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,film_tv_&_video
263911,4768906,531404,Buc1gyZAaMQ,49067778,2019-03-01 03:26:42,1,"Thinking of you, Luke. And praying for your sa...",5116398,40608,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,relationships
263908,4768902,531404,BvcukE8AMuG,49067778,2019-03-25 22:57:23,1,Hire us to perform at your sweet sixteen X,4271466,30770,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,music
263915,4768910,531404,Btw8pFXAvQm,49067778,2019-02-12 02:22:25,1,The sun came out for us.,3558599,15715,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,diaries_&_daily_life
384081,4935518,397587,BvC3n6_gqin,2153680783,2019-03-15 21:56:18,1,DIAMOND TIARA 💎💙 check out my new video with @...,3533326,59475,jamescharles,Unleash your inner artist 🌈,303,15167675,739,False,en,film_tv_&_video
592831,15431581,3520880,Bw7vqLrFkHi,20315007,2019-05-01 22:34:48,1,I lost a game of Jenga to Jessica Cornish. \nT...,3340297,149489,channingtatum,Hi. I used to be a stripper. Now i just create...,839,17484335,67,False,en,gaming
115579,25415502,91738,Bw2gTH1AiDA,35306961,2019-04-29 21:44:23,1,how am feeling after that episode. not today b...,3311246,64318,maisie_williams,@daisieapp,964,11637123,826,False,en,diaries_&_daily_life
263917,4768912,531404,BtJxgNSgKSC,49067778,2019-01-27 21:14:43,1,Happy Sunday: the day of taking a nap right af...,3198242,13730,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,diaries_&_daily_life
263909,4768903,531404,BvCQYRVgys4,49067778,2019-03-15 16:13:22,1,Go see this movie. Bring tissues. Don’t wear m...,2900455,20614,lilireinhart,100% High Fructose Corn Syrup,639,16825239,849,False,en,film_tv_&_video


In [19]:
@count_time
def filter_by_date(df, date_range):
    return df[(df['date'] >= date_range[0]) & (df['date'] < date_range[1])]

date_range = ('2019-01-01', '2019-07-01')
filtered_by_date = filter_by_date(dask_df_parquet, date_range)
filtered_by_date.head()

Czas wczytywania filter_by_date: 0.004794 sekund


Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
2,28370905,3496776,Bunhd1DFVAG,2237947779,2019-03-05 08:03:11,1,Tech Tuesday. Been flat out on the tools. Got ...,168,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,science_&_technology
4,32170690,3496776,BuDfIyslzfw,2237947779,2019-02-19 08:10:11,1,Solid effort on the bar turn.\nFully turned.\n...,145,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
5,14315358,3496776,BxJsMDpA2yH,2237947779,2019-05-07 08:33:51,1,Annual springtime flora picture.\nTurn bars in...,124,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,arts_&_culture
6,8304346,3496776,Bt5LFpZlm3z,2237947779,2019-02-15 08:02:35,1,Laps in spring like conditions. Getting these ...,150,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,sports
7,14315346,3496776,BxZIzaQhS-o,2237947779,2019-05-13 08:32:30,1,Cheers Scotland 🏴󠁧󠁢󠁳󠁣󠁴󠁿 See you in a few weeks...,166,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,sports


## Zadanie 5

In [21]:
dtypes = {
    "sid": "integer",
    "sid_profile": "integer",
    "post_id": "string[pyarrow]",
    "profile_id": "integer",
    "date": "datetime",
    "post_type": "integer",
    "description": "string[pyarrow]",
    "likes": "integer",
    "comments": "integer",
    "username": "category",
    "bio": "string[pyarrow]",
    "following": "integer",
    "followers": "integer",
    "num_posts": "integer",
    "is_business_account": "boolean",
    "lang": "category",
    "category": "category",
}

@count_time
def read_parquet_with_dtypes():
    return dd.read_parquet(path_to_parquet, dtype=dtypes)

optimized_ddf_parquet = read_parquet_with_dtypes()

Czas wczytywania read_parquet_with_dtypes: 0.033381 sekund


In [22]:
optimized_ddf_n_largest = get_n_largest(optimized_ddf_parquet, n, 'likes')

Czas wczytywania get_n_largest: 0.004635 sekund


In [23]:
optimized_ddf_filtered_by_date = filter_by_date(optimized_ddf_parquet, date_range)

Czas wczytywania filter_by_date: 0.001104 sekund


### Spora różnica na plus dla zoptymalizowanych typów po optymalizacji

## Zadanie 6

In [26]:
import dask.array as da
import time

def measure_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = (end_time - start_time) / num_iterations
        return result, elapsed_time
    return wrapper

size = (20_000, 20_000)
num_iterations = 10
chunk_sizes = [None, (1000, 1000), (2000, 2000), (4000, 4000), (5000, 5000), (10000, 10000)]

@measure_time
def calculate_mean(chunk_size):
    if not chunk_size:
        darr = da.random.normal(5, 0.2, size=size)
    else:
        darr = da.random.normal(5, 0.2, size=size, chunks=chunk_size)
    
    for _ in range(num_iterations):
        average = darr.mean().compute()
    
    return average

results = {}

for chunk_size in chunk_sizes:
    avg, avg_time = calculate_mean(chunk_size)
    results[chunk_size] = avg_time
    print(f"Czas dla chunk size {chunk_size}: {avg_time:.4f} sekundy")

print("\nWyniki:")
for chunk_size, elapsed in results.items():
    print(f"Chunk size: {chunk_size}, średni czas: {elapsed:.4f} sekundy")


Czas dla chunk size None: 1.7022 sekundy
Czas dla chunk size (1000, 1000): 2.8031 sekundy
Czas dla chunk size (2000, 2000): 1.1809 sekundy


2024-10-14 14:45:32,393 - distributed.scheduler - ERROR - Couldn't gather keys: {('mean_agg-aggregate-1522f3ecbeefe5fafb99ec34db62414f',): 'waiting'}
2024-10-14 14:45:34,779 - distributed.scheduler - ERROR - Couldn't gather keys: {('mean_agg-aggregate-1522f3ecbeefe5fafb99ec34db62414f',): 'waiting'}


Czas dla chunk size (4000, 4000): 0.9713 sekundy


2024-10-14 14:45:44,342 - distributed.scheduler - ERROR - Couldn't gather keys: {('mean_agg-aggregate-726fcf69c6bf75de8ea78d26df01b096',): 'waiting'}


Czas dla chunk size (5000, 5000): 1.6788 sekundy
Czas dla chunk size (10000, 10000): 1.2123 sekundy

Wyniki:
Chunk size: None, średni czas: 1.7022 sekundy
Chunk size: (1000, 1000), średni czas: 2.8031 sekundy
Chunk size: (2000, 2000), średni czas: 1.1809 sekundy
Chunk size: (4000, 4000), średni czas: 0.9713 sekundy
Chunk size: (5000, 5000), średni czas: 1.6788 sekundy
Chunk size: (10000, 10000), średni czas: 1.2123 sekundy
